2004-03-26 09:21:39

by Zoltan Menyhart

Subject: Migrate pages from a ccNUMA node to another - patch

diff -Nru 2.6.4.ref/arch/ia64/kernel/acpi.c 2.6.4.mig4/arch/ia64/kernel/acpi.c
--- 2.6.4.ref/arch/ia64/kernel/acpi.c Tue Mar 16 10:18:04 2004
+++ 2.6.4.mig4/arch/ia64/kernel/acpi.c Thu Mar 25 08:58:09 2004
@@ -457,6 +457,7 @@
for (i = 0; i < MAX_PXM_DOMAINS; i++) {
if (pxm_bit_test(i)) {
pxm_to_nid_map[i] = numnodes;
+ node_set_online(numnodes);
nid_to_pxm_map[numnodes++] = i;
}
}
diff -Nru 2.6.4.ref/arch/ia64/kernel/entry.S 2.6.4.mig4/arch/ia64/kernel/entry.S
--- 2.6.4.ref/arch/ia64/kernel/entry.S Tue Mar 16 10:18:04 2004
+++ 2.6.4.mig4/arch/ia64/kernel/entry.S Thu Mar 25 08:58:09 2004
@@ -1518,7 +1518,11 @@
data8 sys_ni_syscall
data8 sys_ni_syscall
data8 sys_ni_syscall // 1275
+#if defined(CONFIG_NUMA)
+ data8 sys_page_migrate // 1276: Migrate pages to another NUMA node
+#else
data8 sys_ni_syscall
+#endif
data8 sys_ni_syscall
data8 sys_ni_syscall
data8 sys_ni_syscall
diff -Nru 2.6.4.ref/arch/ia64/mm/Makefile 2.6.4.mig4/arch/ia64/mm/Makefile
--- 2.6.4.ref/arch/ia64/mm/Makefile Tue Mar 16 10:18:04 2004
+++ 2.6.4.mig4/arch/ia64/mm/Makefile Thu Mar 25 08:58:15 2004
@@ -5,7 +5,7 @@
obj-y := init.o fault.o tlb.o extable.o

obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_NUMA) += numa.o
+obj-$(CONFIG_NUMA) += numa.o migrate.o
obj-$(CONFIG_DISCONTIGMEM) += discontig.o
ifndef CONFIG_DISCONTIGMEM
obj-y += contig.o
diff -Nru 2.6.4.ref/arch/ia64/mm/migrate.c 2.6.4.mig4/arch/ia64/mm/migrate.c
--- 2.6.4.ref/arch/ia64/mm/migrate.c Thu Jan 1 01:00:00 1970
+++ 2.6.4.mig4/arch/ia64/mm/migrate.c Thu Mar 25 08:58:15 2004
@@ -0,0 +1,1274 @@
+/*
+ * Migrate pages from a ccNUMA node to another.
+ * ============================================
+ *
+ * Version 0.1, 31st of March 2004
+ * By Zoltan Menyhart, Bull S.A. <[email protected]>
+ * The usual GPL applies.
+ *
+ * This is Linux, and explanatory comments / error messages are seen
+ * as a sign of weakness :-)))
+ *
+ * O.K. check out "migrate.txt" and "page_migrate.h".
+ */
+
+
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
+#include <linux/rmap-locking.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <asm/rmap.h>
+#include <asm/tlbflush.h>
+#include <asm/page_migrate.h>
+#if defined(_TEST_)
+#include <linux/delay.h> // For "ia64_get_itc()"
+#endif
+
+#if !defined(CONFIG_DISCONTIGMEM) || !defined(CONFIG_NUMA)
+#error "That's a NUMA stuff"
+#endif
+
+
+/*
+ * Type of a virtual address.
+ */
+typedef unsigned long vaddr_t; // Pointers converted to this type
+
+
+#if defined(_TEST_)
+
+// Set the bits - as defined below - for some kernel messages.
+unsigned int _pr_flag_;
+
+#define PRINT_page 1 // Dump "struct page"-s
+#define PRINT_mm 2 // Dump "struct mm_struct"-s
+#define PRINT_vma 4 // Dump "struct vm_area_struct"-s
+#define PRINT_pte 8 // Show PTE-s, r-maps
+#define PRINT_errors 0x10
+#define PRINT_etc 0x20
+#define PRINT_pgd 0x40 // Show PGD scan
+
+#define PRINT(args...) do { if (_pr_flag_) printk(args); } while (0)
+#define PRINT_ERR(args...) \
+ do { if (_pr_flag_ & PRINT_errors) printk(args); } while (0)
+#define PRINT_ETC(args...) \
+ do { if (_pr_flag_ & PRINT_etc) printk(args); } while (0)
+#define PRINT_PGD(args...) \
+ do { if (_pr_flag_ & PRINT_pgd) printk(args); } while (0)
+
+static const char dest_not_online[] = "Destination node not online\n";
+static const char no_vma[] = "Cannot find VMA for address 0x%lx\n";
+static const char illegal_pid[] = "Illegal PID\n";
+static const char inv_n_addresses[] = "Invalid number of addresses";
+static const char ill_va_alias[] = "v-addr alias in range: 0x%p...0x%p\n";
+static const char no_momory[] = "No more memory\n";
+static const char ill_user_buff[] = "Illegal user buffer address\n";
+
+void dump_mm(const struct mm_struct * const);
+void dump_vma(const struct vm_area_struct * const);
+void dump_page(const char * const, const struct page * const);
+void dump_pte_stuff(const pte_t * const);
+phaddr_t gimme_an_address(const caddr_t);
+
+#define STATIC
+#define INLINE
+
+#else // #if defined(_TEST_)
+
+#define PRINT(args...) do { } while (0)
+#define PRINT_ERR(args...) do { } while (0)
+#define PRINT_ETC(args...) do { } while (0)
+#define PRINT_PGD(args...) do { } while (0)
+
+#define dump_mm(vma) do { } while (0)
+#define dump_vma(vma) do { } while (0)
+#define dump_page(text, page) do { } while (0)
+#define dump_pte_stuff(ptep) do { } while (0)
+
+#define STATIC static
+#define INLINE inline
+
+#endif // #if defined(_TEST_)
+
+
+STATIC INLINE long long
+migr_virt_addr_range(const caddr_t, const size_t, const int, const pid_t);
+
+STATIC INLINE long long
+migr_vaddr_range_2(vaddr_t, const vaddr_t, const int, struct mm_struct * const);
+
+STATIC INLINE int
+migr_1_page_by_pte(pte_t * const, const int, struct mm_struct * const);
+
+STATIC INLINE long long
+batch_migrate(const caddr_t, size_t, const int, const pid_t);
+
+STATIC struct mm_struct *
+look_up_mm(const pid_t);
+
+int
+check_migr_1_page_part_2(struct page * const, struct page * const,
+ struct mm_struct * const, pte_t * const);
+
+int
+get_pages_if_valid(phaddr_t * const, unsigned int);
+
+STATIC INLINE int
+check_pages_if_in_pgd(phaddr_t * const, const size_t, const struct mm_struct * const);
+
+STATIC INLINE int
+check_migrate_1_page(const phaddr_t, const int, struct mm_struct * const);
+
+
+// These are the flags which are copied for the new page:
+#define FLAG_MASK (PG_referenced | PG_uptodate | PG_dirty | PG_active | \
+ PG_highmem | PG_arch_1 | PG_private | PG_writeback | \
+ PG_nosave | PG_mappedtodisk | PG_reclaim | PG_compound)
+
+
+#if defined(_NEED_STATISTICS_)
+
+/*
+ * Statistics are accessed in a non atomic way. Who cares? Just some statistics :-)
+ */
+STATIC struct _statistics_ _statistics;
+STATIC struct _statistics_size_ _statistics_sizes = {sizeof _statistics, MAX_NUMNODES};
+
+#define DECLARE_ITC_VAR(var) unsigned long var
+#define SAVE_ITC(var) var = ia64_get_itc()
+#define STORE_DELAY(var, destination) _statistics.t.destination += ia64_get_itc() - var
+#define COUNT(what) _statistics.c.what++
+#define ERROR_CNT(what) _statistics.e.what++
+#define ERROR_CNT_ADD(var, delta) _statistics.e.var += delta
+
+STATIC INLINE int page_migrate_statistics(const caddr_t, const int);
+
+#else
+
+#define DECLARE_ITC_VAR(var)
+#define SAVE_ITC(var) do { } while (0)
+#define STORE_DELAY(var, destination) do { } while (0)
+#define COUNT(what) do { } while (0)
+#define ERROR_CNT(what) do { } while (0)
+#define ERROR_CNT_ADD(var, delta) do { } while (0)
+
+#endif // #if defined(_NEED_STATISTICS_)
+
+
+/*
+ * Migrate pages from a NUMA node to another (and some other minor services).
+ * (See "migrate.txt" and "page_migrate.h".)
+ *
+ * As usual, "-Exxx" returned on errors.
+ */
+asmlinkage long long
+sys_page_migrate(const int cmd, const caddr_t address, const size_t length,
+ const int node, const pid_t pid)
+{
+ long long rc;
+ DECLARE_ITC_VAR(time); // Total time for "sys_page_migrate()"
+
+ SAVE_ITC(/* out */ time);
+ PRINT("\nsys_page_migrate(%d, 0x%p, 0x%lx, %d, %d): pid = %d\n",
+ cmd, address, length, node, pid, current->pid);
+ switch (cmd){
+ //
+ // Migrate some pages from a NUMA node to another.
+ //
+ case _PHADDR_BATCH_MIGRATE_:
+ if (!node_online(node)){
+ PRINT_ERR(dest_not_online);
+ ERROR_CNT(bad_request);
+ rc = -ENODEV;
+ break;
+ }
+ if (length > PAGE_SIZE / sizeof(phaddr_t)){
+ PRINT_ERR(inv_n_addresses);
+ ERROR_CNT(bad_request);
+ rc = -EINVAL;
+ break;
+ }
+ rc = batch_migrate(/* user buffer */ address, /* buffer */ length,
+ node, pid);
+ break;
+ //
+ // Migrate virtual address range.
+ //
+ case _VA_RANGE_MIGRATE_:
+ if (!node_online(node)){
+ PRINT_ERR(dest_not_online);
+ ERROR_CNT(bad_request);
+ rc = -ENODEV;
+ break;
+ }
+ //
+ // Some architectures do not decode all the MSB-s of virtual addresses
+ // for the PGD, PMD and PTE indices, i.e. they have got holes or aliases
+ // in the virtual address space. Make sure that "length" does not span
+ // over virtual address holes nor create an illegal alias to an
+ // otherwise valid address.
+ //
+ if (__IS_VA_ALIAS((vaddr_t) address, length)){
+ PRINT_ERR(ill_va_alias, address, address + length);
+ ERROR_CNT(non_existent_addr);
+ rc = -EFAULT;
+ break;
+ }
+ rc = migr_virt_addr_range(/* user virtual */ address,
+ /* address range */ length, node, pid);
+ break;
+
+#if defined(_NEED_STATISTICS_)
+ case _STATISTICS_:
+ rc = page_migrate_statistics(/* user buffer */ address,
+ /* ? clear statistics ? */ length != 0);
+ break;
+ case _SIZEOF_STATISTICS_:
+ rc = *(long long *) &_statistics_sizes; // Yeh, I know...
+ break;
+#endif
+#if defined(_TEST_)
+ case _GIMME_AN_ADDRESS_:
+ rc = (long long) gimme_an_address(/* user virtual */ address);
+ break;
+#endif
+ default:
+ ERROR_CNT(bad_request);
+ rc = -EINVAL;
+ break;
+ }
+ STORE_DELAY(/* in */ time, /* out */ total);
+ return rc;
+}
+
+
+/*
+ * Migrate virtual address range of a process.
+ *
+ * Arguments: address: Starting virtual address in a process's address space
+ * length: Length of the address range to be migrated
+ * node: Destination NUMA node
+ * pid: ID of the victim process, "0" means myself
+ *
+ * Returns: On (partial) success, the number of the pages actually migrated is
+ * returned (in form of "struct _un_success_count_").
+ * As usual, "-Exxx" returned on errors.
+ */
+STATIC INLINE long long
+migr_virt_addr_range(const caddr_t address, const size_t length, const int node,
+ const pid_t pid)
+{
+ const vaddr_t ulimit = (vaddr_t) address + length;
+ struct mm_struct *mm;
+ long long rc;
+ struct vm_area_struct *beg_vma;
+ DECLARE_ITC_VAR(vma_time); // Time for "find_vma()"
+ DECLARE_ITC_VAR(mmap_sem); // Time for "down_read(&mm->mmap_sem)"
+ DECLARE_ITC_VAR(pgd_lock); // "spin_lock(&mm->page_table_lock)"
+ DECLARE_ITC_VAR(pgd_unlock); // "spin_unlock(&mm->page_table_lock)"
+
+ if (pid != 0 && pid != current->pid){
+ //
+ // Look up the "mm_struct" belonging to the process ID.
+ //
+ if ((mm = look_up_mm(pid)) == NULL){
+ PRINT_ERR(illegal_pid);
+ ERROR_CNT(bad_request);
+ return -ESRCH;
+ }
+ //
+ // On success, "mm->mm_users" got incremented to make sure that
+ // "mm_struct" does not go away.
+ //
+ } else {
+ mm = current->mm;
+ //
+ // Actually, there is no need to grab "mm": it is ours and won't go
+ // away in the meantime. However, we do not want a special case in
+ // the release path when dropping the reference later on.
+ // It is safe just to increment the counter: it is ours.
+ //
+ atomic_inc(&mm->mm_users);
+ }
+ SAVE_ITC(/* out */ mmap_sem);
+ down_read(&mm->mmap_sem); // Protect the VMA list
+ STORE_DELAY(/* in */ mmap_sem, /* out */ mmap_sem);
+ dump_mm(mm);
+ //
+ // Check if the starting virtual "address" is valid.
+ //
+ SAVE_ITC(/* out */ vma_time);
+ beg_vma = find_vma(mm, (vaddr_t) address); // Look up the first VMA for
+ // which "address < ->vm_end"
+ STORE_DELAY(/* in */ vma_time, /* out */ find_vma);
+ if (beg_vma == NULL || beg_vma->vm_start > (vaddr_t) address){
+ if (beg_vma != NULL)
+ dump_vma(beg_vma);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ PRINT_ERR(no_vma, (vaddr_t) address);
+ ERROR_CNT(non_existent_addr);
+ return -EFAULT;
+ }
+ //
+ // It is safe to start walking the PGD, the PMD and the PTE at "address".
+ //
+ dump_vma(beg_vma);
+ //
+ // We need the page table lock to synchronize with "kswapd"
+ // and the SMP-safe atomic PTE updates.
+ //
+ SAVE_ITC(/* out */ pgd_lock);
+ spin_lock(&mm->page_table_lock);
+ STORE_DELAY(/* in */ pgd_lock, /* out */ pgd_lock);
+ //
+ // Look up pages in the PGD and migrate them one by one.
+ //
+ rc = migr_vaddr_range_2((vaddr_t) address & PAGE_MASK,
+ /* round up */ PAGE_ALIGN(ulimit), node, mm);
+ //
+ // Let the others complete the page fault handler code. They will find the
+ // condition "someone has already installed the PTE" to be TRUE.
+ //
+ SAVE_ITC(/* out */ pgd_unlock);
+ spin_unlock(&mm->page_table_lock);
+ STORE_DELAY(/* in */ pgd_unlock, /* out */ pgd_unlock);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ return rc;
+}
+
+
+/*
+ * Migrate virtual address range belonging to a PGD.
+ *
+ * Arguments: address: Starting virtual address in a process's address space
+ * ulimit: The address range is below this upper limit
+ * node: Destination NUMA node
+ * mm: -> victim "mm_struct"
+ *
+ * Returns: On (partial) success, the number of the pages actually migrated is
+ * returned in "struct _un_success_count_".
+ * As usual, "-Exxx" returned on errors.
+ *
+ * Notes: - We've already checked that "[address...ulimit)" is inside the allowed
+ * user virtual address range.
+ * - Caller has to hold "mm->mmap_sem" for read and "mm->page_table_lock".
+ */
+STATIC INLINE long long
+migr_vaddr_range_2(vaddr_t address, const vaddr_t ulimit, const int node,
+ struct mm_struct * const mm)
+{
+ unsigned long g, m, e; // PGD, PMD and PTE indices
+ const pgd_t *pgd;
+ const pmd_t *pmd;
+ pte_t *pte, *pte0;
+ int rc;
+ struct _un_success_count_ count = {0, 0};
+ DECLARE_ITC_VAR(pgd_scan_t); // PGD scan time
+
+ //
+ // We've already checked that it is safe to start walking the PGD, the PMD and the
+ // PTE at "address". We've also checked that "[address...ulimit)" does not span
+ // over virtual address holes nor create an illegal alias to an otherwise
+ // valid address.
+ //
+ g = pgd_index(address); // PGD scan starts here
+ m = pmd_index(address); // The 1st PMD scan starts here
+ e = pte_index(address); // The 1st PTE scan starts here
+ //
+ // Check the user pages only, starting at the PTE corresponding to "address".
+ // Note: "mm->pgd" is an identity mapped virtual address.
+ //
+ SAVE_ITC(/* out */ pgd_scan_t);
+ for (pgd = mm->pgd + g; address < ulimit && g < USER_PTRS_PER_PGD;
+ // Other than the 1st PMD scans start at 0 index
+ m = 0, g++, pgd++){
+ PRINT_PGD("address: 0x%016lx pgd: 0x%p ", address, pgd);
+ PRINT_PGD("g: 0x%lx m: 0x%lx e: 0x%lx\n", g, m, e);
+ PRINT_PGD("__VA():\t 0x%016lx\n", __VA(g, m, e));
+ //
+ // "pgd" contains an identity mapped virtual address.
+ // Migration tolerates holes in the virtual address space.
+ //
+ if (pgd_none(*pgd) || pgd_bad(*pgd)){
+ address &= ~(PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE - 1);
+ address += PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE;
+ continue;
+ }
+ //
+ // "*pgd" is a physical address.
+ //
+ for (pmd = pmd_offset(pgd, 0) + m; m < PTRS_PER_PMD && address < ulimit;
+ // Other than the 1st PTE scans start at 0 index
+ e = 0, m++, pmd++){
+ //
+ // "pmd" contains an identity mapped virtual address.
+ // Migration tolerates holes in the virtual address space.
+ //
+ if (pmd_none(*pmd) || pmd_bad(*pmd)){
+ address &= ~(PTRS_PER_PTE * PAGE_SIZE - 1);
+ address += PTRS_PER_PTE * PAGE_SIZE;
+ continue;
+ }
+ //
+ // "*pmd" is a physical address.
+ //
+ pte0 = pte_offset_map(pmd, 0);
+ //
+ // "pte0" contains some kind of virtual address of the
+ // beginning of a PTE page.
+ //
+ for (pte = pte0 + e; e < PTRS_PER_PTE && address < ulimit;
+ address += PAGE_SIZE, e++, pte++){
+ if (!pte_present(*pte))
+ continue;
+ //
+ // We've found a page... Let's move it.
+ //
+ PRINT("\nVirtual addr:\t0x%016lx\n", __VA(g, m, e));
+ STORE_DELAY(/* in */ pgd_scan_t, /* out */ pgd_scan);
+ if ((rc = migr_1_page_by_pte(pte, node, mm)) < 0){
+ pte_unmap(pte0);
+ return rc;
+ }
+ SAVE_ITC(/* out */ pgd_scan_t);
+ if (rc > 0)
+ count.successful++;
+ else
+ count.failed++;
+ }
+ pte_unmap(pte0);
+ }
+ }
+ STORE_DELAY(/* in */ pgd_scan_t, /* out */ pgd_scan);
+ return *(long long *) &count; // Yeh, I know...
+}
+
+
+/*
+ * Common part of checking & migrating the pages one by one.
+ *
+ * Arguments: src_node: Source NUMA node
+ * old_p: -> old page structure
+ * node: Destination NUMA node
+ * mm: -> victim "mm_struct"
+ * pte: -> PTE of the page to be moved
+ *
+ * Returns: 1: Migration O. K.
+ * 0: Minor error, no actual migration has been done
+ * -Exxx: Catastrophic error
+ *
+ * Notes: - "mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ * - The old page is "get_page()"-ed on entry to meke sure it does not go
+ * away in the mean time - on return it gets "put_page()"-ed.
+ */
+STATIC int
+common_check_migrate_1_page(const int src_node, struct page * const old_p,
+ const int node, struct mm_struct * const mm, pte_t * const pte)
+{
+ struct page *new_p;
+ int rc;
+ DECLARE_ITC_VAR(alloc_time); // Time for "vmalloc()"
+ DECLARE_ITC_VAR(lock_time); // Time for "lock_page()"
+ DECLARE_ITC_VAR(unlock_time); // Time for "unlock_page()"
+ DECLARE_ITC_VAR(free_time); // "__free_pages()", "page_cache_release()"
+
+ //
+ // Allocate the new page in advance. It is less dangerous
+ // - to have a page "floating around" and then take locks
+ // than
+ // - to acquire some locks (e.g. to be able to check the conditions)
+ // and then allocate the page
+ //
+ // Do not insist on allocating the page...
+ //
+ SAVE_ITC(/* out */ alloc_time);
+ new_p = alloc_pages_node(node, GFP_HIGHUSER | __GFP_NORETRY, 0);
+ STORE_DELAY(/* in */ alloc_time, /* out */ page_alloc);
+ if (new_p == NULL){
+ put_page(old_p);
+ PRINT_ERR("No more memory on node %d\n", node);
+ ERROR_CNT(no_memory);
+ return -ENOMEM;
+ }
+ SAVE_ITC(/* out */ lock_time);
+ lock_page(old_p);
+ STORE_DELAY(/* in */ lock_time, /* out */ page_lock);
+
+ //
+ // Would be too long to do everything here.
+ //
+ rc = check_migr_1_page_part_2(old_p, new_p, mm, pte);
+
+ SAVE_ITC(/* out */ unlock_time);
+ unlock_page(old_p);
+ STORE_DELAY(/* in */ unlock_time, /* out */ page_unlock);
+ PRINT("check_migr_1_page_part_2() returned: %d\n", rc);
+ if (rc == 0){
+ //
+ // The old page was "lru_cache_add_active()"-ed e.g. in
+ // "do_anonymous_page()". As on entry the old page was again
+ // "get_page()"-ed, its reference counter is at least 2 right now.
+ //
+ page_cache_release(old_p);
+ _statistics.count[src_node][node]++;
+ } else{
+ SAVE_ITC(/* out */ free_time);
+ __free_pages(new_p, 0);
+ STORE_DELAY(/* in */ free_time, /* out */ page_free);
+ }
+ //
+ // On success, "put_page()" sets free the old page
+ // (unless someone else got hold of it in the mean time).
+ //
+ SAVE_ITC(/* out */ free_time);
+ put_page(old_p);
+ STORE_DELAY(/* in */ free_time, /* out */ page_free);
+ return rc == 0 ? 1 : 0;
+}
+
+
+/*
+ * Migrate a page identified by its PTE.
+ *
+ * Arguments: pte: -> PTE of the page to be moved
+ * node: Destination NUMA node
+ * mm: -> victim "mm_struct"
+ *
+ * Returns: 1: Success
+ * 0: We cannot cope with this page (it is valid, though)
+ * -Exxx: Fatal errors
+ *
+ * Note: "mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ */
+STATIC INLINE int
+migr_1_page_by_pte(pte_t * const pte, const int node, struct mm_struct * const mm)
+{
+ const phaddr_t old_addr = pte_val(*pte) & _PFN_MASK;
+ const int src_node = paddr_to_nid(old_addr);
+ struct page * const old_p = pfn_to_page(old_addr >> PAGE_SHIFT);
+
+ dump_page("\nOld", old_p);
+ if (node == src_node){
+ PRINT_ETC("Old ph adr:\t0x%016llx old node: %d new node: %d\n",
+ old_addr, src_node, node);
+ return 1; // Done :-)
+ }
+ //
+ // Actually, there is no need to grab the old page because it is sure that it
+ // has been "get_page()"-ed before and we still keep "->page_table_lock".
+ // We are going to invoke "common_check_migrate_1_page()" that is used by the
+ // physical address driven migration, too. This latter - not knowing in advance
+ // whom a page belongs to and what "->page_table_lock" is to take - needs to
+ // grab the old page.
+ // Invoking the common service requires us to do the same.
+ //
+ get_page(old_p); // Should we call "page_cache_get()" ?
+ //
+ // The old page will be "put_page()"-ed.
+ //
+ return common_check_migrate_1_page(src_node, old_p, node, mm, pte);
+}
+
+
+/*
+ * Migrate some pages identified by their physical address from a NUMA node to another.
+ *
+ * Arguments: table: -> the user buffer containing the physical addresses of
+ * the pages to be migrated.
+ * Max. "PAGE_SIZE / sizeof(phaddr_t *)" of them can be
+ * mifgrated at once.
+ * n: Number of the physical page addresses
+ * node: Destination NUMA node
+ * pid: Pages are assumed to belong to this process
+ *
+ * Returns: On (partial) success, the number of the pages actually migrated is
+ * returned (in form of "struct _un_success_count_").
+ * As usual, "-Exxx" returned on errors.
+ */
+STATIC INLINE long long
+batch_migrate(const caddr_t table, size_t n, const int node, const pid_t pid)
+{
+ int rc;
+ phaddr_t *p, *bp;
+ struct mm_struct *mm;
+ struct _un_success_count_ count = { 0, 0};
+ DECLARE_ITC_VAR(alloc_time); // Time for "vmalloc()"
+ DECLARE_ITC_VAR(mmap_sem); // Time for "down_read(&mm->mmap_sem)"
+ DECLARE_ITC_VAR(pgd_lock); // "spin_lock(&mm->page_table_lock)"
+ DECLARE_ITC_VAR(pgd_unlock); // "spin_unlock(&mm->page_table_lock)"
+ DECLARE_ITC_VAR(pgd_scan_t); // PGD scan time
+
+ if (pid != 0 && pid != current->pid){
+ //
+ // Look up the "mm_struct" belonging to the process ID.
+ //
+ if ((mm = look_up_mm(pid)) == NULL){
+ PRINT_ERR(illegal_pid);
+ ERROR_CNT(bad_request);
+ return -ESRCH;
+ }
+ //
+ // On success, "mm->mm_users" got incremented to make sure that
+ // "mm_struct" does not go away.
+ //
+ } else {
+ mm = current->mm;
+ //
+ // Actually, there is no need to grab "mm": it is ours and won't go
+ // away in the meantime. However, we do not want a special case in
+ // the release path when dropping the reference later on.
+ // It is safe just to increment the counter: it is ours.
+ //
+ atomic_inc(&mm->mm_users);
+ }
+ //
+ // Fetch the table of the addresses.
+ //
+ SAVE_ITC(/* out */ alloc_time);
+ bp = vmalloc(PAGE_SIZE);
+ STORE_DELAY(/* in */ alloc_time, /* out */ page_alloc);
+ if (bp == NULL){
+ mmput(mm);
+ PRINT_ERR(no_momory);
+ ERROR_CNT(no_memory);
+ return -ENOMEM;
+ }
+ if (copy_from_user(bp, table, n * sizeof(phaddr_t)) != 0){
+ vfree(bp);
+ mmput(mm);
+ PRINT_ERR(ill_user_buff);
+ ERROR_CNT(bad_request);
+ return -EFAULT;
+ }
+ SAVE_ITC(/* out */ mmap_sem);
+ down_read(&mm->mmap_sem); // Protect the VMA list
+ STORE_DELAY(/* in */ mmap_sem, /* out */ mmap_sem);
+ dump_mm(mm);
+ //
+ // We need the page table lock to synchronize with "kswapd"
+ // and the SMP-safe atomic PTE updates.
+ //
+ SAVE_ITC(/* out */ pgd_lock);
+ spin_lock(&mm->page_table_lock);
+ STORE_DELAY(/* in */ pgd_lock, /* out */ pgd_lock);
+ //
+ // Check to see if the pages are mapped by "mm->pgd" as user pages.
+ // For those which are, call "get_page()" to make sure they do not go away.
+ // "1" will be OR-ed to invalid addresses.
+ //
+ SAVE_ITC(/* out */ pgd_scan_t);
+ rc = check_pages_if_in_pgd(bp, n, mm);
+ STORE_DELAY(/* in */ pgd_scan_t, /* out */ pgd_scan);
+ if (rc >= 0){
+ //
+ // The number of the valid addresses is equal to "rc".
+ //
+ ERROR_CNT_ADD(non_existent_addr, n - rc);
+ for (n = rc, p = bp; n > 0; p++){
+#if defined(_TEST_)
+ if (p - bp >= PAGE_SIZE / sizeof(phaddr_t))
+ panic("\nAddress table overflow\n");
+#endif
+ if (*p & 1) // Address has not been validated
+ continue;
+ //
+ // Check & migrate the next page.
+ // The old page gets "put_page()"-ed.
+ //
+ if ((rc = check_migrate_1_page(*p, node, mm)) < 0)
+ break;
+ else if (rc > 0)
+ count.successful++;
+ else
+ count.failed++;
+ n--; // Decrement for a good (validated) address only
+ }
+ if (rc >= 0)
+ rc = *(long long *) &count; // Yeh, I know...
+ }
+ //
+ // Let the others complete the page fault handler code. They will find the
+ // condition "someone has already installed the PTE" to be TRUE.
+ //
+ SAVE_ITC(/* out */ pgd_unlock);
+ spin_unlock(&mm->page_table_lock);
+ STORE_DELAY(/* in */ pgd_unlock, /* out */ pgd_unlock);
+ up_read(&mm->mmap_sem);
+ vfree(bp);
+ mmput(mm); // Decrement "mm->mm_users" and free
+ // "*mm" if the counter becomes zero
+ return rc;
+}
+
+
+/*
+ * Check & migrate the pages one by one.
+ *
+ * Arguments: address: Physical address of the page to be migrated
+ * node: Destination NUMA node
+ * mm: -> victim "mm_struct"
+ *
+ * Returns: 1: Migration O. K.
+ * 0: Minor error, no actual migration has been done
+ * -Exxx: Catastrophic error
+ *
+ * Notes: - "mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ * - The old page is "get_page()"-ed on entry to make sure it does not go
+ * away in the meantime - on return it gets "put_page()"-ed.
+ */
+STATIC INLINE int
+check_migrate_1_page(const phaddr_t address, const int node, struct mm_struct * const mm)
+{
+ const int src_node = paddr_to_nid(address);
+ const unsigned long old_pfn = address >> PAGE_SHIFT;
+ struct page *old_p;
+
+ //
+ // Should be revised for the node hot plug :-)
+ //
+#if defined(_TEST_)
+ if (src_node == -1)
+ panic("\nCannot map source address to node\n");
+ if (!node_online(src_node))
+ panic("\nSource node not online\n");
+ if (!pfn_valid(old_pfn))
+ panic("\nNot a valid source pfn\n");
+#endif
+ old_p = pfn_to_page(old_pfn);
+ if (node == src_node){
+ PRINT_ETC("Old ph adr:\t0x%016llx old node: %d new node: %d\n",
+ address, src_node, node);
+ put_page(old_p);
+ return 1; // Done :-)
+ }
+ return common_check_migrate_1_page(src_node, old_p, node, mm, NULL);
+}
+
+
+/*
+ * The real page migration is done here.
+ *
+ * Arguments: old: -> old page structure
+ * new: -> new page structure
+ * mm: -> victim "mm_struct"
+ * pte: -> PTE of the page to be moved
+ *
+ * Returns: Negative values (like -Exxx) indicate errors
+ *
+ * Notes: - The old page has to be "get_page()"-ed and locked.
+ * - Its "pte_chain" has to locked.
+ * - The new page and its "pte_chain" has to locked.
+ * - "mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ */
+STATIC INLINE int
+page_migrate_2(struct page * const old, struct page * const new,
+ struct mm_struct * const mm, pte_t *pte_p)
+{
+ const struct page *pte_page;
+ struct vm_area_struct *vma;
+ pte_t pte;
+ vaddr_t vaddress;
+ DECLARE_ITC_VAR(vma_time); // Time for "find_vma()"
+ DECLARE_ITC_VAR(flush_tlb_time); // Time for "flush_tlb_page()"
+ DECLARE_ITC_VAR(add_lru_time); // Time for "lru_cache_add_active()"
+ DECLARE_ITC_VAR(copy_time); // Time for "copy_user_highpage()"
+ DECLARE_ITC_VAR(upd_mmu_cache); // Time for "update_mmu_cache()"
+
+ if (!PageDirect(old)){
+ PRINT_ERR("Direct mapped pages only\n");
+ ERROR_CNT(page_type_not_supp);
+ return -EFAULT;
+ }
+ if (pte_p == NULL) // Architecture independent code :-)
+ pte_p = rmap_ptep_map(old->pte.direct);
+ //
+ // "struct page" of the page that hotst the PTE.
+ //
+ pte_page = kmap_atomic_to_page(pte_p); // Architecture independent code :-)
+ //
+ // "pte_page->mapping" points at the victim process'es "mm_struct"
+ //
+#if defined(_TEST_)
+ if (mm != (struct mm_struct *) pte_page->mapping)
+ panic("\nBroken r-map ???\n");
+ dump_mm(mm);
+#endif
+ //
+ // "page->index" has the high bits of the address; the lower bits of the address
+ // are calculated from the offset of the PTE within the page table page.
+ //
+ vaddress = pte_page->index + ((unsigned long) pte_p & ~PAGE_MASK) * PTRS_PER_PTE;
+ PRINT("Virtual addr:\t0x%lx\n", vaddress);
+ //
+ // Double check if the virtual address is still valid.
+ //
+ SAVE_ITC(/* out */ vma_time);
+ vma = find_vma(mm, vaddress); // "vma" cache should help much
+ STORE_DELAY(/* in */ vma_time, /* out */ find_vma);
+ if (vma == NULL || vma->vm_start > vaddress){
+ PRINT_ERR("During mremap() ?\n");
+ ERROR_CNT(page_gone_away);
+ rmap_ptep_unmap(pte_p);
+ return -EFAULT;
+ }
+ dump_vma(vma);
+ //
+ // Nuke the page table entry.
+ //
+ flush_cache_page(vma, vaddress); // Architecture independent code :-)
+ pte = ptep_get_and_clear(pte_p);
+ SAVE_ITC(/* out */ flush_tlb_time);
+ flush_tlb_page(vma, vaddress); // Architecture independent code :-)
+ STORE_DELAY(/* in */ flush_tlb_time, /* out */ flush_tlb);
+ //
+ // From now on, the other CPUs cannot touch the content of the page. Should they
+ // try to, they would observe page faults. They easily pass "mmap_sem" because
+ // they take it for read, too. As we hold "page_table_lock", they queue up in the
+ // page fault handler.
+ //
+ PRINT("Old ph addr:\t0x%016lx\n", page_to_phys(old));
+ PRINT("Old PTE:\t0x%016lx\n", pte_val(pte));
+ PRINT("_PFN_MASK:\t0x%016lx\n", _PFN_MASK);
+ //
+ // Copy some of the page structure.
+ //
+ dump_page("Source", old);
+ new->flags = (new->flags & ~FLAG_MASK) | (old->flags & FLAG_MASK);
+ new->pte.direct = old->pte.direct;
+ SetPageDirect(new); // Direct mapped pages only
+ old->pte.direct = NULL;
+ ClearPageDirect(old);
+ if (PagePrivate(new))
+ new->private = old->private;
+ SAVE_ITC(/* out */ add_lru_time);
+ lru_cache_add_active(new);
+ STORE_DELAY(/* in */ add_lru_time, /* out */ add_lru);
+ dump_page("New", new);
+ //
+ // Here is where the data is copied.
+ //
+ SAVE_ITC(/* out */ copy_time);
+ copy_user_highpage(new, old, vaddress); // Architecture independent code :-)
+ STORE_DELAY(/* in */ copy_time, /* out */ copy);
+ //
+ // The new PTE keeps everything but the PFN.
+ //
+ pte = mk_pte(new, __pgprot((pte_val(pte) & ~_PFN_MASK)));
+ PRINT("New ph addr:\t0x%016lx\nNew PTE:\t0x%016lx\n\n",
+ page_to_phys(new), pte_val(pte));
+ set_pte(pte_p, pte);
+ SAVE_ITC(/* out */ upd_mmu_cache);
+ update_mmu_cache(vma, vaddress, pte); // Architecture independent code :-)
+ STORE_DELAY(/* in */ upd_mmu_cache, /* out */ update_mmu_cache);
+ rmap_ptep_unmap(pte_p);
+ return 0;
+}
+
+
+/*
+ * Some more tests and go on with the page migration.
+ *
+ * Arguments: old: -> old page structure
+ * new: -> new page structure
+ * mm: -> victim "mm_struct"
+ * pte: -> PTE of the page to be moved
+ *
+ * Returns: Negative values indicate errors
+ *
+ * Notes: - The old page has to be "get_page()"-ed and locked.
+ * - "mm->page_table_lock" and "mm->mmap_sem" have to be held.
+ */
+STATIC INLINE int
+check_migr_1_page_part_2(struct page * const old, struct page * const new,
+ struct mm_struct * const mm, pte_t * const pte)
+{
+ int rc;
+ DECLARE_ITC_VAR(pte_chain_lock_time); // Time for "pte_chain_lock()"
+ DECLARE_ITC_VAR(unlock_time); // Time for "unlock_page()"
+
+ if (PageReserved(old)){
+ PRINT_ERR("What shall I do with a reserved page ?\n");
+ ERROR_CNT(page_type_not_supp);
+ return -ENXIO;
+ }
+ if (PageError(old)){
+ PRINT_ERR("Page has got error(s)\n");
+ ERROR_CNT(errors);
+ return -EIO;
+ }
+ if (!PageUptodate(old)){
+ PRINT_ERR("Page has no valid data ???\n");
+// return -EIO;
+ }
+ if (PageCompound(old)){
+ PRINT_ERR("What shall I do with a compound page ?\n");
+ ERROR_CNT(page_type_not_supp);
+ return -ENXIO;
+ }
+ if (old->mapping != NULL){
+ PRINT_ERR("Anonymous pages only\n");
+ ERROR_CNT(page_type_not_supp);
+ return -ENXIO;
+ }
+ if (PageSwapCache(old)){
+ PRINT_ERR("What shall I do with a page in swap cache ?\n");
+ ERROR_CNT(page_type_not_supp);
+ return -ENXIO;
+ }
+ if (PageHighMem(old)){
+ PRINT_ERR("What shall I do with a HIGHMEM page ?\n");
+ ERROR_CNT(page_type_not_supp);
+ return -ENXIO;
+ }
+ SAVE_ITC(/* out */ pte_chain_lock_time);
+ pte_chain_lock(old);
+ STORE_DELAY(/* in */ pte_chain_lock_time, /* out */ pte_chain_lock);
+ if (!page_mapped(old)){ // Actually means "r-mapped"
+ PRINT_ERR("Page not in r-map\n");
+ pte_chain_unlock(old);
+ ERROR_CNT(page_type_not_supp);
+ return -EFAULT;
+ }
+ //
+ // As nobody else should know about this new page, taking these locks should not
+ // be in conflict with anything.
+ //
+ if (TestSetPageLocked(new))
+ panic("\nSomeone is stealing my new page\n");
+ if (!pte_chain_trylock(new))
+ panic("\nSomeone is stealing my new pte chain\n");
+
+ //
+ // The real page migration.
+ //
+ rc = page_migrate_2(old, new, mm, pte);
+
+ pte_chain_unlock(new);
+ SAVE_ITC(/* out */ unlock_time);
+ unlock_page(new);
+ STORE_DELAY(/* in */ unlock_time, /* out */ new_page_unlock);
+ pte_chain_unlock(old);
+ return rc;
+}
+
+
+/*
+ * Check to see if the pages are mapped by "mm->pgd" as user pages.
+ * For those which are, call "get_page()" to make sure they do not go away.
+ *
+ * Arguments: phaddresses: -> the user buffer containing the physical addresses of
+ * the pages to be migrated
+ * n: Number of the physical page addresses
+ * mm: -> victim "mm_struct"
+ *
+ * Returns: The number of the addresses validated
+ *
+ * Note: Caller has to hold "mm->mmap_sem" for read and "mm->page_table_lock".
+ */
+STATIC INLINE int
+check_pages_if_in_pgd(phaddr_t * const phaddresses, const size_t n,
+ const struct mm_struct * const mm)
+{
+ const pgd_t *pgd;
+ const pmd_t *pmd;
+ const pte_t *pte, *pte0;
+ unsigned long g, m, e;
+ unsigned int i, found = 0;
+ phaddr_t *p;
+
+ //
+ // Mark the addresses as not already validated.
+ //
+ for (i = 0, p = phaddresses; i < n; i++, p++)
+ *p |= 1; // "(1 & _PFN_MASK) == 0"
+ //
+ // Check the user pages only.
+ // Note: "mm->pgd" is an identity mapped virtual address.
+ //
+ for (g = FIRST_USER_PGD_NR, pgd = mm->pgd + g; g < USER_PTRS_PER_PGD;
+ g++, pgd++){
+ if (pgd_none(*pgd) || pgd_bad(*pgd))
+ continue;
+ //
+ // "*pgd" is a physical address.
+ //
+ for (m = 0, pmd = pmd_offset(pgd, 0); m < PTRS_PER_PMD; m++, pmd++){
+ //
+ // "pmd" contains an identity mapped virtual address.
+ //
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ continue;
+ //
+ // "*pmd" is a physical address.
+ //
+ pte0 = pte_offset_map(pmd, 0);
+ //
+ // "pte0" contains some kind of virtual address of the
+ // beginning of a PTE page.
+ //
+ for (e = 0, pte = pte0; e < PTRS_PER_PTE; e++, pte++){
+ if (!pte_present(*pte))
+ continue;
+ //
+ // Check this PTE against the list of the addresses.
+ //
+ for (i = 0, p = phaddresses; i < n; i++, p++){
+ if ((pte_val(*pte) & _PFN_MASK) !=
+ (*p & _PFN_MASK))
+ continue;
+ *p &= _PFN_MASK; // Validate the address
+ PRINT("Virtual addr:\t0x%016lx\n",
+ __VA(g, m, e));
+ //
+ // Make sure the page does not go away.
+ // Should we call "page_cache_get()" instead ?
+ //
+ get_page(pfn_to_page(*p >> PAGE_SHIFT));
+ if (++found == n){
+ pte_unmap(pte0);
+ return found;
+ }
+ //
+ // There should be no more than one address on
+ // the list that matches this PTE.
+ //
+ break;
+ }
+ }
+ pte_unmap(pte0);
+ }
+ }
+ return found;
+}
+
+
+/*
+ * Look up an "mm_struct" belonging to a process ID.
+ *
+ * "NULL" is returned on failure.
+ *
+ * Notes: - On success, "->mm_users" gets incremented to make sure that "mm_struct"
+ * does not go away.
+ * - "->mm" of a kernel thread is "NULL"; anyway, we don't dare to touch a
+ * kernel thread
+ */
+STATIC struct mm_struct *
+look_up_mm(const pid_t pid)
+{
+ struct task_struct *p;
+ struct mm_struct *mm;
+ DECLARE_ITC_VAR(time); // "mm" look up time
+
+ SAVE_ITC(/* out */ time);
+ read_lock(&tasklist_lock);
+ if ((p = find_task_by_pid(pid)) == NULL){
+ read_unlock(&tasklist_lock);
+ STORE_DELAY(/* in */ time, /* out */ mm_lookup);
+ return NULL;
+ }
+ //
+ // "get_task_mm()" includes "task_lock()" that "nests both inside and outside of
+ // read_lock(&tasklist_lock)" - as a note in "sched.h" states.
+ //
+ mm = get_task_mm(p); // Can be "NULL" for a kernel thread
+ //
+ // On success, "mm->mm_users" got incremented to make sure that "mm_struct" does
+ // not go away.
+ //
+ read_unlock(&tasklist_lock);
+ STORE_DELAY(/* in */ time, /* out */ mm_lookup);
+ return mm;
+}
+
+
+#if defined(_NEED_STATISTICS_)
+
+
+/*
+ * Fetch and clear the statistics.
+ *
+ * Accessed in a non atomic way. Who cares? Just some statistics :-)
+ */
+STATIC INLINE int
+page_migrate_statistics(const caddr_t vaddress, const int flag)
+{
+ //
+ // Assuming all the CPU-s are clocked at the same frequency.
+ //
+ _statistics.t.cyc_per_usec = local_cpu_data->cyc_per_usec;
+ if (copy_to_user(vaddress, &_statistics, sizeof _statistics) != 0)
+ return -EFAULT;
+ if (flag)
+ memset(&_statistics, 0,sizeof _statistics);
+ return 0;
+}
+
+
+#endif // #if defined(_NEED_STATISTICS_)
+
+
+#if defined(_TEST_)
+
+
+void
+dump_mm(const struct mm_struct * const mm)
+{
+ if (_pr_flag_ & PRINT_mm){
+ PRINT("mm: 0x%p\n", mm);
+ PRINT("mmap: 0x%p mm_rb.rb_node: 0x%p\n", mm->mmap, mm->mm_rb.rb_node);
+ PRINT("mmap_cache: 0x%p free_area_cache: 0x%lx\n", mm->mmap_cache,
+ mm->free_area_cache);
+ PRINT("pgd: 0x%p mm_users: %d mm_count: %d map_count: %d\n",
+ mm->pgd, atomic_read(&mm->mm_users),
+ atomic_read(&mm->mm_count), mm->map_count);
+ PRINT("mmap_sem.count: %d mmap_sem.wait_lock: %d\n", mm->mmap_sem.count,
+ mm->mmap_sem.wait_lock.lock);
+ PRINT("&mmap_sem.wait_list: 0x%p next: 0x%p prev: 0x%p\n",
+ &mm->mmap_sem.wait_list, mm->mmap_sem.wait_list.next,
+ mm->mmap_sem.wait_list.prev);
+ PRINT("page_table_lock: %u\n", mm->page_table_lock.lock);
+ PRINT("&mmlist: 0x%p next: 0x%p prev: 0x%p\n", &mm->mmlist,
+ mm->mmlist.next, mm->mmlist.prev);
+ PRINT("start_code: 0x%lx end_code: 0x%lx\n", mm->start_code,
+ mm->end_code);
+ PRINT("start_data: 0x%lx end_data: 0x%lx\n", mm->start_data,
+ mm->end_data);
+ PRINT("start_brk: 0x%lx brk: 0x%lx start_stack: 0x%lx\n", mm->start_brk,
+ mm->brk, mm->start_stack);
+ PRINT("arg_start: 0x%lx arg_end: 0x%lx\n", mm->arg_start, mm->arg_end);
+ PRINT("env_start: 0x%lx env_end: 0x%lx\n", mm->env_start, mm->env_end);
+ PRINT("rss: 0x%lx total_vm: 0x%lx locked_vm: 0x%lx\n", mm->rss,
+ mm->total_vm, mm->locked_vm);
+ PRINT("def_flags: 0x%lu cpu_vm_mask: 0x%lx\n", mm->def_flags,
+ mm->cpu_vm_mask);
+// unsigned long saved_auxv[40];
+ PRINT("dumpable: %u ", mm->dumpable);
+#ifdef CONFIG_HUGETLB_PAGE
+ PRINT("used_hugetlb: 0x%d ", mm->used_hugetlb);
+#endif
+ PRINT("context: 0x%lu core_waiters: %d\n", mm->context,
+ mm->core_waiters);
+ PRINT("core_startup_done: 0x%p\n", mm->core_startup_done);
+ PRINT("core_done.done: %d ", mm->core_done.done);
+ PRINT("core_done.wait.lock: %u\n", mm->core_done.wait.lock.lock);
+ PRINT("&core_done.wait.task_list: 0x%p next: 0x%p prev: 0x%p\n",
+ &mm->core_done.wait.task_list, mm->core_done.wait.task_list.next,
+ mm->core_done.wait.task_list.prev);
+ PRINT("ioctx_list_lock.read_counter: %d ",
+ mm->ioctx_list_lock.read_counter);
+ PRINT("ioctx_list_lock.write_lock: %d\n",
+ mm->ioctx_list_lock.write_lock);
+ PRINT("ioctx_list: 0x%p &default_kioctx: 0x%p\n\n", mm->ioctx_list,
+ &mm->default_kioctx);
+ }
+}
+
+
+void
+dump_vma(const struct vm_area_struct * const vma)
+{
+ if (_pr_flag_ & PRINT_vma){
+ PRINT("vm_area: 0x%p\n", vma);
+ PRINT("mm: 0x%p\n", vma->vm_mm);
+ PRINT("start: 0x%lx end: 0x%lx\n", vma->vm_start, vma->vm_end);
+ PRINT("next: 0x%p &rb: 0x%p\n", vma->vm_next, &vma->vm_rb);
+ PRINT("prot: 0x%lx flags: 0x%lx\n", pgprot_val(vma->vm_page_prot),
+ vma->vm_flags);
+ PRINT("&shared: 0x%p next: 0x%p prev: 0x%p\n", &vma->shared,
+ vma->shared.next, vma->shared.prev);
+ PRINT("ops: 0x%p private: 0x%p\n", vma->vm_ops, vma->vm_private_data);
+ PRINT("file: 0x%p pgoff: 0x%lx\n\n", vma->vm_file, vma->vm_pgoff);
+ }
+}
+
+
+void
+dump_page(const char * const text, const struct page * const p)
+{
+ if (_pr_flag_ & PRINT_page){
+ PRINT("%s page struct: 0x%p\n", text, p);
+ PRINT("flags: 0x%lx count: 0x%x\n", p->flags, atomic_read(&p->count));
+ PRINT("&list: 0x%p next: 0x%p prev: 0x%p\n", &p->list,
+ p->list.next, p->list.prev);
+ PRINT("mapping: 0x%p index: 0x%lx\n", p->mapping, p->index);
+ PRINT("&lru: 0x%p next: 0x%p prev: 0x%p\n", &p->lru,
+ p->lru.next, p->lru.prev);
+ if (PageDirect(p))
+ PRINT("pte.direct: 0x%p", p->pte.direct);
+ else
+ PRINT("pte.chain: 0x%p", p->pte.chain);
+ PRINT(" private: 0x%lx\n", p->private);
+#if defined(WANT_PAGE_VIRTUAL)
+ PRINT("virtual: 0x%p\n", p->virtual);
+#endif
+ PRINT("\n");
+ }
+}
+
+
+void
+dump_pte_stuff(const pte_t * const pte_addr)
+{
+ if (_pr_flag_ & PRINT_pte){
+ // "struct page" of the page that hosts the PTE.
+ const struct page * const pte_page = kmap_atomic_to_page(pte_addr);
+
+ PRINT("pte_paddr: 0x%p pte: 0x%016lx\n", pte_addr, pte_val(*pte_addr));
+ dump_page("\npte", pte_page);
+ }
+}
+
+
+#define _DATA_ 0x6000000000000000UL // User data segment
+
+
+/*
+ * Give me a valid physical address (if "vaddress == -1"),
+ * otherwise translate a user mode virtual address to a physical one.
+ */
+phaddr_t
+gimme_an_address(caddr_t vaddress)
+{
+ const struct vm_area_struct *vma;
+ const pgd_t *pgd;
+ const pmd_t *pmd;
+ const pte_t *pte;
+ phaddr_t phaddress = -EFAULT;
+
+ if (vaddress == (caddr_t) -1)
+ vaddress = (caddr_t) _DATA_;
+ PRINT("Virtual addr:\t0x%016lx\n", (vaddr_t) vaddress);
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, (vaddr_t) vaddress);
+ if (vma == NULL || vma->vm_start > (vaddr_t) vaddress){
+ up_read(&current->mm->mmap_sem);
+ return -EFAULT;
+ }
+ spin_lock(&current->mm->page_table_lock);
+ do {
+ pgd = pgd_offset(current->mm, (vaddr_t) vaddress);
+ if (pgd_none(*pgd) || pgd_bad(*pgd))
+ break;
+ pmd = pmd_offset(pgd, (vaddr_t) vaddress);
+ if (pmd_none(*pmd) || pmd_bad(*pmd))
+ break;
+ pte = pte_offset_map(pmd, (vaddr_t) vaddress);
+ if (!pte_present(*pte)){
+ pte_unmap(pte);
+ break;
+ }
+ phaddress = pte_pfn(*pte) << PAGE_SHIFT;
+ pte_unmap(pte);
+ } while (0);
+ spin_unlock(&current->mm->page_table_lock);
+ up_read(&current->mm->mmap_sem);
+ PRINT("Physical addr:\t0x%016llx\n", (long long) phaddress);
+ return phaddress;
+}
+
+
+#endif // #if defined(_TEST_)
diff -Nru 2.6.4.ref/include/asm-generic/rmap.h 2.6.4.mig4/include/asm-generic/rmap.h
--- 2.6.4.ref/include/asm-generic/rmap.h Tue Mar 16 10:18:15 2004
+++ 2.6.4.mig4/include/asm-generic/rmap.h Thu Mar 25 08:59:42 2004
@@ -87,4 +87,45 @@
}
#endif

+/*
+ * Shared pages have a chain of pte_chain structures, used to locate
+ * all the mappings to this page. We only need a pointer to the pte
+ * here, the page struct for the page table page contains the process
+ * it belongs to and the offset within that process.
+ *
+ * We use an array of pte pointers in this structure to minimise cache misses
+ * while traversing reverse maps.
+ */
+#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
+
+/*
+ * next_and_idx encodes both the address of the next pte_chain and the
+ * offset of the highest-index used pte in ptes[].
+ */
+struct pte_chain {
+ unsigned long next_and_idx;
+ pte_addr_t ptes[NRPTE];
+} ____cacheline_aligned;
+
+static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
+{
+ return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
+}
+
+static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
+{
+ return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
+}
+
+static inline int pte_chain_idx(struct pte_chain *pte_chain)
+{
+ return pte_chain->next_and_idx & NRPTE;
+}
+
+static inline unsigned long
+pte_chain_encode(struct pte_chain *pte_chain, int idx)
+{
+ return (unsigned long)pte_chain | idx;
+}
+
#endif /* _GENERIC_RMAP_H */
diff -Nru 2.6.4.ref/include/asm-ia64/page_migrate.h 2.6.4.mig4/include/asm-ia64/page_migrate.h
--- 2.6.4.ref/include/asm-ia64/page_migrate.h Thu Jan 1 01:00:00 1970
+++ 2.6.4.mig4/include/asm-ia64/page_migrate.h Thu Mar 25 08:59:42 2004
@@ -0,0 +1,245 @@
+#define _TEST_
+#define _NEED_STATISTICS_
+
+
+/*
+ * Migrate pages from a NUMA node to another.
+ * ==========================================
+ *
+ * Version 0.1, 23rd of March 2004
+ * By Zoltan Menyhart, Bull S.A. <[email protected]>
+ * The usual GPL applies.
+ *
+ * (See "migrate.txt".)
+ *
+ * System call syntax:
+ *
+ * long long sys_page_migrate(int command, caddr_t address, size_t length,
+ * int node, pid_t pid);
+ *
+ * On error "-1" is returned and "errno" holds the error code.
+ *
+ * The following commands are available:
+ */
+enum {
+/*
+ * - Return a physical address.
+ * (testing only, the kernel has to be compiled with "#define _TEST_")
+ */
+ _GIMME_AN_ADDRESS_,
+/*
+ * On entry, if "address" is a valid virtual address in the address space of the
+ * current task with an existing backing page, then its physical address is returned;
+ * if it is equal to "-1L", then the system finds a valid physical address on its own.
+ * The other arguments are don't care.
+ *
+ * - Fetch and clear the statistics.
+ */
+ _STATISTICS_,
+/*
+ * "address" is a pointer to the user's buffer. If "length != 0" then having bben
+ * fetched, the statistics get cleared. The other arguments are don't care.
+ *
+ * - Obtain the size of the statistics structure (see "struct _statistics_size_"):
+ */
+ _SIZEOF_STATISTICS_,
+/*
+ * The arguments are don't care.
+ *
+ * - Batch migrate pages from a NUMA node to another.
+ */
+ _PHADDR_BATCH_MIGRATE_,
+/*
+ * "address" points at the user table containing the physical address of the pages to
+ * be migrated. "length" is the number of the physical addresses in the buffer. Max.
+ * "PAGE_SIZE / sizeof(phaddr_t)" of them can be mifgrated at once.
+ * "node" is the destination NUMA node.
+ * Addresses are assumed to belong to the process indicated by "pid".
+ * The number of the pages actually migrated is returned
+ * (see "struct _un_success_count_).
+ *
+ * - Migrate virtual address range of a process:
+ */
+ _VA_RANGE_MIGRATE_,
+/*
+ * "sddress" is the starting virtual address in a process'es address space.
+ * "length" is the length of the address range to be migrated
+ * Addresses are assumed to belong to the process indicated by "pid".
+ * The number of the pages actually migrated is returned
+ * (see "struct _un_success_count_).
+ */
+};
+
+
+/*
+ * Type of a physical address -- hopefully enough for all architectures.
+ * (We allow negative values, too, for indicating some errors.)
+ */
+typedef long long phaddr_t;
+
+
+struct _un_success_count_ {
+ unsigned int successful; // Pages successfully migrated
+ unsigned int failed; // Minor failures
+};
+
+
+struct _statistics_size_ {
+ unsigned int sizeof_statistics; // sizeof(struct _statistics_)
+ unsigned int max_nodes; // MAX_NUMNODES
+};
+
+
+/*
+ * Statistics are accessed in a non atomic way. Who cares? Just some statistics :-)
+ */
+struct _statistics_ {
+ struct { // Error counters
+ unsigned long non_existent_addr;
+ unsigned long page_gone_away;
+ unsigned long busy;
+ unsigned long bad_request;
+ unsigned long no_memory; // On the target node
+ unsigned long page_type_not_supp;
+ unsigned long errors; // "PageError(page)" is set
+ } e;
+ struct { // Clock ticks
+ unsigned long total;
+ unsigned long page_alloc;
+ unsigned long page_free;
+ unsigned long page_lock;
+ unsigned long new_page_unlock;
+ unsigned long page_unlock;
+ unsigned long validation;
+ unsigned long pgd_scan;
+ unsigned long pgd_lock;
+ unsigned long pgd_unlock;
+ unsigned long mm_list_lock;
+ unsigned long mmap_sem;
+ unsigned long pte_chain_lock;
+ unsigned long find_vma;
+ unsigned long flush_tlb;
+ unsigned long add_lru;
+ unsigned long copy;
+ unsigned long update_mmu_cache;
+ unsigned long mm_lookup;
+ unsigned long cyc_per_usec;
+ unsigned long perfbullctl;
+ unsigned long pci_cfg_rd;
+ unsigned long pci_cfg_wr;
+ } t;
+ struct { // Event counters
+ unsigned long mm_hit;
+ unsigned long pgd_scan;
+ unsigned long perfbullctl;
+ unsigned long pci_cfg_rd;
+ unsigned long pci_cfg_wr;
+ } c;
+#if defined(__KERNEL__)
+ unsigned long count[MAX_NUMNODES][MAX_NUMNODES];
+#else
+ unsigned long count[0][0];
+#endif
+};
+
+
+#if !defined(__KERNEL__)
+
+
+#include <unistd.h>
+#include <sys/types.h>
+
+#if !defined(__NR_page_migrate)
+#define __NR_page_migrate 1276
+#endif
+
+
+/*
+ * Migrate some pages of the process of PID.
+ */
+static inline int
+migrate_ph_pages(const phaddr_t * const table, const size_t length, const int node,
+ struct _un_success_count_ * const p, const pid_t pid)
+{
+ union {
+ long long ll;
+ struct _un_success_count_ s;
+ } u;
+
+ u.ll = syscall(__NR_page_migrate, _PHADDR_BATCH_MIGRATE_,
+ table, length, node, pid);
+ if (u.ll == -1)
+ return -1;
+ if (p != NULL){
+ p->successful = u.s.successful;
+ p->failed = u.s.failed;
+ }
+ return 0;
+}
+
+
+/*
+ * Migrate virtual address range of the process of PID.
+ */
+static inline int
+migrate_virt_addr_range(const caddr_t address, const size_t length, const int node,
+ struct _un_success_count_ * const p, const pid_t pid)
+{
+ union {
+ long long ll;
+ struct _un_success_count_ s;
+ } u;
+
+ u.ll = syscall(__NR_page_migrate, _VA_RANGE_MIGRATE_,
+ address, length, node, pid);
+ if (u.ll == -1)
+ return -1;
+ if (p != NULL){
+ p->successful = u.s.successful;
+ p->failed = u.s.failed;
+ }
+ return 0;
+}
+
+
+/*
+ * Obtain the size of the statistics structure.
+ */
+static inline int
+get_stat_sizes(struct _statistics_size_ * const p)
+{
+ union {
+ long long ll;
+ struct _statistics_size_ s;
+ } u;
+
+ u.ll = syscall(__NR_page_migrate, _SIZEOF_STATISTICS_, 0, 0, 0, 0);
+ if (u.ll == -1)
+ return -1;
+ if (p != NULL)
+ *p = u.s;
+ return 0;
+}
+
+
+/*
+ * Fetch and clear the statistics.
+ */
+static inline int
+get_staistics(struct _statistics_ * const p, const long slear_flag)
+{
+ return syscall(__NR_page_migrate, _STATISTICS_, p, slear_flag, 0, 0);
+}
+
+
+/*
+ * Return a physical address.
+ */
+static inline phaddr_t
+gimme_a_ph_address(const caddr_t p)
+{
+ return syscall(__NR_page_migrate, _GIMME_AN_ADDRESS_, p, 0, 0, 0);
+}
+
+
+#endif
diff -Nru 2.6.4.ref/include/asm-ia64/pgtable.h 2.6.4.mig4/include/asm-ia64/pgtable.h
--- 2.6.4.ref/include/asm-ia64/pgtable.h Tue Mar 16 10:18:15 2004
+++ 2.6.4.mig4/include/asm-ia64/pgtable.h Thu Mar 25 08:59:42 2004
@@ -112,6 +112,28 @@
#define PTRS_PER_PTE (__IA64_UL(1) << (PAGE_SHIFT-3))

/*
+ * The IA64 architecture does not decode all the MSB-s of virtual addresses for PGD, PMD
+ * and PTE indices, i.e. IA64 has got holes or aliases in the virtual address space.
+ * These def's are provided to check to see if an "address" -- "length" pair spans over
+ * virtual address holes or it creates illegal alias to an otherwise valid address.
+ * (User mode only.)
+ */
+#define __VA_BITS_PER_REGION (PAGE_SHIFT - 3 - 3 + /* PGD low index */ \
+ 2 * (PAGE_SHIFT - 3) + /* PMD and PTE indices */ \
+ PAGE_SHIFT) /* The page itself */
+#define __VA_ALIAS_MASK ((1UL << __VA_BITS_PER_REGION) - 1)
+#define __IS_VA_ALIAS(address, length) \
+ ((~__VA_ALIAS_MASK & (address)) != \
+ (~__VA_ALIAS_MASK & ((address) + (length) - 1)))
+
+/*
+ * Virtual address composed by use of PGD, PMD and PTE indices:
+ */
+#define __VA(pgdi, pmdi, ptei) (((pgdi) >> (PAGE_SHIFT - 6)) << 61 | \
+ ((pgdi) & ((PTRS_PER_PGD >> 3) - 1)) << PGDIR_SHIFT | \
+ (pmdi) << PMD_SHIFT | (ptei) << PAGE_SHIFT)
+
+/*
* All the normal masks have the "page accessed" bits on, as any time
* they are used, the page is accessed. They are cleared only by the
* page-out routines.
@@ -325,8 +347,10 @@
(init_mm.pgd + (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)))

/* Find an entry in the second-level page table.. */
-#define pmd_offset(dir,addr) \
- ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+#define pmd_index(addr) \
+ (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define pmd_offset(dir, addr) \
+ ((pmd_t *) pgd_page(*(dir)) + pmd_index(addr))

/*
* Find an entry in the third-level page table. This looks more complicated than it
diff -Nru 2.6.4.ref/include/asm-ia64/rmap-locking.h 2.6.4.mig4/include/asm-ia64/rmap-locking.h
--- 2.6.4.ref/include/asm-ia64/rmap-locking.h Thu Jan 1 01:00:00 1970
+++ 2.6.4.mig4/include/asm-ia64/rmap-locking.h Thu Mar 25 08:59:42 2004
@@ -0,0 +1,25 @@
+/*
+ * include/linux/rmap-locking.h
+ *
+ * Locking primitives for exclusive access to a page's reverse-mapping
+ * pte chain.
+ */
+
+#include <linux/slab.h>
+
+struct pte_chain;
+extern kmem_cache_t *pte_chain_cache;
+
+#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &page->flags)
+#define pte_chain_trylock(page) bit_spin_trylock(PG_chainlock, &page->flags)
+#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &page->flags)
+
+struct pte_chain *pte_chain_alloc(int gfp_flags);
+void __pte_chain_free(struct pte_chain *pte_chain);
+
+static inline void pte_chain_free(struct pte_chain *pte_chain)
+{
+ if (pte_chain)
+ __pte_chain_free(pte_chain);
+}
+
diff -Nru 2.6.4.ref/include/linux/rmap-locking.h 2.6.4.mig4/include/linux/rmap-locking.h
--- 2.6.4.ref/include/linux/rmap-locking.h Tue Mar 16 10:18:15 2004
+++ 2.6.4.mig4/include/linux/rmap-locking.h Thu Mar 25 08:59:42 2004
@@ -11,6 +11,7 @@
extern kmem_cache_t *pte_chain_cache;

#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &page->flags)
+#define pte_chain_trylock(page) bit_spin_trylock(PG_chainlock, &page->flags)
#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &page->flags)

struct pte_chain *pte_chain_alloc(int gfp_flags);
diff -Nru 2.6.4.ref/mm/rmap.c 2.6.4.mig4/mm/rmap.c
--- 2.6.4.ref/mm/rmap.c Tue Mar 16 10:18:17 2004
+++ 2.6.4.mig4/mm/rmap.c Thu Mar 25 09:00:13 2004
@@ -46,40 +46,9 @@
* We use an array of pte pointers in this structure to minimise cache misses
* while traversing reverse maps.
*/
-#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))
-
-/*
- * next_and_idx encodes both the address of the next pte_chain and the
- * offset of the highest-index used pte in ptes[].
- */
-struct pte_chain {
- unsigned long next_and_idx;
- pte_addr_t ptes[NRPTE];
-} ____cacheline_aligned;

kmem_cache_t *pte_chain_cache;

-static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
-{
- return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
-}
-
-static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
-{
- return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
-}
-
-static inline int pte_chain_idx(struct pte_chain *pte_chain)
-{
- return pte_chain->next_and_idx & NRPTE;
-}
-
-static inline unsigned long
-pte_chain_encode(struct pte_chain *pte_chain, int idx)
-{
- return (unsigned long)pte_chain | idx;
-}
-
/*
* pte_chain list management policy:
*
diff -Nru 2.6.4.ref/test/migstat.c 2.6.4.mig4/test/migstat.c
--- 2.6.4.ref/test/migstat.c Thu Jan 1 01:00:00 1970
+++ 2.6.4.mig4/test/migstat.c Thu Mar 25 09:02:00 2004
@@ -0,0 +1,130 @@
+/*
+ * Display and reset page migration statistics.
+ *
+ * Usage: migstat [-c]
+ */
+
+
+#include <stdio.h>
+#include <string.h> // For "strcmp()"
+#include <errno.h>
+#include <malloc.h>
+#include "page_migrate.h"
+
+#define CONV(x) x, (x * mult + div / 2) / div, (x * mult + div / 2) / div / 1000
+
+extern int errno;
+
+struct _statistics_ *sp;
+struct _statistics_size_ ss;
+
+main(const int argc, const char * const argv[])
+{
+ int from, to;
+ unsigned long *p;
+ unsigned long ok = 0;
+ unsigned long mult = 1, div = 1;
+ unsigned long time;
+ int clear_flag = 0;
+
+ if (argc == 2 && strcmp(argv[1], "-c") == 0)
+ clear_flag = 1;
+ else if (argc != 1){
+ fprintf(stderr, "Usage: %s [-c]\n", argv[0]);
+ return 1;
+ }
+ if (get_stat_sizes(&ss) < 0){
+ perror("get_stat_sizes()");
+ return 1;
+ }
+ if ((sp = malloc(ss.sizeof_statistics)) == NULL){
+ fprintf(stderr, "malloc(%d) failed\n", ss.sizeof_statistics);
+ return 1;
+ }
+ if (get_staistics(sp, clear_flag) < 0){
+ perror("get_staistics()");
+ return 1;
+ }
+ printf("\nError counters:\n");
+ if (sp->e.non_existent_addr != 0)
+ printf("non_existent_addr: %ld\n", sp->e.non_existent_addr);
+ if (sp->e.page_gone_away != 0)
+ printf("page_gone_away: %ld\n", sp->e.page_gone_away);
+ if (sp->e.busy != 0)
+ printf("busy: %ld\n", sp->e.busy);
+ if (sp->e.bad_request != 0)
+ printf("bad_request: %ld\n", sp->e.bad_request);
+ if (sp->e.no_memory != 0)
+ printf("no_memory: %ld\n", sp->e.no_memory);
+ if (sp->e.page_type_not_supp != 0)
+ printf("page_type_not_supp: %ld\n", sp->e.page_type_not_supp);
+ if (sp->e.errors != 0)
+ printf("page errors: %ld\n", sp->e.errors);
+ printf("Total: %ld\n", sp->e.non_existent_addr +
+ sp->e.page_gone_away + sp->e.busy + sp->e.bad_request +
+ sp->e.no_memory + sp->e.page_type_not_supp + sp->e.errors);
+
+ printf("\n\tMigrated to:\n");
+ printf("From:\t");
+ for (to = 0; to < ss.max_nodes; to++)
+ printf("%d:%c", to, to < ss.max_nodes - 1 ? '\t' : '\n');
+ p = &sp->count[0][0];
+ for (from = 0; from < ss.max_nodes; from++){
+ printf("%d:\t", from);
+ for (to = 0; to < ss.max_nodes; p++, to++){
+ ok += *p;
+ if (from == to && *p == 0)
+ printf("-");
+ else
+ printf("%lu", *p);
+ printf("%c", to < ss.max_nodes - 1 ? '\t' : '\n');
+ }
+ }
+ printf("Total: %ld\n\n", ok);
+
+ div = sp->t.cyc_per_usec;
+ printf(" Clock ticks: Microsec: Millisec:\n");
+ printf("total: %12ld %10ld %8ld\n", CONV(sp->t.total));
+ printf("page_alloc: %12ld %10ld %8ld\n", CONV(sp->t.page_alloc));
+ printf("page_free: %12ld %10ld %8ld\n", CONV(sp->t.page_free));
+ printf("page_lock: %12ld %10ld %8ld\n", CONV(sp->t.page_lock));
+ printf("page_unlock: %12ld %10ld %8ld\n", CONV(sp->t.page_unlock));
+ printf("new_pg_unlock: %12ld %10ld %8ld\n", CONV(sp->t.new_page_unlock));
+ printf("validation: %12ld %10ld %8ld\n", CONV(sp->t.validation));
+ printf("pgd_scan: %12ld %10ld %8ld\n", CONV(sp->t.pgd_scan));
+ printf("pgd_lock: %12ld %10ld %8ld\n", CONV(sp->t.pgd_lock));
+ printf("pgd_unlock: %12ld %10ld %8ld\n", CONV(sp->t.pgd_unlock));
+ printf("mm_list_lock: %12ld %10ld %8ld\n", CONV(sp->t.mm_list_lock));
+ printf("mmap_sem: %12ld %10ld %8ld\n", CONV(sp->t.mmap_sem));
+ printf("pte_chain_lock: %12ld %10ld %8ld\n", CONV(sp->t.pte_chain_lock));
+ printf("find_vma: %12ld %10ld %8ld\n", CONV(sp->t.find_vma));
+ printf("flush_tlb: %12ld %10ld %8ld\n", CONV(sp->t.flush_tlb));
+ printf("add_lru: %12ld %10ld %8ld\n", CONV(sp->t.add_lru));
+ printf("copy: %12ld %10ld %8ld\n", CONV(sp->t.copy));
+ printf("upd_mmu_cache: %12ld %10ld %8ld\n", CONV(sp->t.update_mmu_cache));
+ time = sp->t.total - sp->t.page_alloc - sp->t.page_free -
+ sp->t.page_lock - sp->t.page_unlock - sp->t.new_page_unlock -
+ sp->t.validation - // sp->t.pgd_unlock - sp->t.mmap_sem -
+ // sp->t.pgd_scan - sp->t.pgd_lock - sp->t.mmlist_lock -
+ sp->t.pte_chain_lock - sp->t.find_vma - sp->t.flush_tlb -
+ sp->t.add_lru - sp->t.copy - sp->t.update_mmu_cache;
+ printf("Where is %12ld %10ld %8ld ?\n", CONV(time));
+
+ printf("cyc_per_usec: %11ld\n", sp->t.cyc_per_usec);
+
+ if (sp->c.pgd_scan != 0){
+ printf("\npgd_scan:\t\t%11ld\n", sp->c.pgd_scan);
+ printf("mm_hit:\t\t\t%11ld\nmiss:\t\t\t%11ld\n", sp->c.mm_hit,
+ sp->e.non_existent_addr + ok - sp->c.mm_hit);
+ }
+
+ if (sp->c.perfbullctl != 0){
+ printf("\npci_cfg_rd:\t%11ld\t %10ld\n", CONV(sp->t.pci_cfg_rd));
+ printf("pci_cfg_rd count:\t%11ld\n", sp->c.pci_cfg_rd);
+ printf("pci_cfg_wr:\t%11ld\t %10ld\n", CONV(sp->t.pci_cfg_wr));
+ printf("pci_cfg_wr count:\t%11ld\n", sp->c.pci_cfg_wr);
+ printf("perfbullctl:\t%11ld\t %10ld\n", CONV(sp->t.perfbullctl));
+ printf("perfbullctl count:\t%11ld\n", sp->c.perfbullctl);
+ }
+ return 0;
+}
+
diff -Nru 2.6.4.ref/test/ph.c 2.6.4.mig4/test/ph.c
--- 2.6.4.ref/test/ph.c Thu Jan 1 01:00:00 1970
+++ 2.6.4.mig4/test/ph.c Thu Mar 25 09:02:00 2004
@@ -0,0 +1,94 @@
+/*
+ * Demo: migrate some of our pages identified by their physical addresses.
+ */
+
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include "page_migrate.h"
+
+#if !defined(PAGE_SIZE)
+#define PAGE_SIZE (16 * 1024)
+#endif
+
+#define MMAPSIZE (1024 * 1024 * 256)
+
+phaddr_t address;
+extern int errno;
+phaddr_t table[PAGE_SIZE / sizeof(phaddr_t)];
+struct _un_success_count_ u_s;
+
+
+size_t
+fill(volatile void *p)
+{
+ size_t count = 123;
+ size_t i;
+
+ for (i = 0; i < count; i++, p += PAGE_SIZE){
+ * (unsigned long *) p = 0xdeadbeefL;
+ if ((address = gimme_a_ph_address((void *) p)) < 0)
+ break;
+ table[i] = address;
+ }
+ printf("# addresses: %d\n", i);
+ return i;
+}
+
+
+mig(volatile void *p, int node)
+{
+ int rc;
+ size_t count;
+
+ count = fill(p);
+ rc = migrate_ph_pages(table, count, node, &u_s, 0);
+ printf("\nmig(..., %d): rc = %ld errno = %d *p: 0x%lx\n", node, rc, errno,
+ * (unsigned long *) p);
+ printf("successful: %d failed: %d\n", u_s.successful, u_s.failed);
+ if (rc < 0){
+ perror("migrate_virt_addr_range()");
+ exit(-1);
+ }
+ address = gimme_a_ph_address((void *) p);
+ printf("\nmig(..., %d): ph address = 0x%016llx\n", node, address);
+ if (address < 0){
+ perror("gimme_a_ph_address()");
+ exit(-1);
+ }
+}
+
+
+main()
+{
+ volatile void *p;
+
+ p = mmap(NULL, MMAPSIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (p == MAP_FAILED){
+ perror("\nmmap()");
+ return 1;
+ }
+ /*
+ * No backing page => should fail.
+ */
+ printf("\nmain(): ph address = 0x%llx\n", address);
+ * (unsigned long *) p = 0xdeadbeef03L;
+ /*
+ * Now there should be a backing page.
+ */
+ address = gimme_a_ph_address((void *) p);
+ printf("\nmain(): ph address = 0x%016llx\n", address);
+ if (address < 0){
+ perror("gimme_a_ph_address()");
+ return 1;
+ }
+ mig(p, 0);
+ mig(p, 1);
+ mig(p, 2);
+ mig(p, 3);
+ return 0;
+}
+
diff -Nru 2.6.4.ref/test/v.c 2.6.4.mig4/test/v.c
--- 2.6.4.ref/test/v.c Thu Jan 1 01:00:00 1970
+++ 2.6.4.mig4/test/v.c Thu Mar 25 09:02:00 2004
@@ -0,0 +1,78 @@
+/*
+ * Demo: migrate some of its own virtual address range.
+ */
+
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include "page_migrate.h"
+
+#define MMAPSIZE (1024 * 1024 * 256)
+
+phaddr_t address;
+extern int errno;
+
+struct _un_success_count_ u_s;
+
+
+mig(volatile void *p, int node)
+{
+ int rc;
+
+ rc = migrate_virt_addr_range((caddr_t) p, MMAPSIZE, node, &u_s, 0);
+ printf("\nmig(..., %d): rc = %ld errno = %d *p: 0x%lx\n", node, rc, errno,
+ * (unsigned long *) p);
+ printf("successful: %d failed: %d\n", u_s.successful, u_s.failed);
+ if (rc < 0){
+ perror("migrate_virt_addr_range()");
+ exit(-1);
+ }
+ address = gimme_a_ph_address((void *) p);
+ printf("\nmig(..., %d): ph address = 0x%016llx\n", node, address);
+ if (address < 0){
+ perror("gimme_a_ph_address()");
+ exit(-1);
+ }
+}
+
+
+main()
+{
+ volatile void *p0, *p;
+
+ p0 = p = mmap(NULL, MMAPSIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (p == MAP_FAILED){
+ perror("\nmmap()");
+ return 1;
+ }
+ /*
+	 * Make sure 2 pages exist.
+ */
+ * (unsigned long *) p = 0xdeadbeef01L;
+ p += 1024 * 16;
+ * (unsigned long *) p = 0xdeadbeef02L;
+ address = gimme_a_ph_address((void *) p);
+ /*
+ * No backing page => should fail.
+ */
+ p += 1024 * 64;
+ printf("\nmain(): ph address = 0x%llx\n", address);
+ * (unsigned long *) p = 0xdeadbeef03L;
+ /*
+ * Now there should be a backing page.
+ */
+ address = gimme_a_ph_address((void *) p);
+ printf("\nmain(): ph address = 0x%016llx\n", address);
+ if (address < 0){
+ perror("gimme_a_ph_address()");
+ return 1;
+ }
+ mig(p0, 0);
+ mig(p0, 1);
+ mig(p0, 2);
+ mig(p0, 3);
+ return 0;
+}
diff -Nru 2.6.4.ref/test/victim.c 2.6.4.mig4/test/victim.c
--- 2.6.4.ref/test/victim.c Thu Jan 1 01:00:00 1970
+++ 2.6.4.mig4/test/victim.c Thu Mar 25 09:02:00 2004
@@ -0,0 +1,36 @@
+/*
+ * Victim process for "vmig".
+ */
+
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#define MMAPSIZE (1024 * 1024 * 1024L)
+#define N MMAPSIZE / sizeof(long)
+
+
+main()
+{
+ int i;
+ volatile long *p0, *p;
+ long sum0, sum;
+
+ printf("victim: pid = %d\n", getpid());
+ p0 = p = mmap(NULL, MMAPSIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (p == MAP_FAILED){
+ perror("\nmmap()");
+ return 1;
+ }
+ printf("address: %p, size: 0x%lx\n", p, MMAPSIZE);
+ for (i = 0, sum0 = 0; i < N; i++)
+ sum0 += *p++ = random();
+ do {
+ for (i = 0, sum = 0, p = p0; i < N; i++)
+ sum += *p++;
+ printf("\nvictim: pid = %d, sum: %ld\n", getpid(), sum);
+ } while (sum0 == sum);
+}
diff -Nru 2.6.4.ref/test/vmig.c 2.6.4.mig4/test/vmig.c
--- 2.6.4.ref/test/vmig.c Thu Jan 1 01:00:00 1970
+++ 2.6.4.mig4/test/vmig.c Thu Mar 25 09:02:00 2004
@@ -0,0 +1,36 @@
+/*
+ * Migrate the victim process by hand.
+ */
+
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include "page_migrate.h"
+
+// Who cares for the SH library ?
+#define SH_ADDRESS (2UL << 60)
+#define SH_SIZE (16UL * 1024 * 1024 * 1024 * 1024)
+
+struct _un_success_count_ u_s;
+
+main(const int argc, const char * const argv[])
+{
+ int node;
+ pid_t pid;
+ int rc;
+
+ if (argc != 3){
+ fprintf(stderr, "usage: vmig <pid> <node>\n");
+ return 1;
+ }
+ pid = atoi(argv[1]);
+ node = atoi(argv[2]);
+ rc = migrate_virt_addr_range((caddr_t) SH_ADDRESS, SH_SIZE, node, &u_s, pid);
+ if (rc < 0)
+ perror("migrate_virt_addr_range()");
+ else
+ printf("successful: %d failed: %d\n", u_s.successful, u_s.failed);
+ return 0;
+}


Attachments:
mig-2.6.4-bk4-2004-march-25 (64.20 kB)

2004-03-26 17:21:26

by Dave Hansen

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Have you considered any common ground your patch might share with the
people doing memory hotplug?

http://people.valinux.co.jp/~iwamoto/mh.html

They have a similar problem to your migration that occurs when a user
wants to remove a whole or partial NUMA node.
[email protected]

Is your code something that you'd like to see go into the mainline 2.6
or 2.7 kernel?

Also, please don't spam-encode your address when sending to the list.
It just makes it harder for people to send feedback.

-- Dave

2004-03-30 08:28:49

by IWAMOTO Toshihiro

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Hi Zoltan,

At Fri, 26 Mar 2004 09:20:46 -0800,
Dave Hansen wrote:
>
> Have you considered any common ground your patch might share with the
> people doing memory hotplug?
>
> http://people.valinux.co.jp/~iwamoto/mh.html
>
> They have a similar problem to your migration that occurs when a user
> wants to remove a whole or partial NUMA node.
> [email protected]

Processes must be migrated to other nodes when a node is being
removed. Conversely, processes may be migrated from other nodes when
a node is added. I'm not familiar with NUMA things, and I think our
team doesn't have a particular solution. If you have some idea,
that's great.

BTW, it seems page migration can use my remap_onepage function. Our
code can move most kinds of pages including hugetlbfs pages and page
caches.

--
IWAMOTO Toshihiro

2004-03-30 09:04:37

by Hirokazu Takahashi

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Hello,

> > Have you considered any common ground your patch might share with the
> > people doing memory hotplug?
> >
> > http://people.valinux.co.jp/~iwamoto/mh.html
> >
> > They have a similar problem to your migration that occurs when a user
> > wants to remove a whole or partial NUMA node.
> > [email protected]
>
> Processes must be migrated to other nodes when a node is being
> removed. Conversely, processes may be migrated from other nodes when
> a node is added. I'm not familiar with NUMA things, and I think our
> team doesn't have a particular solution. If you have some idea,
> that's great.
>
> BTW, it seems page migration can use my remap_onepage function. Our
> code can move most kinds of pages including hugetlbfs pages and page
> caches.

I believe his patch will interest you since most of the code is
independent of cpu architecture and it also covers mmaped files,
shmem, ramdisk, mlocked pages and so on.

We will post new version of the memory hotplug patches in a week.

Thank you,
Hirokazu Takahashi.

2004-03-30 11:20:11

by Zoltan Menyhart

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Hirokazu Takahashi wrote:
>
> Hello,
>
> > > Have you considered any common ground your patch might share with the
> > > people doing memory hotplug?
> > >
> > > http://people.valinux.co.jp/~iwamoto/mh.html
> > >
> > > They have a similar problem to your migration that occurs when a user
> > > wants to remove a whole or partial NUMA node.
> > > [email protected]
> >
> > Processes must be migrated to other nodes when a node is being
> > removed. Conversely, processes may be migrated from other nodes when
> > a node is added. I'm not familiar with NUMA things, and I think our
> > team doesn't have a particular solution. If you have some idea,
> > that's great.
> >
> > BTW, it seems page migration can use my remap_onepage function. Our
> > code can move most kinds of pages including hugetlbfs pages and page
> > caches.
>
> I believe his patch will interest you since most of the code is
> independent of cpu architecture and it also covers mmaped files,
> shmem, ramdisk, mlocked pages and so on.
>
> We will post new version of the memory hotplug patches in a week.
>
> Thank you,
> Hirokazu Takahashi.

I am afraid the "remap_onepage()" function + the modifications necessary
at some other places are too much for me :-)

You do a couple of retries and waits. I cannot afford to spend that much
overhead for something that is just a performance optimization.

I can understand that if you want to remove a node / memory module, then you
have to succeed by all means, you have to handle all kinds of pages,
the performance is not at a premium.

Regards,

Zoltán Menyhárt

2004-03-30 11:38:09

by Zoltan Menyhart

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Dave Hansen wrote:
>
> Have you considered any common ground your patch might share with the
> people doing memory hotplug?
>
> http://people.valinux.co.jp/~iwamoto/mh.html
>
> They have a similar problem to your migration that occurs when a user
> wants to remove a whole or partial NUMA node.
> [email protected]

Comparing my stuff to their work, I just do some small performance enhancements:

- I do not modify a single line on the existing VM paths - if my stuff has no
improvement for you, then you will not be obliged to pay any overhead
- I do not insist on succeeding by all means :-)) ... that would block the execution of the application
while the resources are not available
- I handle only the simplest case: private anonymous pages (...a single PTE...)

- IWAMOTO Toshihiro provides a complete "fool proof" solution with an obligation to
succeed in the migration

> Is your code something that you'd like to see go into the mainline 2.6
> or 2.7 kernel?

Since someone is asking...

Thanks,

Zoltán

2004-03-30 12:07:15

by Hirokazu Takahashi

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Hello,

Zoltán Menyhárt wrote:

> > > > Have you considered any common ground your patch might share with the
> > > > people doing memory hotplug?
> > > >
> > > > http://people.valinux.co.jp/~iwamoto/mh.html
> > > >
> > > > They have a similar problem to your migration that occurs when a user
> > > > wants to remove a whole or partial NUMA node.
> > > > [email protected]
> > >
> > > Processes must be migrated to other nodes when a node is being
> > > removed. Conversely, processes may be migrated from other nodes when
> > > a node is added. I'm not familiar with NUMA things, and I think our
> > > team doesn't have a particular solution. If you have some idea,
> > > that's great.
> > >
> > > BTW, it seems page migration can use my remap_onepage function. Our
> > > code can move most kinds of pages including hugetlbfs pages and page
> > > caches.
> >
> > I believe his patch will interest you since most of the code is
> > independent of cpu architecture and it also covers mmaped files,
> > shmem, ramdisk, mlocked pages and so on.
> >
> > We will post new version of the memory hotplug patches in a week.
> >
> > Thank you,
> > Hirokazu Takahashi.
>
> I am afraid the "remap_onepage()" function + the modifications necessary
> at some other places are too much for me :-)
>
> You do a couple of retries and waits. I cannot afford to spend that much
> overhead for something that is just a performance optimization.

I understand what you want to do. Page migration is meaningless if the
cost of it is high.

> I can understand that if you want to remove a node / memory module, then you
> have to succeed by all means, you have to handle all kinds of pages,
> the performance is not at a premium.
>
> Regards,
>
> Zoltán Menyhárt

It's not hard to add a "no-retry-mode" to the "remap_onepage()" function
if you want. It may skip migrating some pages if they are accessed
heavily. In particular, if you only want to care about anonymous pages,
they will be handled very well.

Thank you,
Hirokazu Takahashi.

2004-03-30 14:31:27

by Zoltan Menyhart

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Hirokazu Takahashi wrote:

[...]
>
> It's not hard to add a "no-retry-mode" to the "remap_onepage()" function
> if you want. It may skip migrating some pages if they are accessed
> heavily. In particular, if you only want to care about anonymous pages,
> they will be handled very well.

Well, why not give it a try?
Yet your code is not really easy to read. :-)
I do not dare to adapt it on my own; I am afraid of breaking something.
Could you please provide me with a modified version of your "remap_onepage()"?
Can we move to 2.6.4?

In addition to "no-retry-mode", I need to specify where the new page
should be allocated from.

Here is my interface I need to implement with "remap_onepage()":

/*
* Common part of checking & migrating the pages one by one.
*
* Arguments: src_node: Source NUMA node
* old_p: -> old page structure
* node: Destination NUMA node
* mm: -> victim "mm_struct"
* pte: -> PTE of the page to be moved
*
* Returns: 1: Migration O. K.
* 0: Minor error, no actual migration has been done
* -Exxx: Catastrophic error
*
* Notes: - "mm->page_table_lock" and "mm->mmap_sem" have to be held.
* - The old page is "get_page()"-ed on entry to make sure it does not go
* away in the mean time - on return it gets "put_page()"-ed.
*/
int
common_check_migrate_1_page(const int src_node, struct page * const old_p,
const int node, struct mm_struct * const mm, pte_t * const pte)

Notes: "pte" can be NULL if I do not know it a priori
       I cannot release "mm->page_table_lock"; otherwise I have to re-scan the "mm->pgd".
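
For illustration only, a hypothetical caller of this interface would look roughly
like the fragment below (2.6-era primitives; whether "mmap_sem" is taken for read
or for write here is an assumption, and the page-table walk that produced "page"
and "pte" is left out):

	down_read(&mm->mmap_sem);		/* assumption: a read hold is enough	*/
	spin_lock(&mm->page_table_lock);

	/* ... walk mm->pgd, find "page" and its "pte" ... */

	get_page(page);				/* pinned, as required on entry		*/
	rc = common_check_migrate_1_page(src_node, page, dst_node, mm, pte);
	/* the callee does the matching put_page() before returning		*/

	spin_unlock(&mm->page_table_lock);
	up_read(&mm->mmap_sem);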

Thanks,

Zoltán Menyhárt

2004-03-30 15:20:30

by Dave Hansen

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

On Tue, 2004-03-30 at 03:39, Zoltan Menyhart wrote:
> Dave Hansen wrote:
> >
> > Have you considered any common ground your patch might share with the
> > people doing memory hotplug?
>
> Comparing my stuff to their work, I just do some small performance enhancements:
>
> - I do not modify a single line on the existing VM paths - if my stuff has no
> improvement for you, then you will not be obliged to pay any overhead
...
> - I handle only the simplest case: private anonymous pages (...a single PTE...)

By not modifying a single line in the existing VM path, your patch
simply duplicates functionality from that existing code, which I'm not
sure is any better.

I think there's a lot of commonality with what the swap code, NUMA page
migration, and memory removal have to do. However, none of them share
any code today. I think all of the implementations could benefit from
making them a bit more generic.

-- Dave

2004-03-30 15:59:38

by Dave Hansen

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

On Tue, 2004-03-30 at 03:39, Zoltan Menyhart wrote:
> Dave Hansen wrote:
> > Is your code something that you'd like to see go into the mainline 2.6
> > or 2.7 kernel?
>
> Since someone is asking...

Before anything else, please take a long look at
Documentation/CodingStyle. Pay particular attention to the column
width, indenting, and function sections.

One of the best things about your code is that it uses a lot of
architecture-independent functions and data structures. The page table
walks in your patch, for instance, would work on any Linux
architecture. However, all of this code is in the ia64 arch. Why? Will
other NUMA architectures not need this page migration functionality?

It's great that you are commenting so many things, but normal Linux
style is to use C-style comments, not C++. Also, although it's great
while you're developing a patch, it's best to try and refrain from
documenting in comments things that are already a non-tricky part
of the way things work:
//
// "pte_page->mapping" points at the victim process'es "mm_struct"
//
These comments really just take up space and reduce readability.

I find the comments inside of function and macro calls a bit hard to
read:
STORE_DELAY(/* in */ unlock_time, /* out */ new_page_unlock);


void
dump_mm(const struct mm_struct * const mm)
{
...

void
dump_vma(const struct vm_area_struct * const vma)
{


I think every VM hacker has a couple of these functions stashed around
in various patches for debugging, but they're not really something that
belongs in the kernel. In general, you should try to remove debugging
code before posting a patch.


+ case _SIZEOF_STATISTICS_:
+ rc = *(long long *) &_statistics_sizes;
+ break;

I'm sure the statistics are very important, but they're a bit
intrusive. Can you separate that code out into a file by itself? Are
they even something that a user would want when they're running
normally, or is it another debugging feature?

+#if defined(CONFIG_NUMA)
+ data8 sys_page_migrate // 1276: Migrate pages
to another NUMA node
+#else
data8 sys_ni_syscall
+#endif

See cond_syscall. Basically you declare a weak symbol and override it
later if necessary.
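
For reference, cond_syscall() in the 2.6 tree is roughly the following; the
sys_page_migrate line is only a hypothetical example of how this patch would use it:

	/* include/linux/linkage.h (2.6-era, roughly): a weak alias that resolves
	 * to sys_ni_syscall unless a real implementation is linked in. */
	#define cond_syscall(x) asmlinkage long x(void) \
		__attribute__((weak, alias("sys_ni_syscall")));

	/* kernel/sys.c -- hypothetical entry for this patch: */
	cond_syscall(sys_page_migrate)

The syscall-table slot in entry.S can then name sys_page_migrate unconditionally;
if migrate.o is not built, the weak alias falls back to sys_ni_syscall.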

+obj-$(CONFIG_NUMA) += numa.o migrate.o

Can you separate this out under its own config option?


+asmlinkage long long
+sys_page_migrate(const int cmd, const caddr_t address, const size_t.
...
+ switch (cmd){
...
+ case _PHADDR_BATCH_MIGRATE_:
...
+ case _VA_RANGE_MIGRATE_:
...
+ case _STATISTICS_:
...
+ case _GIMME_AN_ADDRESS_:

This smells strongly of an ioctl. If there really are 2 distinct kinds
of memory removal operations, then go ahead and make 2 different
syscalls. As for the _STATISTICS_ and _GIMME_AN_ADDRESS_, they really
shouldn't be there at all. They're just abusing the syscall.

+migrate_virt_addr_range(
...
+ u.ll = syscall(__NR_page_migrate, _VA_RANGE_MIGRATE_,
+ address, length, node, pid);
...
+}

Making syscalls from inside of the kernel is strongly discouraged. I'm
not sure what you're trying to do there. You might want to look at some
existing code like sys_mmap() vs do_mmap().
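
A sketch of the split Dave means (the names below are illustrative, not the
patch's actual entry points):

	/* in-kernel worker: other kernel code calls this directly */
	long do_page_migrate(unsigned long addr, size_t len, int node);

	/* thin user-visible wrapper, analogous to sys_mmap() vs. do_mmap() */
	asmlinkage long
	sys_page_migrate(unsigned long addr, size_t len, int node)
	{
		return do_page_migrate(addr, len, node);
	}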

+#define __VA(pgdi, pmdi, ptei) (((pgdi) >> (PAGE_SHIFT - 6)) << 61 | \
+ ((pgdi) & ((PTRS_PER_PGD >> 3) - 1)) << PGDIR_SHIFT | \
+ (pmdi) << PMD_SHIFT | (ptei) << PAGE_SHIFT)

There are magic numbers galore in this macro. Would this work?

#define __VA(pgdi, pmdi, ptei) ((pgdi)*PGDIR_SIZE + \
(pmdi)*PMD_SIZE + \
(ptei)*PAGE_SIZE)
If ia64 doesn't have the _SIZE macros, you can just copy them from
include/asm-i386/pgtable*.h
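
For reference, the i386 definitions he is pointing at are essentially:

	/* include/asm-i386/pgtable.h (2.6-era): */
	#define PMD_SIZE	(1UL << PMD_SHIFT)
	#define PMD_MASK	(~(PMD_SIZE - 1))
	#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
	#define PGDIR_MASK	(~(PGDIR_SIZE - 1))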

-- 2.6.4.ref/mm/rmap.c Tue Mar 16 10:18:17 2004
+++ 2.6.4.mig4/mm/rmap.c Thu Mar 25 09:00:13 2004
...
-struct pte_chain {
- unsigned long next_and_idx;
- pte_addr_t ptes[NRPTE];
-} ____cacheline_aligned;
-- 2.6.4.ref/include/asm-ia64/rmap-locking.h


Exposing the VM internals like that probably isn't going to be
acceptable. Why was this necessary?

--- 2.6.4.ref/test/vmig.c Thu Jan 1 01:00:00 1970
+++ 2.6.4.mig4/test/vmig.c Thu Mar 25 09:02:00 2004

If you need userspace code to demonstrate how to use your patch, it's
probably best to post it separately instead of including it in the
patch. Someone might mistake it for kernel code.

I'm sure I missed some things, but it's hard to look at the patch in
depth functionally before it is cleaned up a bit.

I look forward to seeing an updated version.

-- Dave

2004-03-30 16:37:55

by Dave Hansen

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

On Tue, 2004-03-30 at 07:58, Dave Hansen wrote:
> I'm sure I missed some things, but it's hard to look at the patch in
> depth functionally before it is cleaned up a bit.

One thing I forgot...

There don't appear to be any security checks in your syscall. Should
all users be allowed to migrate memory around at will from any pid?

-- Dave

2004-04-03 02:58:04

by Hirokazu Takahashi

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Hello,

>> It's not hard to add a "no-retry-mode" to the "remap_onepage()" function
>> if you want. It may skip migrating some pages if they are accessed
>> heavily. In particular, if you only want to care about anonymous pages,
>> they will be handled very well.
>
>Well, why not give it a try?
>Yet your code is not really easy to read. :-)
>I do not dare to adapt it on my own; I am afraid of breaking something.
>Could you please provide me with a modified version of your "remap_onepage()"?
>Can we move to 2.6.4?

Iwamoto and I are working on this. We'll post it soon.

>In addition to "no-retry-mode", I need to specify where the new page
>should be allocated from.
>
>Here is my interface I need to implement with "remap_onepage()":

I guess arguments src_node, mm and pte would be redundant since
they can be looked up from old_p with the reverse mapping scheme.

>/*
> * Common part of checking & migrating the pages one by one.
> *
> * Arguments: src_node: Source NUMA node
> * old_p: -> old page structure
> * node: Destination NUMA node
> * mm: -> victim "mm_struct"
> * pte: -> PTE of the page to be moved
> *
> * Returns: 1: Migration O. K.
> * 0: Minor error, no actual migration has been done
> * -Exxx: Catastrophic error
> *
> * Notes: - "mm->page_table_lock" and "mm->mmap_sem" have to be held.
> * - The old page is "get_page()"-ed on entry to make sure it does not go
> * away in the mean time - on return it gets "put_page()"-ed.
> */
>int
>common_check_migrate_1_page(const int src_node, struct page * const old_p,
> const int node, struct mm_struct * const mm, pte_t * const pte)
>
>Notes: "pte" can be NULL if I do not know it a priori
>       I cannot release "mm->page_table_lock"; otherwise I have to re-scan the "mm->pgd".

A re-scan policy would be much better since migrating pages is heavy work.
I don't think that holding mm->page_table_lock for a long time would be
a good idea.

What do you think about the following algorithm (a rough sketch follows the list):
1. get mm->page_table_lock
2. choose some pages.
3. release mm->page_table_lock
4. call remap_onepage() against each page.
5. go to step 1 if there remain pages to be migrated.
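
A minimal sketch of that loop, assuming a hypothetical select_pages_to_migrate()
helper and treating remap_onepage() purely schematically (its real signature lives
in the hotplug patch, not here):

	#define NR_BATCH	64	/* arbitrary batch size for the sketch */

	/* hypothetical: pick up to "max" candidate pages while the lock is held */
	int select_pages_to_migrate(struct mm_struct *mm, struct page **batch, int max);

	static int migrate_some_pages(struct mm_struct *mm, int dst_node)
	{
		struct page *batch[NR_BATCH];
		int n, i, moved = 0;

		do {
			spin_lock(&mm->page_table_lock);			/* step 1 */
			n = select_pages_to_migrate(mm, batch, NR_BATCH);	/* step 2 */
			spin_unlock(&mm->page_table_lock);			/* step 3 */

			for (i = 0; i < n; i++)					/* step 4 */
				if (remap_onepage(batch[i], dst_node) == 0)
					moved++;
		} while (n == NR_BATCH);					/* step 5 */

		return moved;
	}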

2004-04-05 15:07:56

by Zoltan Menyhart

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Hirokazu Takahashi wrote:

> I guess arguments src_node, mm and pte would be redundant since
> they can be looked up from old_p with the reverse mapping scheme.

In my version 0.2, I can do with only the following arguments:
* node: Destination NUMA node
* mm: -> victim "mm_struct"
* pte: -> PTE of the page to be moved
(If I have "mm" at hand, why not use it? Why not avoid fetching the r-map
page struct?)

> >Notes: "pte" can be NULL if I do not know it a priori
> >       I cannot release "mm->page_table_lock"; otherwise I have to re-scan the "mm->pgd".
>
> A re-scan policy would be much better since migrating pages is heavy work.
> I don't think that holding mm->page_table_lock for a long time would be
> a good idea.

Re-scanning is "cache killer", at least on IA64 with huge user memory size.
I have more than 512 Mbytes user memory and its PTEs do not fit into the L2 cache.

In my current design, I have the outer loops: PGD, PMD and PTE walking; and once
I find a valid PTE, I check it against the list of max. 2048 physical addresses as
the inner loop.
I reversed them: walking through the list of max. 2048 physical addresses as outer
loop and the PGD - PMD - PTE scans as inner loops resulted in 4 to 5 times slower
migration.
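
Schematically, the loop nesting described above is the following (2.6-era walk;
locking, huge pages and the actual migration call are omitted, and "table" /
"n_addrs" stand for the user-supplied list of physical addresses):

	static void scan_vma(struct mm_struct *mm, struct vm_area_struct *vma,
			     const unsigned long *table, int n_addrs)
	{
		unsigned long addr;
		int i;

		for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
			pgd_t *pgd = pgd_offset(mm, addr);	/* outer: page tables */
			pmd_t *pmd;
			pte_t *pte;

			if (pgd_none(*pgd))
				continue;
			pmd = pmd_offset(pgd, addr);
			if (pmd_none(*pmd))
				continue;
			pte = pte_offset_map(pmd, addr);
			if (pte_present(*pte)) {
				unsigned long pa = pte_pfn(*pte) << PAGE_SHIFT;

				for (i = 0; i < n_addrs; i++)	/* inner: the address list */
					if (table[i] == pa) {
						/* ... migrate this page ... */
						break;
					}
			}
			pte_unmap(pte);
		}
	}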

> What do you think about the following algorithm:
> 1. get mm->page_table_lock
> 2. choose some pages.
> 3. release mm->page_table_lock
> 4. call remap_onepage() against each page.
> 5. go to step 1 if there remain pages to be migrated.

I want to move the most frequently used pages - at least with the HW assisted
hot page detection.
I take "mm->page_table_lock", I nuke the PTE. We've got a good chance that the CPU
using the page observes a page fault almost immediately. It enters the page fault
handler and gets blocked by "mm->page_table_lock". If I released the lock, the CPU
could continue and realize that there is nothing to do, the page fault has already
been repaired. In the meantime, I am the one waiting for "mm->page_table_lock".
At worst this scenario happens 2048 times.
If I keep the lock, the victim CPU enters the page fault handler only once.

I think what we should do is to "pull" pages into a node rather than
"push them out", for two reasons:
- the recipient CPU executes the migration instead of busy-waiting for the lock
- there is a chance that the recipient CPU will find the migrated data useful
in its cache

Regards,

Zoltán Menyhárt

2004-04-05 15:41:32

by Dave Hansen

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

On Mon, 2004-04-05 at 08:07, Zoltan Menyhart wrote:
> Hirokazu Takahashi wrote:
>
> > I guess arguments src_node, mm and pte would be redundant since
> > they can be looked up from old_p with the reverse mapping scheme.
>
> In my version 0.2, I can do with only the following arguments:
> * node: Destination NUMA node
> * mm: -> victim "mm_struct"
> * pte: -> PTE of the page to be moved
> (If I have "mm" at hand, why not use it? Why not avoid fetching the r-map
> page struct?)

That's a good point. There is at least some cost (at least 1 lock)
associated with walking the rmap chains. If it can be avoided, it might
as well be.

But, if someone needs the "no walk" interface, just wrap the function:

foo(page)
{
rmap_results = get_rmap_stuff(page);
__foo(page, rmap_results);
}

__foo(page, rmap_results)
{
...
}

> > >Notes: "pte" can be NULL if I do not know it a priori
> > >       I cannot release "mm->page_table_lock"; otherwise I have to re-scan the "mm->pgd".
> >
> > A re-scan policy would be much better since migrating pages is heavy work.
> > I don't think that holding mm->page_table_lock for a long time would be
> > a good idea.
>
> Re-scanning is "cache killer", at least on IA64 with huge user memory size.
> I have more than 512 Mbytes user memory and its PTEs do not fit into the L2 cache.
>
> In my current design, I have the outer loops: PGD, PMD and PTE walking; and once
> I find a valid PTE, I check it against the list of max. 2048 physical addresses as
> the inner loop.
> I reversed them: walking through the list of max. 2048 physical addresses as outer
> loop and the PGD - PMD - PTE scans as inner loops resulted in 4 to 5 times slower
> migration.

Could you explain where you're getting these "magic numbers?" I don't
quite understand the significance of 2048 physical addresses or 512 MB
of memory.

Zoltan, it appears that we have a bit of an inherent conflict with how
much CPU each of you is expecting to use in the removal and migration
cases. You're coming from an HPC environment where each CPU cycle is
valuable, while the people trying to remove memory are probably going to
be taking CPUs offline soon anyway, and care a bit less about how
efficient they're being with CPU and cache resources.

Could you be a bit more explicit about how expensive (cpu-wise) these
migrate operations can be?

-- Dave

2004-04-08 13:32:43

by Hirokazu Takahashi

[permalink] [raw]
Subject: Re: Migrate pages from a ccNUMA node to another - patch

Hello,

> > I guess arguments src_node, mm and pte would be redundant since
> > they can be looked up from old_p with the reverse mapping scheme.
>
> In my version 0.2, I can do with only the following arguments:
> * node: Destination NUMA node
> * mm: -> victim "mm_struct"
> * pte: -> PTE of the page to be moved
> (If I have "mm" at hand, why not use it? Why not avoid fetching the r-map
> page struct?)
>
> > >Notes: "pte" can be NULL if I do not know it a priori
> > >       I cannot release "mm->page_table_lock"; otherwise I have to re-scan the "mm->pgd".
> >
> > A re-scan policy would be much better since migrating pages is heavy work.
> > I don't think that holding mm->page_table_lock for a long time would be
> > a good idea.
>
> Re-scanning is "cache killer", at least on IA64 with huge user memory size.
> I have more than 512 Mbytes user memory and its PTEs do not fit into the L2 cache.
>
> In my current design, I have the outer loops: PGD, PMD and PTE walking; and once
> I find a valid PTE, I check it against the list of max. 2048 physical addresses as
> the inner loop.
> I reversed them: walking through the list of max. 2048 physical addresses as outer
> loop and the PGD - PMD - PTE scans as inner loops resulted in 4 to 5 times slower
> migration.

I've been thinking about it.

I guess our page remap patches would be overkill for your purpose.
The point of our patches is that they:
1. Block new access to a specified page.
2. Wait for the page to go into a quiescent state.
3. Copy the data from the page to a new page and exchange them.

In my understanding you want to handle only anonymous pages
which don't have a backing store yet. This means that you only need
step 3.
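
As a rough illustration, "step 3 only" for a single, already-quiescent private
anonymous page could look like this in 2.6-era terms (the function name is made
up; error handling, the pte_chain/rmap bookkeeping and any statistics are left out):

	static int copy_one_anon_page(struct vm_area_struct *vma, unsigned long addr,
				      pte_t *ptep, struct page *old, int dst_node)
	{
		struct page *new;
		pte_t old_pte, new_pte;

		new = alloc_pages_node(dst_node, GFP_HIGHUSER, 0);
		if (new == NULL)
			return -ENOMEM;

		old_pte = ptep_get_and_clear(ptep);	/* unmap the old page	*/
		flush_tlb_page(vma, addr);

		copy_highpage(new, old);		/* move the data	*/

		new_pte = mk_pte(new, vma->vm_page_prot);
		if (pte_dirty(old_pte))
			new_pte = pte_mkdirty(new_pte);
		if (pte_write(old_pte))
			new_pte = pte_mkwrite(new_pte);
		set_pte(ptep, new_pte);
		update_mmu_cache(vma, addr, new_pte);

		lru_cache_add_active(new);		/* let the VM see it	*/
		put_page(old);				/* drop the old mapping's reference */
		return 0;
	}

In the migration patch the quiescing role of steps 1 and 2 is played by nuking
the PTE while "mm->page_table_lock" is held, as described later in this thread.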

> > What do you think about the following algorithm:
> > 1. get mm->page_table_lock
> > 2. choose some pages.
> > 3. release mm->page_table_lock
> > 4. call remap_onepage() against each page.
> > 5. go to step 1 if there remain pages to be migrated.
>
> I want to move the most frequently used pages - at least with the HW assisted
> hot page detection.
> I take "mm->page_table_lock", I nuke the PTE. We've got a good chance that the CPU
> using the page observes a page fault almost immediately. It enters the page fault
> handler and gets blocked by "mm->page_table_lock". If I released the lock, the CPU
> could continue and realize that there is nothing to do, the page fault has already
> been repaired. In the meantime, I am the one waiting for "mm->page_table_lock".

If you use the HW assisted hot page detection, just notify our remapping
functions of hot pages directly. Everything would be handled well, and
the page fault handler would be blocked by the PG_locked bit.