2002-11-13 23:39:29

by Benjamin LaHaise

[permalink] [raw]
Subject: [patch] remove hugetlb syscalls

Hello,

Since the functionality of the hugetlb syscalls is now available via
hugetlbfs with better control over permissions, could you apply the
following patch that gets rid of a lot of duplicate and unnescessary
code by removing the two hugetlb syscalls? This patch was only tested
on x86, so if people could take a look over it and see if I missed
anything I'd appreciate it. Thanks,

-ben
--
"Do you seek knowledge in time travel?"


arch/i386/kernel/entry.S | 4
arch/i386/kernel/sys_i386.c | 67 -----------
arch/i386/mm/hugetlbpage.c | 218 ---------------------------------------
arch/ia64/kernel/entry.S | 4
arch/ia64/kernel/sys_ia64.c | 75 -------------
arch/ia64/mm/hugetlbpage.c | 199 -----------------------------------
arch/sparc64/kernel/sys_sparc.c | 97 -----------------
arch/sparc64/kernel/systbls.S | 4
include/asm-alpha/unistd.h | 2
include/asm-i386/unistd.h | 2
include/asm-ia64/unistd.h | 2
include/asm-ppc/unistd.h | 2
include/asm-ppc64/unistd.h | 2
include/asm-sparc/unistd.h | 2
include/asm-sparc64/unistd.h | 2
include/asm-x86_64/ia32_unistd.h | 2
17 files changed, 6 insertions(+), 678 deletions(-)

:r v2.5.47-mm2-hugetlbcleanup.diff
diff -urN v2.5.47-mm2/arch/i386/kernel/entry.S v2.5.47-mm2-hugetlbcleanup/arch/i386/kernel/entry.S
--- v2.5.47-mm2/arch/i386/kernel/entry.S Wed Nov 13 18:12:15 2002
+++ v2.5.47-mm2-hugetlbcleanup/arch/i386/kernel/entry.S Wed Nov 13 18:17:05 2002
@@ -760,8 +760,8 @@
.long sys_io_getevents
.long sys_io_submit
.long sys_io_cancel
- .long sys_alloc_hugepages /* 250 */
- .long sys_free_hugepages
+ .long sys_ni_syscall /* 250 */
+ .long sys_ni_syscall
.long sys_exit_group
.long sys_lookup_dcookie
.long sys_epoll_create
diff -urN v2.5.47-mm2/arch/i386/kernel/sys_i386.c v2.5.47-mm2-hugetlbcleanup/arch/i386/kernel/sys_i386.c
--- v2.5.47-mm2/arch/i386/kernel/sys_i386.c Wed Nov 13 18:09:55 2002
+++ v2.5.47-mm2-hugetlbcleanup/arch/i386/kernel/sys_i386.c Wed Nov 13 18:21:24 2002
@@ -9,7 +9,6 @@
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/mm.h>
-#include <linux/hugetlb.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/sem.h>
@@ -248,69 +247,3 @@
return error;
}

-#ifdef CONFIG_HUGETLB_PAGE
-/* get_addr function gets the currently unused virtaul range in
- * current process's address space. It returns the HPAGE_SIZE
- * aligned address (in cases of success). Other kernel generic
- * routines only could gurantee that allocated address is PAGE_SIZE aligned.
- */
-static unsigned long get_addr(unsigned long addr, unsigned long len)
-{
- struct vm_area_struct *vma;
- if (addr) {
- addr = (addr + HPAGE_SIZE - 1) & HPAGE_MASK;
- vma = find_vma(current->mm, addr);
- if (TASK_SIZE > addr + len && !(vma && addr + len >= vma->vm_start))
- goto found_addr;
- }
- addr = TASK_UNMAPPED_BASE;
- for (vma = find_vma(current->mm, addr); TASK_SIZE > addr + len; vma = vma->vm_next) {
- if (!vma || addr + len < vma->vm_start)
- goto found_addr;
- addr = (vma->vm_end + HPAGE_SIZE - 1) & HPAGE_MASK;
- }
- return -ENOMEM;
-found_addr:
- return addr;
-}
-
-asmlinkage unsigned long sys_alloc_hugepages(int key, unsigned long addr, unsigned long len, int prot, int flag)
-{
- struct mm_struct *mm = current->mm;
- unsigned long raddr;
- int retval = 0;
- extern int alloc_hugetlb_pages(int, unsigned long, unsigned long, int, int);
- if (!cpu_has_pse || key < 0 || len & ~HPAGE_MASK)
- return -EINVAL;
- down_write(&mm->mmap_sem);
- raddr = get_addr(addr, len);
- if (raddr != -ENOMEM)
- retval = alloc_hugetlb_pages(key, raddr, len, prot, flag);
- up_write(&mm->mmap_sem);
- return (retval < 0) ? (unsigned long)retval : raddr;
-}
-
-asmlinkage int sys_free_hugepages(unsigned long addr)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int retval;
-
- vma = find_vma(current->mm, addr);
- if (!vma || !(vma->vm_flags & VM_HUGETLB) || vma->vm_start != addr)
- return -EINVAL;
- down_write(&mm->mmap_sem);
- retval = do_munmap(vma->vm_mm, addr, vma->vm_end - addr);
- up_write(&mm->mmap_sem);
- return retval;
-}
-#else
-asmlinkage unsigned long sys_alloc_hugepages(int key, unsigned long addr, size_t len, int prot, int flag)
-{
- return -ENOSYS;
-}
-asmlinkage int sys_free_hugepages(unsigned long addr)
-{
- return -ENOSYS;
-}
-#endif
diff -urN v2.5.47-mm2/arch/i386/mm/hugetlbpage.c v2.5.47-mm2-hugetlbcleanup/arch/i386/mm/hugetlbpage.c
--- v2.5.47-mm2/arch/i386/mm/hugetlbpage.c Wed Nov 13 18:10:01 2002
+++ v2.5.47-mm2-hugetlbcleanup/arch/i386/mm/hugetlbpage.c Wed Nov 13 18:33:51 2002
@@ -32,17 +32,6 @@
int key;
} htlbpagek[MAX_ID];

-static struct inode *find_key_inode(int key)
-{
- int i;
-
- for (i = 0; i < MAX_ID; i++) {
- if (htlbpagek[i].key == key)
- return (htlbpagek[i].in);
- }
- return NULL;
-}
-
static struct page *alloc_hugetlb_page(void)
{
int i;
@@ -101,57 +90,6 @@
set_pte(page_table, entry);
}

-static int anon_get_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, int write_access, pte_t *page_table)
-{
- struct page *page = alloc_hugetlb_page();
- if (page)
- set_huge_pte(mm, vma, page, page_table, write_access);
- return page ? 1 : -1;
-}
-
-static int make_hugetlb_pages_present(unsigned long addr, unsigned long end, int flags)
-{
- int write;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- pte_t *pte;
-
- vma = find_vma(mm, addr);
- if (!vma)
- goto out_error1;
-
- write = (vma->vm_flags & VM_WRITE) != 0;
- if ((vma->vm_end - vma->vm_start) & (HPAGE_SIZE - 1))
- goto out_error1;
- spin_lock(&mm->page_table_lock);
- do {
- pte = huge_pte_alloc(mm, addr);
- if ((pte) && (pte_none(*pte))) {
- if (anon_get_hugetlb_page(mm, vma,
- write ? VM_WRITE : VM_READ,
- pte) == -1)
- goto out_error;
- } else
- goto out_error;
- addr += HPAGE_SIZE;
- } while (addr < end);
- spin_unlock(&mm->page_table_lock);
- vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
- if (flags & MAP_PRIVATE)
- vma->vm_flags |= VM_DONTCOPY;
- vma->vm_ops = &hugetlb_vm_ops;
- return 0;
-out_error: /* Error case, remove the partial lp_resources. */
- if (addr > vma->vm_start) {
- vma->vm_end = addr;
- zap_hugepage_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
- vma->vm_end = end;
- }
- spin_unlock(&mm->page_table_lock);
- out_error1:
- return -1;
-}
-
int
copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -297,138 +235,6 @@
spin_unlock(&mm->page_table_lock);
}

-static struct inode *set_new_inode(unsigned long len, int prot, int flag, int key)
-{
- struct inode *inode;
- int i;
-
- for (i = 0; i < MAX_ID; i++) {
- if (htlbpagek[i].key == 0)
- break;
- }
- if (i == MAX_ID)
- return NULL;
- inode = kmalloc(sizeof (struct inode), GFP_ATOMIC);
- if (inode == NULL)
- return NULL;
-
- inode_init_once(inode);
- atomic_inc(&inode->i_writecount);
- inode->i_mapping = &inode->i_data;
- inode->i_mapping->host = inode;
- inode->i_ino = (unsigned long)key;
-
- htlbpagek[i].key = key;
- htlbpagek[i].in = inode;
- inode->i_uid = current->fsuid;
- inode->i_gid = current->fsgid;
- inode->i_mode = prot;
- inode->i_size = len;
- return inode;
-}
-
-static int check_size_prot(struct inode *inode, unsigned long len, int prot, int flag)
-{
- if (inode->i_uid != current->fsuid)
- return -1;
- if (inode->i_gid != current->fsgid)
- return -1;
- if (inode->i_size != len)
- return -1;
- return 0;
-}
-
-static int alloc_shared_hugetlb_pages(int key, unsigned long addr, unsigned long len, int prot, int flag)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- struct inode *inode;
- struct address_space *mapping;
- int idx;
- int retval = -ENOMEM;
- int newalloc = 0;
-
-try_again:
- spin_lock(&htlbpage_lock);
-
- inode = find_key_inode(key);
- if (inode == NULL) {
- if (!capable(CAP_SYS_ADMIN)) {
- if (!in_group_p(0)) {
- retval = -EPERM;
- goto out_err;
- }
- }
- if (!(flag & IPC_CREAT)) {
- retval = -ENOENT;
- goto out_err;
- }
- inode = set_new_inode(len, prot, flag, key);
- if (inode == NULL)
- goto out_err;
- newalloc = 1;
- } else {
- if (check_size_prot(inode, len, prot, flag) < 0) {
- retval = -EINVAL;
- goto out_err;
- }
- else if (atomic_read(&inode->i_writecount)) {
- spin_unlock(&htlbpage_lock);
- goto try_again;
- }
- }
- spin_unlock(&htlbpage_lock);
- mapping = inode->i_mapping;
-
- addr = do_mmap_pgoff(NULL, addr, len, (unsigned long) prot,
- MAP_NORESERVE|MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, 0);
- if (IS_ERR((void *) addr))
- goto freeinode;
-
- vma = find_vma(mm, addr);
- if (!vma) {
- retval = -EINVAL;
- goto freeinode;
- }
-
- retval = hugetlb_prefault(mapping, vma);
- if (retval)
- goto out;
-
- vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
- vma->vm_ops = &hugetlb_vm_ops;
- spin_unlock(&mm->page_table_lock);
- spin_lock(&htlbpage_lock);
- atomic_set(&inode->i_writecount, 0);
- spin_unlock(&htlbpage_lock);
- return retval;
-out:
- if (addr > vma->vm_start) {
- unsigned long raddr;
- raddr = vma->vm_end;
- vma->vm_end = addr;
- zap_hugepage_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
- vma->vm_end = raddr;
- }
- spin_unlock(&mm->page_table_lock);
- do_munmap(mm, vma->vm_start, len);
- if (newalloc)
- goto freeinode;
- return retval;
-out_err: spin_unlock(&htlbpage_lock);
-freeinode:
- if (newalloc) {
- for(idx=0;idx<MAX_ID;idx++)
- if (htlbpagek[idx].key == inode->i_ino) {
- htlbpagek[idx].key = 0;
- htlbpagek[idx].in = NULL;
- break;
- }
- kfree(inode);
- }
- return retval;
-}
-
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
struct mm_struct *mm = current->mm;
@@ -470,30 +276,6 @@
return ret;
}

-static int alloc_private_hugetlb_pages(int key, unsigned long addr, unsigned long len, int prot, int flag)
-{
- if (!capable(CAP_SYS_ADMIN)) {
- if (!in_group_p(0))
- return -EPERM;
- }
- addr = do_mmap_pgoff(NULL, addr, len, prot,
- MAP_NORESERVE|MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, 0);
- if (IS_ERR((void *) addr))
- return -ENOMEM;
- if (make_hugetlb_pages_present(addr, (addr + len), flag) < 0) {
- do_munmap(current->mm, addr, len);
- return -ENOMEM;
- }
- return 0;
-}
-
-int alloc_hugetlb_pages(int key, unsigned long addr, unsigned long len, int prot, int flag)
-{
- if (key > 0)
- return alloc_shared_hugetlb_pages(key, addr, len, prot, flag);
- return alloc_private_hugetlb_pages(key, addr, len, prot, flag);
-}
-
int set_hugetlb_mem_size(int count)
{
int j, lcount;
diff -urN v2.5.47-mm2/arch/ia64/kernel/entry.S v2.5.47-mm2-hugetlbcleanup/arch/ia64/kernel/entry.S
--- v2.5.47-mm2/arch/ia64/kernel/entry.S Wed Nov 13 18:09:55 2002
+++ v2.5.47-mm2-hugetlbcleanup/arch/ia64/kernel/entry.S Wed Nov 13 18:27:58 2002
@@ -1242,8 +1242,8 @@
data8 sys_sched_setaffinity
data8 sys_sched_getaffinity
data8 sys_security
- data8 sys_alloc_hugepages
- data8 sys_free_hugepages // 1235
+ data8 ia64_ni_syscall
+ data8 ia64_ni_syscall // 1235
data8 sys_exit_group
data8 sys_lookup_dcookie
data8 sys_io_setup
diff -urN v2.5.47-mm2/arch/ia64/kernel/sys_ia64.c v2.5.47-mm2-hugetlbcleanup/arch/ia64/kernel/sys_ia64.c
--- v2.5.47-mm2/arch/ia64/kernel/sys_ia64.c Wed Nov 13 18:09:55 2002
+++ v2.5.47-mm2-hugetlbcleanup/arch/ia64/kernel/sys_ia64.c Wed Nov 13 18:20:33 2002
@@ -9,7 +9,6 @@
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/file.h> /* doh, must come after sched.h... */
@@ -243,80 +242,6 @@
return addr;
}

-#ifdef CONFIG_HUGETLB_PAGE
-
-asmlinkage unsigned long
-sys_alloc_hugepages (int key, unsigned long addr, size_t len, int prot, int flag)
-{
- struct mm_struct *mm = current->mm;
- long retval;
- extern int alloc_hugetlb_pages (int, unsigned long, unsigned long, int, int);
-
- if ((key < 0) || (len & (HPAGE_SIZE - 1)))
- return -EINVAL;
-
- if (addr && ((REGION_NUMBER(addr) != REGION_HPAGE) || (addr & (HPAGE_SIZE - 1))))
- addr = TASK_HPAGE_BASE;
-
- if (!addr)
- addr = TASK_HPAGE_BASE;
- down_write(&mm->mmap_sem);
- {
- retval = arch_get_unmapped_area(NULL, COLOR_HALIGN(addr), len, 0, 0);
- if (retval != -ENOMEM)
- retval = alloc_hugetlb_pages(key, retval, len, prot, flag);
- }
- up_write(&mm->mmap_sem);
-
- if (IS_ERR((void *) retval))
- return retval;
-
- force_successful_syscall_return();
- return retval;
-}
-
-asmlinkage int
-sys_free_hugepages (unsigned long addr)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- extern int free_hugepages(struct vm_area_struct *);
- int retval;
-
- down_write(&mm->mmap_sem);
- {
- vma = find_vma(mm, addr);
- if (!vma || !is_vm_hugetlb_page(vma) || (vma->vm_start != addr))
- retval = -EINVAL;
- goto out;
-
- spin_lock(&mm->page_table_lock);
- {
- retval = free_hugepages(vma);
- }
- spin_unlock(&mm->page_table_lock);
- }
-out:
- up_write(&mm->mmap_sem);
- return retval;
-}
-
-#else /* !CONFIG_HUGETLB_PAGE */
-
-asmlinkage unsigned long
-sys_alloc_hugepages (int key, size_t addr, unsigned long len, int prot, int flag)
-{
- return -ENOSYS;
-}
-
-asmlinkage unsigned long
-sys_free_hugepages (unsigned long addr)
-{
- return -ENOSYS;
-}
-
-#endif /* !CONFIG_HUGETLB_PAGE */
-
asmlinkage long
sys_vm86 (long arg0, long arg1, long arg2, long arg3)
{
diff -urN v2.5.47-mm2/arch/ia64/mm/hugetlbpage.c v2.5.47-mm2-hugetlbcleanup/arch/ia64/mm/hugetlbpage.c
--- v2.5.47-mm2/arch/ia64/mm/hugetlbpage.c Wed Nov 13 18:09:55 2002
+++ v2.5.47-mm2-hugetlbcleanup/arch/ia64/mm/hugetlbpage.c Wed Nov 13 18:40:11 2002
@@ -31,18 +31,6 @@
int key;
} htlbpagek[MAX_ID];

-static struct inode *
-find_key_inode(int key)
-{
- int i;
-
- for (i = 0; i < MAX_ID; i++) {
- if (htlbpagek[i].key == key)
- return (htlbpagek[i].in);
- }
- return NULL;
-}
-
static struct page *
alloc_hugetlb_page (void)
{
@@ -305,193 +293,6 @@
}

int
-free_hugepages (struct vm_area_struct *mpnt)
-{
- unlink_vma(mpnt);
- zap_hugetlb_resources(mpnt);
- kmem_cache_free(vm_area_cachep, mpnt);
- return 1;
-}
-
-static struct inode *
-set_new_inode (unsigned long len, int prot, int flag, int key)
-{
- struct inode *inode;
- int i;
-
- for (i = 0; i < MAX_ID; i++) {
- if (htlbpagek[i].key == 0)
- break;
- }
- if (i == MAX_ID)
- return NULL;
- inode = kmalloc(sizeof (struct inode), GFP_ATOMIC);
- if (inode == NULL)
- return NULL;
-
- inode_init_once(inode);
- atomic_inc(&inode->i_writecount);
- inode->i_mapping = &inode->i_data;
- inode->i_mapping->host = inode;
- inode->i_ino = (unsigned long) key;
-
- htlbpagek[i].key = key;
- htlbpagek[i].in = inode;
- inode->i_uid = current->fsuid;
- inode->i_gid = current->fsgid;
- inode->i_mode = prot;
- inode->i_size = len;
- return inode;
-}
-
-static int
-check_size_prot (struct inode *inode, unsigned long len, int prot, int flag)
-{
- if (inode->i_uid != current->fsuid)
- return -1;
- if (inode->i_gid != current->fsgid)
- return -1;
- if (inode->i_size != len)
- return -1;
- return 0;
-}
-
-int
-alloc_shared_hugetlb_pages (int key, unsigned long addr, unsigned long len, int prot, int flag)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- struct inode *inode;
- struct address_space *mapping;
- struct page *page;
- int idx;
- int retval = -ENOMEM;
- int newalloc = 0;
-
-try_again:
- spin_lock(&htlbpage_lock);
- inode = find_key_inode(key);
- if (inode == NULL) {
- if (!capable(CAP_SYS_ADMIN)) {
- if (!in_group_p(0)) {
- retval = -EPERM;
- goto out_err;
- }
- }
- if (!(flag & IPC_CREAT)) {
- retval = -ENOENT;
- goto out_err;
- }
- inode = set_new_inode(len, prot, flag, key);
- if (inode == NULL)
- goto out_err;
- newalloc = 1;
- } else {
- if (check_size_prot(inode, len, prot, flag) < 0) {
- retval = -EINVAL;
- goto out_err;
- }
- else if (atomic_read(&inode->i_writecount)) {
- spin_unlock(&htlbpage_lock);
- goto try_again;
- }
- }
- spin_unlock(&htlbpage_lock);
- mapping = inode->i_mapping;
-
- addr = do_mmap_pgoff(NULL, addr, len, (unsigned long) prot,
- MAP_NORESERVE|MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, 0);
- if (IS_ERR((void *) addr))
- goto freeinode;
-
- vma = find_vma(mm, addr);
- if (!vma) {
- retval = -EINVAL;
- goto freeinode;
- }
-
- spin_lock(&mm->page_table_lock);
- do {
- pte_t *pte = huge_pte_alloc(mm, addr);
- if ((pte) && (pte_none(*pte))) {
- idx = (addr - vma->vm_start) >> HPAGE_SHIFT;
- page = find_get_page(mapping, idx);
- if (page == NULL) {
- page = alloc_hugetlb_page();
- if (page == NULL)
- goto out;
- add_to_page_cache(page, mapping, idx);
- }
- set_huge_pte(mm, vma, page, pte,
- (vma->vm_flags & VM_WRITE));
- } else
- goto out;
- addr += HPAGE_SIZE;
- } while (addr < vma->vm_end);
- retval = 0;
- vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
- vma->vm_ops = &hugetlb_vm_ops;
- spin_unlock(&mm->page_table_lock);
- spin_lock(&htlbpage_lock);
- atomic_set(&inode->i_writecount, 0);
- spin_unlock(&htlbpage_lock);
- return retval;
-out:
- if (addr > vma->vm_start) {
- unsigned long raddr = vma->vm_end;
- vma->vm_end = addr;
- zap_hugetlb_resources(vma);
- vma->vm_end = raddr;
- }
- spin_unlock(&mm->page_table_lock);
- do_munmap(mm, vma->vm_start, len);
- if (newalloc)
- goto freeinode;
- return retval;
-
-out_err:
- spin_unlock(&htlbpage_lock);
-freeinode:
- if (newalloc) {
- for (idx = 0; idx < MAX_ID; idx++)
- if (htlbpagek[idx].key == inode->i_ino) {
- htlbpagek[idx].key = 0;
- htlbpagek[idx].in = NULL;
- break;
- }
- kfree(inode);
- }
- return retval;
-}
-
-static int
-alloc_private_hugetlb_pages (int key, unsigned long addr, unsigned long len, int prot, int flag)
-{
- if (!capable(CAP_SYS_ADMIN)) {
- if (!in_group_p(0))
- return -EPERM;
- }
- addr = do_mmap_pgoff(NULL, addr, len, prot,
- MAP_NORESERVE | MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, 0);
- if (IS_ERR((void *) addr))
- return -ENOMEM;
- if (make_hugetlb_pages_present(addr, (addr + len), flag) < 0) {
- do_munmap(current->mm, addr, len);
- return -ENOMEM;
- }
- return 0;
-}
-
-int
-alloc_hugetlb_pages (int key, unsigned long addr, unsigned long len, int prot, int flag)
-{
- if (key > 0)
- return alloc_shared_hugetlb_pages(key, addr, len, prot, flag);
- else
- return alloc_private_hugetlb_pages(key, addr, len, prot, flag);
-}
-
-int
set_hugetlb_mem_size (int count)
{
int j, lcount;
diff -urN v2.5.47-mm2/arch/sparc64/kernel/sys_sparc.c v2.5.47-mm2-hugetlbcleanup/arch/sparc64/kernel/sys_sparc.c
--- v2.5.47-mm2/arch/sparc64/kernel/sys_sparc.c Wed Nov 13 18:09:56 2002
+++ v2.5.47-mm2-hugetlbcleanup/arch/sparc64/kernel/sys_sparc.c Wed Nov 13 18:21:12 2002
@@ -13,7 +13,6 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
-#include <linux/hugetlb.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
@@ -683,99 +682,3 @@
return err;
}

-#ifdef CONFIG_HUGETLB_PAGE
-#define HPAGE_ALIGN(x) (((unsigned long)x + (HPAGE_SIZE -1)) & HPAGE_MASK)
-extern long sys_munmap(unsigned long, size_t);
-
-/* get_addr function gets the currently unused virtual range in
- * the current process's address space. It returns the LARGE_PAGE_SIZE
- * aligned address (in cases of success). Other kernel generic
- * routines only could gurantee that allocated address is PAGE_SIZE aligned.
- */
-static long get_addr(unsigned long addr, unsigned long len)
-{
- struct vm_area_struct *vma;
- if (addr) {
- addr = HPAGE_ALIGN(addr);
- vma = find_vma(current->mm, addr);
- if (((TASK_SIZE - len) >= addr) &&
- (!vma || addr + len <= vma->vm_start))
- goto found_addr;
- }
- addr = HPAGE_ALIGN(TASK_UNMAPPED_BASE);
- for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
- if (TASK_SIZE - len < addr)
- return -ENOMEM;
- if (!vma || ((addr + len) < vma->vm_start))
- goto found_addr;
- addr = vma->vm_end;
- }
-found_addr:
- addr = HPAGE_ALIGN(addr);
- return addr;
-}
-
-extern int alloc_hugetlb_pages(int, unsigned long, unsigned long, int, int);
-
-asmlinkage long
-sys_alloc_hugepages(int key, unsigned long addr, unsigned long len, int prot, int flag)
-{
- struct mm_struct *mm = current->mm;
- unsigned long raddr;
- int retval;
-
- if (key < 0)
- return -EINVAL;
- if (len & (HPAGE_SIZE - 1))
- return -EINVAL;
- down_write(&mm->mmap_sem);
- raddr = get_addr(addr, len);
- retval = 0;
- if (raddr == -ENOMEM) {
- retval = -ENOMEM;
- goto raddr_out;
- }
- retval = alloc_hugetlb_pages(key, raddr, len, prot, flag);
-
-raddr_out:
- up_write(&mm->mmap_sem);
- if (retval < 0)
- return (long) retval;
-
- return raddr;
-}
-
-extern int free_hugepages(struct vm_area_struct *);
-
-asmlinkage int
-sys_free_hugepages(unsigned long addr)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int retval;
-
- vma = find_vma(current->mm, addr);
- if ((!vma) || (!is_vm_hugetlb_page(vma)) || (vma->vm_start!=addr))
- return -EINVAL;
- down_write(&mm->mmap_sem);
- spin_lock(&mm->page_table_lock);
- retval = free_hugepages(vma);
- spin_unlock(&mm->page_table_lock);
- up_write(&mm->mmap_sem);
- return retval;
-}
-
-#else
-
-asmlinkage long
-sys_alloc_hugepages(int key, unsigned long addr, size_t len, int prot, int flag)
-{
- return -ENOSYS;
-}
-asmlinkage int
-sys_free_hugepages(unsigned long addr)
-{
- return -ENOSYS;
-}
-
-#endif
diff -urN v2.5.47-mm2/arch/sparc64/kernel/systbls.S v2.5.47-mm2-hugetlbcleanup/arch/sparc64/kernel/systbls.S
--- v2.5.47-mm2/arch/sparc64/kernel/systbls.S Wed Nov 13 18:12:16 2002
+++ v2.5.47-mm2-hugetlbcleanup/arch/sparc64/kernel/systbls.S Wed Nov 13 18:27:21 2002
@@ -65,8 +65,8 @@
.word sys32_ipc, sys32_sigreturn, sys_clone, sys_nis_syscall, sys32_adjtimex
/*220*/ .word sys32_sigprocmask, sys_ni_syscall, sys32_delete_module, sys_ni_syscall, sys_getpgid
.word sys32_bdflush, sys32_sysfs, sys_nis_syscall, sys32_setfsuid16, sys32_setfsgid16
-/*230*/ .word sys32_select, sys_time, sys_nis_syscall, sys_stime, sys_alloc_hugepages
- .word sys_free_hugepages, sys_llseek, sys_mlock, sys_munlock, sys_mlockall
+/*230*/ .word sys32_select, sys_time, sys_nis_syscall, sys_stime, sys_ni_syscall
+ .word sys_ni_syscall, sys_llseek, sys_mlock, sys_munlock, sys_mlockall
/*240*/ .word sys_munlockall, sys_sched_setparam, sys_sched_getparam, sys_sched_setscheduler, sys_sched_getscheduler
.word sys_sched_yield, sys_sched_get_priority_max, sys_sched_get_priority_min, sys32_sched_rr_get_interval, sys32_nanosleep
/*250*/ .word sys32_mremap, sys32_sysctl, sys_getsid, sys_fdatasync, sys32_nfsservctl
diff -urN v2.5.47-mm2/include/asm-alpha/unistd.h v2.5.47-mm2-hugetlbcleanup/include/asm-alpha/unistd.h
--- v2.5.47-mm2/include/asm-alpha/unistd.h Wed Nov 13 18:09:58 2002
+++ v2.5.47-mm2-hugetlbcleanup/include/asm-alpha/unistd.h Wed Nov 13 18:23:03 2002
@@ -340,8 +340,6 @@
#define __NR_io_getevents 400
#define __NR_io_submit 401
#define __NR_io_cancel 402
-#define __NR_alloc_hugepages 403
-#define __NR_free_hugepages 404
#define __NR_exit_group 405
#define NR_SYSCALLS 406

diff -urN v2.5.47-mm2/include/asm-i386/unistd.h v2.5.47-mm2-hugetlbcleanup/include/asm-i386/unistd.h
--- v2.5.47-mm2/include/asm-i386/unistd.h Wed Nov 13 18:09:58 2002
+++ v2.5.47-mm2-hugetlbcleanup/include/asm-i386/unistd.h Wed Nov 13 18:22:04 2002
@@ -254,8 +254,6 @@
#define __NR_io_getevents 247
#define __NR_io_submit 248
#define __NR_io_cancel 249
-#define __NR_alloc_hugepages 250
-#define __NR_free_hugepages 251
#define __NR_exit_group 252
#define __NR_lookup_dcookie 253
#define __NR_sys_epoll_create 254
diff -urN v2.5.47-mm2/include/asm-ia64/unistd.h v2.5.47-mm2-hugetlbcleanup/include/asm-ia64/unistd.h
--- v2.5.47-mm2/include/asm-ia64/unistd.h Wed Nov 13 18:09:58 2002
+++ v2.5.47-mm2-hugetlbcleanup/include/asm-ia64/unistd.h Wed Nov 13 18:24:20 2002
@@ -223,8 +223,6 @@
#define __NR_sched_setaffinity 1231
#define __NR_sched_getaffinity 1232
#define __NR_security 1233
-#define __NR_alloc_hugepages 1234
-#define __NR_free_hugepages 1235
#define __NR_exit_group 1236
#define __NR_lookup_dcookie 1237
#define __NR_io_setup 1238
diff -urN v2.5.47-mm2/include/asm-ppc/unistd.h v2.5.47-mm2-hugetlbcleanup/include/asm-ppc/unistd.h
--- v2.5.47-mm2/include/asm-ppc/unistd.h Wed Nov 13 18:10:05 2002
+++ v2.5.47-mm2-hugetlbcleanup/include/asm-ppc/unistd.h Wed Nov 13 18:24:03 2002
@@ -236,8 +236,6 @@
#define __NR_io_getevents 229
#define __NR_io_submit 230
#define __NR_io_cancel 231
-#define __NR_alloc_hugepages 232
-#define __NR_free_hugepages 233
#define __NR_exit_group 234
#define __NR_lookup_dcookie 235
#define __NR_sys_epoll_create 236
diff -urN v2.5.47-mm2/include/asm-ppc64/unistd.h v2.5.47-mm2-hugetlbcleanup/include/asm-ppc64/unistd.h
--- v2.5.47-mm2/include/asm-ppc64/unistd.h Wed Nov 13 18:24:46 2002
+++ v2.5.47-mm2-hugetlbcleanup/include/asm-ppc64/unistd.h Wed Nov 13 18:25:17 2002
@@ -241,8 +241,6 @@
#define __NR_io_getevents 229
#define __NR_io_submit 230
#define __NR_io_cancel 231
-#define __NR_alloc_hugepages 232
-#define __NR_free_hugepages 233
#define __NR_exit_group 234

#define __NR(n) #n
diff -urN v2.5.47-mm2/include/asm-sparc/unistd.h v2.5.47-mm2-hugetlbcleanup/include/asm-sparc/unistd.h
--- v2.5.47-mm2/include/asm-sparc/unistd.h Wed Nov 13 18:10:05 2002
+++ v2.5.47-mm2-hugetlbcleanup/include/asm-sparc/unistd.h Wed Nov 13 18:23:27 2002
@@ -249,8 +249,6 @@
#define __NR_time 231 /* Linux Specific */
/* #define __NR_oldstat 232 Linux Specific */
#define __NR_stime 233 /* Linux Specific */
-#define __NR_alloc_hugepages 234 /* Linux Specific */
-#define __NR_free_hugepages 235 /* Linux Specific */
#define __NR__llseek 236 /* Linux Specific */
#define __NR_mlock 237
#define __NR_munlock 238
diff -urN v2.5.47-mm2/include/asm-sparc64/unistd.h v2.5.47-mm2-hugetlbcleanup/include/asm-sparc64/unistd.h
--- v2.5.47-mm2/include/asm-sparc64/unistd.h Wed Nov 13 18:10:05 2002
+++ v2.5.47-mm2-hugetlbcleanup/include/asm-sparc64/unistd.h Wed Nov 13 18:23:42 2002
@@ -251,8 +251,6 @@
#endif
/* #define __NR_oldstat 232 Linux Specific */
#define __NR_stime 233 /* Linux Specific */
-#define __NR_alloc_hugepages 234 /* Linux Specific */
-#define __NR_free_hugepages 235 /* Linux Specific */
#define __NR__llseek 236 /* Linux Specific */
#define __NR_mlock 237
#define __NR_munlock 238
diff -urN v2.5.47-mm2/include/asm-x86_64/ia32_unistd.h v2.5.47-mm2-hugetlbcleanup/include/asm-x86_64/ia32_unistd.h
--- v2.5.47-mm2/include/asm-x86_64/ia32_unistd.h Mon Nov 4 16:15:14 2002
+++ v2.5.47-mm2-hugetlbcleanup/include/asm-x86_64/ia32_unistd.h Wed Nov 13 18:26:03 2002
@@ -256,8 +256,6 @@
#define __NR_ia32_io_getevents 247
#define __NR_ia32_io_submit 248
#define __NR_ia32_io_cancel 249
-#define __NR_ia32_alloc_hugepages 250
-#define __NR_ia32_free_hugepages 251
#define __NR_ia32_exit_group 252

#define IA32_NR_syscalls 260 /* must be > than biggest syscall! */


2002-11-14 00:35:38

by Rik van Riel

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Wed, 13 Nov 2002, Benjamin LaHaise wrote:

> Since the functionality of the hugetlb syscalls is now available via
> hugetlbfs with better control over permissions, could you apply the
> following patch that gets rid of a lot of duplicate and unnescessary
> code by removing the two hugetlb syscalls?

#include <massive_applause.h>

Yes, lets get rid of this ugliness before somebody actually
finds a way to use these syscalls...

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://guru.conectiva.com/
Current spamtrap: <a href=mailto:"[email protected]">[email protected]</a>

2002-11-14 08:46:12

by Eric Dumazet

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

I beg to differ.

I already use the syscalls.

The patch doesnt change Documentation/vm/hugetlbpage.txt

How one is supposed to use hugetlbfs ? That's not documented.

Before dropping support for syscalls, please change the Documentation.

Thanks

----- Original Message -----
From: "Rik van Riel" <[email protected]>
To: "Benjamin LaHaise" <[email protected]>
Cc: "Andrew Morton" <[email protected]>; <[email protected]>;
<[email protected]>
Sent: Thursday, November 14, 2002 1:42 AM
Subject: Re: [patch] remove hugetlb syscalls


> On Wed, 13 Nov 2002, Benjamin LaHaise wrote:
>
> > Since the functionality of the hugetlb syscalls is now available via
> > hugetlbfs with better control over permissions, could you apply the
> > following patch that gets rid of a lot of duplicate and unnescessary
> > code by removing the two hugetlb syscalls?
>
> #include <massive_applause.h>
>
> Yes, lets get rid of this ugliness before somebody actually
> finds a way to use these syscalls...
>
> regards,
>
> Rik
> --
> Bravely reimplemented by the knights who say "NIH".
> http://www.surriel.com/ http://guru.conectiva.com/
> Current spamtrap: <a
href=mailto:"[email protected]">[email protected]</a>
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2002-11-14 14:06:23

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 09:52:33AM +0100, dada1 wrote:
> I beg to differ.
>
> I already use the syscalls.

For what?

> How one is supposed to use hugetlbfs ? That's not documented.

mount -t hugetlbfs whocares /huge

fd = open("/huge/nose", ..)

mmap(.., fd, ..)

2002-11-14 15:07:32

by Eric Dumazet

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

Thanks Christoph

If I asked, this is because I tried the obvious and it doesnt work.

# cat /proc/version
Linux version 2.5.47 ([email protected]) (gcc version 3.2) #10 Tue Nov 12
11:27:43 CET 2002
# cat /proc/sys/vm/nr_hugepages
4
# cat /proc/meminfo | grep Huge
HugePages_Total: 4
HugePages_Free: 4
Hugepagesize: 4096 kB
# mount | grep huge
whocares on /huge type hugetlbfs (rw)
# cat huge.c
#include <unistd.h>
#include <asm/unistd.h>
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

#ifndef __NR_sys_alloc_hugepages
# define __NR_sys_alloc_hugepages 250
#endif

#define BIGSZ (4*1024*1024)

_syscall5(void *, sys_alloc_hugepages, int, key, unsigned long, addr,
size_t, len, int, prot, int, flag)

main(argc, argv)
int argc ;
char *argv[] ;
{
char *ptr ;
int c ;
int fd = -1 ;
int nbp = 1 ;
while ((c = getopt(argc, argv, "n:f:")) != EOF) {
switch (c) {
case 'n':
nbp = atoi(optarg) ; break ;
case 'f' :
fd = open(optarg, 2) ; break ;
}
}
if (fd != -1) {
ftruncate(fd, nbp*BIGSZ) ;
ptr = mmap(0, nbp*BIGSZ, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0) ;
if (ptr == (char *)-1)
ptr = mmap(0, nbp*BIGSZ, PROT_READ|PROT_WRITE, MAP_PRIVATE,
fd, 0) ;
}
else
ptr = sys_alloc_hugepages(0, 0, nbp*BIGSZ, PROT_READ|PROT_WRITE, 0) ;

printf("alloc %d 4Mo pages ptr=%p errno=%d\n", nbp, ptr, errno) ;

pause() ;
}

# ./huge # (using the syscall)
alloc 1 4Mo pages ptr=0x40400000 errno=0
^C
# ls -l /huge/BIG
-rw-r--r-- 1 root root 4194304 Nov 14 15:57 /huge/BIG
# ./huge -f /huge/BIG (using mmap)
./huge -f /huge/BIG
alloc 1 4Mo pages ptr=0xffffffff errno=22
^C
# strace ...
open("/huge/BIG", O_RDWR) = 3
ftruncate(3, 4194304) = 0
mmap2(NULL, 4194304, PROT_READ|PROT_WRITE, MAP_SHARED, 3, 0) = -1 EINVAL
(Invalid argument)
mmap2(NULL, 4194304, PROT_READ|PROT_WRITE, MAP_PRIVATE, 3, 0) = -1 EINVAL
(Invalid argument)


Not a trivial task it seems. The syscall is very easy.. sorry.

Thanks


From: "Christoph Hellwig" <[email protected]>
> On Thu, Nov 14, 2002 at 09:52:33AM +0100, dada1 wrote:
> > I beg to differ.
> >
> > I already use the syscalls.
>
> For what?
>
> > How one is supposed to use hugetlbfs ? That's not documented.
>
> mount -t hugetlbfs whocares /huge
>
> fd = open("/huge/nose", ..)
>
> mmap(.., fd, ..)
>

2002-11-14 15:24:54

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 04:13:56PM +0100, dada1 wrote:
> Thanks Christoph
>
> If I asked, this is because I tried the obvious and it doesnt work.

It's a file. You need to use MAP_SHARED.

-ben

2002-11-14 15:32:07

by Eric Dumazet

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

Please look again into my mail :)

ptr = mmap(0, nbp*BIGSZ, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0) ;
if (ptr == (char *)-1)
ptr = mmap(0, nbp*BIGSZ, PROT_READ|PROT_WRITE, MAP_PRIVATE,
fd, 0);

mmap2(NULL, 4194304, PROT_READ|PROT_WRITE, MAP_SHARED, 3, 0) = -1 EINVAL

mmap2(NULL, 4194304, PROT_READ|PROT_WRITE, MAP_PRIVATE, 3, 0) = -1 EINVAL

I tried the two versions. MAP_SHARED and MAP_PRIVATE



From: "Benjamin LaHaise" <[email protected]>
> On Thu, Nov 14, 2002 at 04:13:56PM +0100, dada1 wrote:
> > Thanks Christoph
> >
> > If I asked, this is because I tried the obvious and it doesnt work.
>
> It's a file. You need to use MAP_SHARED.
>
> -ben
>

2002-11-14 17:45:04

by David Mosberger-Tang

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

>>>>> On Thu, 14 Nov 2002 15:20:10 +0100, Christoph Hellwig <[email protected]> said:

Christoph> mount -t hugetlbfs whocares /huge

Christoph> fd = open("/huge/nose", ..)

Christoph> mmap(.., fd, ..)

One potential downside of this is that programmers might expect
mremap(), mprotect() etc. to work on the returned memory at the
granularity of base-pages. I'm not sure though whether that was part
of the reason Linus wanted separate syscalls.

--david

2002-11-14 17:58:23

by Alan

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, 2002-11-14 at 17:51, David Mosberger-Tang wrote:
> One potential downside of this is that programmers might expect
> mremap(), mprotect() etc. to work on the returned memory at the
> granularity of base-pages. I'm not sure though whether that was part
> of the reason Linus wanted separate syscalls.

The extra syscalls dont change anything. mremap/mprotect still fails in
the same way after you use them

2002-11-14 18:46:22

by davidm

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

>>>>> On 14 Nov 2002 18:31:15 +0000, Alan Cox <[email protected]> said:

Alan> On Thu, 2002-11-14 at 17:51, David Mosberger-Tang wrote:
>> One potential downside of this is that programmers might expect
>> mremap(), mprotect() etc. to work on the returned memory at the
>> granularity of base-pages. I'm not sure though whether that was
>> part of the reason Linus wanted separate syscalls.

Alan> The extra syscalls dont change anything. mremap/mprotect still
Alan> fails in the same way after you use them

But that's excactly the point. The hugepage interface returns a
different kind of virtual memory. There are tons of programs out
there using mmap(). If such a program gets fed a path to the
hugepagefs, it might end up with huge pages without knowing anything
about huge pages. For the most part, that might work fine, but it
could lead to subtle failures.

--david

2002-11-14 19:19:18

by Alan

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, 2002-11-14 at 18:53, David Mosberger-Tang wrote:
> But that's excactly the point. The hugepage interface returns a
> different kind of virtual memory. There are tons of programs out
> there using mmap(). If such a program gets fed a path to the
> hugepagefs, it might end up with huge pages without knowing anything
> about huge pages. For the most part, that might work fine, but it
> could lead to subtle failures.

Your argument makes sense. You are arguing

I created it with weirdass syscall, magically all my libraries and other
apps will know this so not use mremap etc on that space.

Thats complete donkey poo

2002-11-14 19:21:41

by Jeff Garzik

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

David Mosberger-Tang wrote:

> But that's excactly the point. The hugepage interface returns a
> different kind of virtual memory. There are tons of programs out
> there using mmap(). If such a program gets fed a path to the
> hugepagefs, it might end up with huge pages without knowing anything
> about huge pages. For the most part, that might work fine, but it
> could lead to subtle failures.


Yeah, that was one of Linus's points about the syscalls, in a private
email. I mentioned how the new syscalls were in poor taste, when
existing syscalls would work fine, and he flamed me right back ;-)

One of his main points to me was exactly what you are elucidating:
there are subtle differences between normal pages and superpages that
are exposed to userland, and we should make that explicit [with the
syscalls] rather than hide it [with hugetlbfs/mmap/etc.]. So I think
this is further indication Linus has a very valid point ;-)

However, that said, I think hugetlbfs will almost always get used in
preference to the syscalls, so leaving them in may be more a statement
of technical correctness/cleanliness than anything else.

[tangent warning]
This whole hugetlb affair is unfortunately pretty ugly, and this thread
is just one component of that. All these discussions occurred off-list,
and it's _still_ a political football. Sigh. I just hope that the
furor dies down soon, that smart technical [apolitical] decisions are
made, and future discussions are at least CC'd to lkml.

Jeff



2002-11-14 19:41:11

by Alan

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, 2002-11-14 at 19:52, Alan Cox wrote:
> On Thu, 2002-11-14 at 18:53, David Mosberger-Tang wrote:
> > But that's excactly the point. The hugepage interface returns a
> > different kind of virtual memory. There are tons of programs out
> > there using mmap(). If such a program gets fed a path to the
> > hugepagefs, it might end up with huge pages without knowing anything
> > about huge pages. For the most part, that might work fine, but it
> > could lead to subtle failures.
>
> Your argument makes sense. You are arguing

Makes no sense rather 8)


2002-11-14 19:42:19

by Alan

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, 2002-11-14 at 19:28, Jeff Garzik wrote:
> However, that said, I think hugetlbfs will almost always get used in
> preference to the syscalls, so leaving them in may be more a statement
> of technical correctness/cleanliness than anything else.

Just rewrite the syscall crap in userspace as library functions calling
hugetlbfs. End of problem, less kernel code, less bugs, same interface

2002-11-14 19:54:19

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 11:51:46AM -0800, Rohit Seth wrote:
> [-- text/html is unsupported (use 'v' to view this part) --]

Sorry, but your html-only email got trapped by the spam filters
of linux-mm and linux-kernel.

-ben
--
"Do you seek knowledge in time travel?"

2002-11-14 20:07:30

by Rohit Seth

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

Benjamin LaHaise wrote:

>On Thu, Nov 14, 2002 at 04:13:56PM +0100, dada1 wrote:
>
>
>>Thanks Christoph
>>
>>If I asked, this is because I tried the obvious and it doesnt work.
>>
>>
>
>It's a file. You need to use MAP_SHARED.
>
>
This is not the problem with MAP_SHARED. It is the lack of (arch
specific) hugepage aligned function support in the kernel. You can use
the mmap on hugetlbfs using only MAP_FIXED with properly aligned
addresses (but then this also is only a hint to kernel). With addr ==
NULL in mmap, the function is bound to fail almost all the times.


Thanks Ben for letting me know the problem with previous post.

>
>



2002-11-14 20:29:52

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, 14 Nov 2002 15:20:10 +0100, Christoph Hellwig <[email protected]> said:
Christoph> mount -t hugetlbfs whocares /huge
Christoph> fd = open("/huge/nose", ..)
Christoph> mmap(.., fd, ..)

On Thu, Nov 14, 2002 at 09:51:55AM -0800, David Mosberger-Tang wrote:
> One potential downside of this is that programmers might expect
> mremap(), mprotect() etc. to work on the returned memory at the
> granularity of base-pages. I'm not sure though whether that was part
> of the reason Linus wanted separate syscalls.
> --david

I'm not entirely sure. There is quite a bit about this filesystem that
assumes the user already knows what he's doing, for instance:

(1) CAP_IPC_LOCK is required to access anything on it
(2) only mmap() and truncate() are supported
(3) ->f_op->mmap() will hand -EINVAL back to userspace instead of
automatically placing the vma, for explicit and 0 start adresses
(4) ->f_op->mmap() will also hand back -EINAL if one attempts to mmap()
beyond the end of the file

There is the assumption that the user knows what he's doing here, and
he'd better anyway, because he's capable(CAP_IPC_LOCK). Some easy
accommodations could be made, for instance, implicitly truncating the
file to be larger if mmap()'d beyond the end of it, and automatic
placement is easily doable. But I'm not concerned about it at all; the
primary user (IBM's DB2 database) is perfectly capable of dealing with
these constraints on its usage by tweaking buffer pool and other
parameters. Of course, other users may be accommodated if desired.


Bill

2002-11-14 20:26:24

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Wed, Nov 13, 2002 at 06:45:55PM -0500, Benjamin LaHaise wrote:
> Since the functionality of the hugetlb syscalls is now available via
> hugetlbfs with better control over permissions, could you apply the
> following patch that gets rid of a lot of duplicate and unnescessary
> code by removing the two hugetlb syscalls? This patch was only tested
> on x86, so if people could take a look over it and see if I missed
> anything I'd appreciate it. Thanks,

The main reason I haven't considered doing this is because they already
got in and there appears to be a user (Oracle/IA64).

My strategy has been instead to contribute fixes, cleanups, and some
redesign of internal data structure management so that the code is more
palatable and robust in general. And I think this process has gone well
enough to say that the system calls are relatively inoffensive, or will
be once what's in various patch queues as of today hits mainline.

Basically, I myself don't have a user of the stuff I need to service,
but I already know who's going to scream if it hits the chopping block.

The primary motivation behind the funding of hugetlbfs was to export
the superpage functionality as a tmpfs-like filesystem to other
in-kernel facilities, most notably SysV shm, for interoperability with
other databases (DB2) that are very insistent about utilizing superpages
in combination with SysV shm. Some additional benefits are available,
though not utilized, because the files may be larger than the virtual
address space on i386, and so mmap()-based windowing into a large
virtual arena could reap both TLB and space consumption benefits, which
benefit is also desired for (eventual) database usage, and is already
very beneficial to simpler applications with high concurrency levels
operating on shared memory arenas backed by it..

Oracle itself prefers to perform scatter/gather mappings at a
granularity too small for hugetlb on i386, and so sys_remap_file_pages()
is its preferred solution.

There are various oddities here. sys_remap_file_pages() is a new
syscall, hugetlbfs is a RAM-backed filesystem that supports only mmap()
and truncate(), and only on specifically-aligned regions. But IMHO with
effort and careful design, these facilities have either been cleanly
implemented or made clean once re-examined, and so supporting databases
has in the end not damaged the integrity or cleanliness of the generic VM.


Bill

2002-11-14 20:33:07

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 12:11:32PM -0800, Rohit Seth wrote:
> This is not the problem with MAP_SHARED. It is the lack of (arch
> specific) hugepage aligned function support in the kernel. You can use
> the mmap on hugetlbfs using only MAP_FIXED with properly aligned
> addresses (but then this also is only a hint to kernel). With addr ==
> NULL in mmap, the function is bound to fail almost all the times.

There's very little standing in the way of automatic placement. If in
your opinion it should be implemented, I'll add that feature today.

IIRC you mentioned you would like to export the arch-specific
hugepage-aligned vma placement functions; once these are available,
it should be trivial to reuse them.


Thanks,
Bill

2002-11-14 20:41:16

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 12:30:35PM -0800, William Lee Irwin III wrote:
> The main reason I haven't considered doing this is because they already
> got in and there appears to be a user (Oracle/IA64).

Not in shipping code. Certainly no vendor kernels that I am aware of
have shipped these syscalls yet either, as nearly all of the developers
find them revolting. Not to mention that the code cleanups and bugfixes
are still ongoing.

-ben

2002-11-14 21:04:41

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

Oracle does not run as root, so they can't even use the syscalls
directly. At least with hugetlbfs we can chmod the filesystem to be
owned by the oracle user.

-ben

On Thu, Nov 14, 2002 at 01:02:20PM -0800, William Lee Irwin III wrote:
> On Thu, Nov 14, 2002 at 12:30:35PM -0800, William Lee Irwin III wrote:
> >> The main reason I haven't considered doing this is because they already
> >> got in and there appears to be a user (Oracle/IA64).
>
> On Thu, Nov 14, 2002 at 03:48:09PM -0500, Benjamin LaHaise wrote:
> > Not in shipping code. Certainly no vendor kernels that I am aware of
> > have shipped these syscalls yet either, as nearly all of the developers
> > find them revolting. Not to mention that the code cleanups and bugfixes
> > are still ongoing.
>
> This is a bit out of my hands; the support decision came from elsewhere.
> I have to service my users first, and after that, I don't generally want
> to stand in the way of others. In general it's good to have minimalistic
> interfaces, but I'm not a party to the concerns regarding the syscalls.
> My direct involvement there has been either of a kernel janitor nature,
> helping to adapt it to Linux kernel idioms, or reusing code for hugetlbfs.
>
> I guess the only real statement left to make is that hugetlbfs (or my
> participation/implementation of it) was not originally intended to
> compete with the syscalls, though there's a lot of obvious overlap
> (which I tried to exploit by means of code reuse).
>
> Bill

--
"Do you seek knowledge in time travel?"

2002-11-14 21:24:28

by davidm

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

>>>>> On Thu, 14 Nov 2002 12:34:11 -0800, William Lee Irwin III <[email protected]> said:

William> (3) ->f_op->mmap() will hand -EINVAL back to userspace
William> instead of automatically placing the vma, for explicit and
William> 0 start adresses

This sounds like a receipe for creating unportable programs because
the alignment constraints will be different from one platform to
another. (Adding gethugepagesize() in libc would alleviate the
problem but it wouldn't solve it completely.)

Overall, it does sound to me that the hugetlbfs is so specialized that
it would be much cleaner to provide a separate interface at the
user-level. That would leave more flexibility should the
implementation change over time (which it well might).

--david

2002-11-14 21:36:12

by Rohit Seth

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

Benjamin LaHaise wrote:

>Oracle does not run as root, so they can't even use the syscalls
>directly. At least with hugetlbfs we can chmod the filesystem to be
>owned by the oracle user.
>
> -ben
>
>
>
Strictly speaking user don't have to be root. Currently the syscall
only requires users to have root as one of the supplementary groups (and
that is how Oracle is actually using these syscalls). And if
CAP_IPC_LOCK (to make it coherent with fs side of the world) is what is
preferdto provide access to hugepages then that change is simple also.
Don't need to do any chmod.

rohit


2002-11-14 21:39:33

by davidm

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

>>>>> On Thu, 14 Nov 2002 13:38:22 -0800, William Lee Irwin III <[email protected]> said:

Bill> On Thu, 14 Nov 2002 12:34:11 -0800, William Lee Irwin III
Bill> said:
William> (3) ->f_op->mmap() will hand -EINVAL back to userspace
William> instead of automatically placing the vma, for explicit and
William> 0 start adresses

Bill> On Thu, Nov 14, 2002 at 01:31:21PM -0800, David Mosberger-Tang
Bill> wrote:
>> This sounds like a receipe for creating unportable programs
>> because the alignment constraints will be different from one
>> platform to another. (Adding gethugepagesize() in libc would
>> alleviate the problem but it wouldn't solve it completely.)
>> Overall, it does sound to me that the hugetlbfs is so specialized
>> that it would be much cleaner to provide a separate interface at
>> the user-level. That would leave more flexibility should the
>> implementation change over time (which it well might). --david

Bill> Okay, that's a serious problem. But it's easy to fix; I'll
Bill> just call the hugepage vma placement functions that are also
Bill> used by the syscalls.

Sounds good to me.

Thanks,

--david

2002-11-14 21:52:09

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 01:40:25PM -0800, Rohit Seth wrote:
> Strictly speaking user don't have to be root. Currently the syscall
> only requires users to have root as one of the supplementary groups (and
> that is how Oracle is actually using these syscalls). And if
> CAP_IPC_LOCK (to make it coherent with fs side of the world) is what is
> preferdto provide access to hugepages then that change is simple also.
> Don't need to do any chmod.

Chmod is easier to administor (the special permissions are obvious with
a standard tool called ls), and doesn't require giving random apps root
privs (good practice still dictates that database backends should not
have root). Capabilities would work, but have yet to catch on in any
real sense and are lacking in terms of useful tools in most distributions.

-ben
--
"Do you seek knowledge in time travel?"

2002-11-14 22:05:25

by Rohit Seth

[permalink] [raw]
Subject: RE: [patch] remove hugetlb syscalls



> -----Original Message-----
> From: William Lee Irwin III [mailto:[email protected]]
> >
> On Thu, Nov 14, 2002 at 12:11:32PM -0800, Rohit Seth wrote:
> > This is not the problem with MAP_SHARED. It is the lack of (arch
> > specific) hugepage aligned function support in the kernel.
> You can use
> > the mmap on hugetlbfs using only MAP_FIXED with properly aligned
> > addresses (but then this also is only a hint to kernel).
> With addr ==
> > NULL in mmap, the function is bound to fail almost all the times.
>
> There's very little standing in the way of automatic
> placement. If in your opinion it should be implemented, I'll
> add that feature today.
>
mmap with addr==NULL is in my opinion a very useful thing.

> IIRC you mentioned you would like to export the arch-specific
> hugepage-aligned vma placement functions; once these are
> available, it should be trivial to reuse them.
>
>
I will export those functions from arch specific trees.

2002-11-14 22:08:10

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 12:30:35PM -0800, William Lee Irwin III wrote:
> The main reason I haven't considered doing this is because they already
> got in and there appears to be a user (Oracle/IA64).

Not in shipping code. Certainly no vendor kernels that I am aware of
have shipped these syscalls yet either, as nearly all of the developers
find them revolting. Not to mention that the code cleanups and bugfixes
are still ongoing.

-ben

2002-11-14 22:09:33

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 11:51:46AM -0800, Rohit Seth wrote:
> [-- text/html is unsupported (use 'v' to view this part) --]

Sorry, but your html-only email got trapped by the spam filters
of linux-mm and linux-kernel.

-ben
--
"Do you seek knowledge in time travel?"

2002-11-14 22:14:08

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 12:30:35PM -0800, William Lee Irwin III wrote:
> The main reason I haven't considered doing this is because they already
> got in and there appears to be a user (Oracle/IA64).

Not in shipping code. Certainly no vendor kernels that I am aware of
have shipped these syscalls yet either, as nearly all of the developers
find them revolting. Not to mention that the code cleanups and bugfixes
are still ongoing.

-ben
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to [email protected]. For more info on Linux MM,
see: http://www.linux-mm.org/

2002-11-15 00:33:33

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 12:30:35PM -0800, William Lee Irwin III wrote:
>> The main reason I haven't considered doing this is because they already
>> got in and there appears to be a user (Oracle/IA64).

On Thu, Nov 14, 2002 at 03:48:09PM -0500, Benjamin LaHaise wrote:
> Not in shipping code. Certainly no vendor kernels that I am aware of
> have shipped these syscalls yet either, as nearly all of the developers
> find them revolting. Not to mention that the code cleanups and bugfixes
> are still ongoing.

This is a bit out of my hands; the support decision came from elsewhere.
I have to service my users first, and after that, I don't generally want
to stand in the way of others. In general it's good to have minimalistic
interfaces, but I'm not a party to the concerns regarding the syscalls.
My direct involvement there has been either of a kernel janitor nature,
helping to adapt it to Linux kernel idioms, or reusing code for hugetlbfs.

I guess the only real statement left to make is that hugetlbfs (or my
participation/implementation of it) was not originally intended to
compete with the syscalls, though there's a lot of obvious overlap
(which I tried to exploit by means of code reuse).

Bill

2002-11-15 00:34:41

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, 14 Nov 2002 12:34:11 -0800, William Lee Irwin III said:
William> (3) ->f_op->mmap() will hand -EINVAL back to userspace
William> instead of automatically placing the vma, for explicit and
William> 0 start adresses

On Thu, Nov 14, 2002 at 01:31:21PM -0800, David Mosberger-Tang wrote:
> This sounds like a receipe for creating unportable programs because
> the alignment constraints will be different from one platform to
> another. (Adding gethugepagesize() in libc would alleviate the
> problem but it wouldn't solve it completely.)
> Overall, it does sound to me that the hugetlbfs is so specialized that
> it would be much cleaner to provide a separate interface at the
> user-level. That would leave more flexibility should the
> implementation change over time (which it well might).
> --david

Okay, that's a serious problem. But it's easy to fix; I'll just call
the hugepage vma placement functions that are also used by the syscalls.


Bill

2002-11-15 00:33:31

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 04:11:34PM -0500, Benjamin LaHaise wrote:
> Oracle does not run as root, so they can't even use the syscalls
> directly. At least with hugetlbfs we can chmod the filesystem to be
> owned by the oracle user.

Okay, the advantage with respect to permissions is clear; now there is
a correction to the permissions checking I should do, as CAP_IPC_LOCK
is currently checked in ->f_ops->mmap(), but the permissions are
enforcible by means of ordinary vfs permissions, and so it's redundant.


Bill

2002-11-16 17:53:44

by Rik van Riel

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, 14 Nov 2002, Jeff Garzik wrote:

> [tangent warning]
> This whole hugetlb affair is unfortunately pretty ugly, and this thread
> is just one component of that. All these discussions occurred off-list,
> and it's _still_ a political football.

This always happens when the discussions occurred in private and
99% of the community didn't see them. No need to be surprised
that people are asking the questions others have answered before
in a place nobody could see ;)

Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://guru.conectiva.com/
Current spamtrap: <a href=mailto:"[email protected]">[email protected]</a>

2002-11-16 18:08:19

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls


On Sat, 16 Nov 2002, Rik van Riel wrote:
>
> This always happens when the discussions occurred in private and
> 99% of the community didn't see them. No need to be surprised
> that people are asking the questions others have answered before
> in a place nobody could see ;)

Part of the problem is that the companies involved (mainly Intel and
Oracle) both have a hard time just talking about their stuff in public on
linux-kernel. It looks like Intel is getting a lot better at it, it's
been my one major message to the people there I know. Oracle engineers
still don't seem to talk much publicly about their issues and trials,
though.

(Part of that may actually be due to stupid benchmark rules. They aren't
even allowed to publicize their basic benchmark numbers while doing
development, even though those numbers are the primary thing they care
about. Frigging stupid rules, entirely designed for closed development.)

So not only did you have a feature that is mostly useful only to a
smallish group of people - you had that group of people not used to open
communication in the first place, AND you had rules that made some of the
important part of the communication illegal in the first place.

Still wonder why it wasn't widely discussed during development? Intel
engineers would bvasically take people asdie in private at conferences
talking about what kinds of improvments Oracle was seeing..

Oh, well.

Linus

2002-11-16 18:16:36

by Jan Niehusmann

[permalink] [raw]
Subject: Re: [patch] remove hugetlb syscalls

On Thu, Nov 14, 2002 at 08:14:04PM +0000, Alan Cox wrote:
> On Thu, 2002-11-14 at 19:52, Alan Cox wrote:
> > On Thu, 2002-11-14 at 18:53, David Mosberger-Tang wrote:
> > > But that's excactly the point. The hugepage interface returns a
> > > different kind of virtual memory. There are tons of programs out
> > > there using mmap(). If such a program gets fed a path to the
> > > hugepagefs, it might end up with huge pages without knowing anything
> > > about huge pages. For the most part, that might work fine, but it
> > > could lead to subtle failures.
> >
> > Your argument makes sense. You are arguing
> Makes no sense rather 8)

Sorry, I didn't follow the whole discussion, so my argument may make no
sense at all, too. :-)

If I understand David correctly, he doesn't worry about programs that
want to use huge pages, but about programs just using mmap just to
access some file. If they get called with a file that behaves subtly
different than usual files, that may lead to failures or even security
holes.

But that's no argument to keep the syscalls, if you decide to implement
hugepagefs. The existance of the syscalls doesn't prevent bugs created
or triggered by hugepagefs.

Jan