The page tables for hugetlb mappings are handled differently than page tables
for normal pages. Rather than integrating multiple page size support into the
main VM (which would tremendously complicate the code) some hooks were created.
This allows hugetlb special cases to be handled "out of line" by a separate
interface.
Hugetlbfs was the huge page interface chosen. At the time, large database
users were the only big users of huge pages and the hugetlbfs design meets
their needs pretty well. Over time, hugetlbfs has been expanded to enable new
uses of huge page memory with varied results. As features are added, the
semantics become a permanent part of the Linux API. This makes maintenance of
hugetlbfs an increasingly difficult task and inhibits the addition of features
and functionality in support of ever-changing hardware.
To remedy the situation, I propose an API (currently called
pagetable_operations). All of the current hugetlbfs-specific hooks are moved
into an operations struct that is attached to VMAs. The end result is a more
explicit and IMO a cleaner interface between hugetlbfs and the core VM. We are
then free to add other hugetlb interfaces (such as a /dev/zero-styled character
device) that can operate either in concert with or independent of hugetlbfs.
There should be no measurable performance impact for normal page users (we're
checking if pagetable_ops != NULL instead of checking for vm_flags &
VM_HUGETLB). Of course we do increase the VMA size by one pointer. For huge
pages, there is an added indirection for pt_op() calls. This patch series does
not change the logic of the the hugetlbfs operations, just moves them into the
pagetable_operations struct.
Comments? Do you think it's as good of an idea as I do?
Signed-off-by: Adam Litke <[email protected]>
---
include/linux/mm.h | 25 +++++++++++++++++++++++++
1 files changed, 25 insertions(+), 0 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2d2c08d..a2fa66d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -98,6 +98,7 @@ struct vm_area_struct {
/* Function pointers to deal with this struct. */
struct vm_operations_struct * vm_ops;
+ struct pagetable_operations_struct * pagetable_ops;
/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
@@ -218,6 +219,30 @@ struct vm_operations_struct {
};
struct mmu_gather;
+
+struct pagetable_operations_struct {
+ int (*fault)(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, int write_access);
+ int (*copy_vma)(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma);
+ int (*pin_pages)(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, int *length, int i);
+ void (*change_protection)(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end, pgprot_t newprot);
+ unsigned long (*unmap_page_range)(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end, long *zap_work);
+ void (*free_pgtable_range)(struct mmu_gather **tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling);
+};
+
+#define has_pt_op(vma, op) \
+ ((vma)->pagetable_ops && (vma)->pagetable_ops->op)
+#define pt_op(vma, call) \
+ ((vma)->pagetable_ops->call)
+
struct inode;
#define page_private(page) ((page)->private)
Signed-off-by: Adam Litke <[email protected]>
---
fs/hugetlbfs/inode.c | 6 ++++++
mm/memory.c | 4 ++--
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4f4cd13..c0a7984 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -36,6 +36,7 @@
static struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
+static struct pagetable_operations_struct hugetlbfs_pagetable_ops;
static struct inode_operations hugetlbfs_dir_inode_operations;
static struct inode_operations hugetlbfs_inode_operations;
@@ -70,6 +71,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
*/
vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
vma->vm_ops = &hugetlb_vm_ops;
+ vma->pagetable_ops = &hugetlbfs_pagetable_ops;
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
@@ -560,6 +562,10 @@ const struct file_operations hugetlbfs_file_operations = {
.get_unmapped_area = hugetlb_get_unmapped_area,
};
+static struct pagetable_operations_struct hugetlbfs_pagetable_ops = {
+ .copy_vma = copy_hugetlb_page_range,
+};
+
static struct inode_operations hugetlbfs_dir_inode_operations = {
.create = hugetlbfs_create,
.lookup = simple_lookup,
diff --git a/mm/memory.c b/mm/memory.c
index ef09f0a..80eafd5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -602,8 +602,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return 0;
}
- if (is_vm_hugetlb_page(vma))
- return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ if (has_pt_op(vma, copy_vma))
+ return pt_op(vma, copy_vma)(dst_mm, src_mm, vma);
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
Signed-off-by: Adam Litke <[email protected]>
---
fs/hugetlbfs/inode.c | 1 +
mm/memory.c | 6 +++---
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c0a7984..2d1dd84 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -564,6 +564,7 @@ const struct file_operations hugetlbfs_file_operations = {
static struct pagetable_operations_struct hugetlbfs_pagetable_ops = {
.copy_vma = copy_hugetlb_page_range,
+ .pin_pages = follow_hugetlb_page,
};
static struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/mm/memory.c b/mm/memory.c
index 80eafd5..9467c65 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1039,9 +1039,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|| !(vm_flags & vma->vm_flags))
return i ? : -EFAULT;
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &len, i);
+ if (has_pt_op(vma, pin_pages)) {
+ i = pt_op(vma, pin_pages)(mm, vma, pages,
+ vmas, &start, &len, i);
continue;
}
Signed-off-by: Adam Litke <[email protected]>
---
fs/hugetlbfs/inode.c | 3 ++-
include/linux/hugetlb.h | 4 ++--
mm/hugetlb.c | 12 ++++++++----
mm/memory.c | 10 ++++------
4 files changed, 16 insertions(+), 13 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2d1dd84..146a4b7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -289,7 +289,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
v_offset = 0;
__unmap_hugepage_range(vma,
- vma->vm_start + v_offset, vma->vm_end);
+ vma->vm_start + v_offset, vma->vm_end, 0);
}
}
@@ -565,6 +565,7 @@ const struct file_operations hugetlbfs_file_operations = {
static struct pagetable_operations_struct hugetlbfs_pagetable_ops = {
.copy_vma = copy_hugetlb_page_range,
.pin_pages = follow_hugetlb_page,
+ .unmap_page_range = unmap_hugepage_range,
};
static struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a60995a..add92b3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -16,8 +16,8 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
-void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
-void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
+unsigned long unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, long *);
+void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, long *);
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
int hugetlb_report_meminfo(char *);
int hugetlb_report_node_meminfo(int, char *);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cb362f7..3bcc0db 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -356,7 +356,7 @@ nomem:
}
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
+ unsigned long end, long *zap_work)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -397,10 +397,13 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
list_del(&page->lru);
put_page(page);
}
+
+ if (zap_work)
+ *zap_work -= (end - start) / (HPAGE_SIZE / PAGE_SIZE);
}
-void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
+unsigned long unmap_hugepage_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, long *zap_work)
{
/*
* It is undesirable to test vma->vm_file as it should be non-null
@@ -412,9 +415,10 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
*/
if (vma->vm_file) {
spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
- __unmap_hugepage_range(vma, start, end);
+ __unmap_hugepage_range(vma, start, end, zap_work);
spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
}
+ return end;
}
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 9467c65..aa6b06e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -839,12 +839,10 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
tlb_start_valid = 1;
}
- if (unlikely(is_vm_hugetlb_page(vma))) {
- unmap_hugepage_range(vma, start, end);
- zap_work -= (end - start) /
- (HPAGE_SIZE / PAGE_SIZE);
- start = end;
- } else
+ if (unlikely(has_pt_op(vma, unmap_page_range)))
+ start = pt_op(vma, unmap_page_range)
+ (vma, start, end, &zap_work);
+ else
start = unmap_page_range(*tlbp, vma,
start, end, &zap_work, details);
Signed-off-by: Adam Litke <[email protected]>
---
fs/hugetlbfs/inode.c | 1 +
mm/hugetlb.c | 4 +++-
mm/memory.c | 4 ++--
3 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3461f9b..1de73c1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -568,6 +568,7 @@ static struct pagetable_operations_struct hugetlbfs_pagetable_ops = {
.unmap_page_range = unmap_hugepage_range,
.change_protection = hugetlb_change_protection,
.free_pgtable_range = hugetlb_free_pgd_range,
+ .fault = hugetlb_fault,
};
static struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3bcc0db..63de369 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -588,6 +588,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vaddr = *position;
int remainder = *length;
+ BUG_ON(!has_pt_op(vma, fault));
+
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
@@ -604,7 +606,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
int ret;
spin_unlock(&mm->page_table_lock);
- ret = hugetlb_fault(mm, vma, vaddr, 0);
+ ret = pt_op(vma, fault)(mm, vma, vaddr, 0);
spin_lock(&mm->page_table_lock);
if (ret == VM_FAULT_MINOR)
continue;
diff --git a/mm/memory.c b/mm/memory.c
index 442e6b2..c8e17b4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2455,8 +2455,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(PGFAULT);
- if (unlikely(is_vm_hugetlb_page(vma)))
- return hugetlb_fault(mm, vma, address, write_access);
+ if (unlikely(has_pt_op(vma, fault)))
+ return pt_op(vma, fault)(mm, vma, address, write_access);
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
Signed-off-by: Adam Litke <[email protected]>
---
fs/hugetlbfs/inode.c | 1 +
mm/mprotect.c | 5 +++--
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 146a4b7..1016694 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -566,6 +566,7 @@ static struct pagetable_operations_struct hugetlbfs_pagetable_ops = {
.copy_vma = copy_hugetlb_page_range,
.pin_pages = follow_hugetlb_page,
.unmap_page_range = unmap_hugepage_range,
+ .change_protection = hugetlb_change_protection,
};
static struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 3b8f3c0..172e204 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -201,8 +201,9 @@ success:
dirty_accountable = 1;
}
- if (is_vm_hugetlb_page(vma))
- hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
+ if (has_pt_op(vma, change_protection))
+ pt_op(vma, change_protection)(vma, start, end,
+ vma->vm_page_prot);
else
change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
Signed-off-by: Adam Litke <[email protected]>
---
fs/hugetlbfs/inode.c | 1 +
mm/memory.c | 6 +++---
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1016694..3461f9b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -567,6 +567,7 @@ static struct pagetable_operations_struct hugetlbfs_pagetable_ops = {
.pin_pages = follow_hugetlb_page,
.unmap_page_range = unmap_hugepage_range,
.change_protection = hugetlb_change_protection,
+ .free_pgtable_range = hugetlb_free_pgd_range,
};
static struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/mm/memory.c b/mm/memory.c
index aa6b06e..442e6b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -279,15 +279,15 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
anon_vma_unlink(vma);
unlink_file_vma(vma);
- if (is_vm_hugetlb_page(vma)) {
- hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
+ if (has_pt_op(vma, free_pgtable_range)) {
+ pt_op(vma, free_pgtable_range)(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
} else {
/*
* Optimization: gather nearby vmas into one call down
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_vm_hugetlb_page(next)) {
+ && !has_pt_op(next, free_pgtable_range)) {
vma = next;
next = vma->vm_next;
anon_vma_unlink(vma);
On Mon, 2007-02-19 at 10:31 -0800, Adam Litke wrote:
> Signed-off-by: Adam Litke <[email protected]>
> ---
>
> include/linux/mm.h | 25 +++++++++++++++++++++++++
> 1 files changed, 25 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2d2c08d..a2fa66d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -98,6 +98,7 @@ struct vm_area_struct {
>
> /* Function pointers to deal with this struct. */
> struct vm_operations_struct * vm_ops;
> + struct pagetable_operations_struct * pagetable_ops;
>
please make it at least const, those things have no business ever being
written to right? And by making them const the compiler helps catch
that, and as bonus the data gets moved to rodata so that it won't share
cachelines with anything that gets dirty
On Mon, 2007-02-19 at 10:31 -0800, Adam Litke wrote:
> The page tables for hugetlb mappings are handled differently than page tables
> for normal pages. Rather than integrating multiple page size support into the
> main VM (which would tremendously complicate the code) some hooks were created.
> This allows hugetlb special cases to be handled "out of line" by a separate
> interface.
ok it makes sense to clean this up.. what I don't like is that there
STILL are all the double cases... for this to work and be worth it both
the common case and the hugetlb case should be using the ops structure
always! Anything else and you're just replacing bad code with bad
code ;(
On Mon, 2007-02-19 at 19:41 +0100, Arjan van de Ven wrote:
> On Mon, 2007-02-19 at 10:31 -0800, Adam Litke wrote:
> > Signed-off-by: Adam Litke <[email protected]>
> > ---
> >
> > include/linux/mm.h | 25 +++++++++++++++++++++++++
> > 1 files changed, 25 insertions(+), 0 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 2d2c08d..a2fa66d 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -98,6 +98,7 @@ struct vm_area_struct {
> >
> > /* Function pointers to deal with this struct. */
> > struct vm_operations_struct * vm_ops;
> > + struct pagetable_operations_struct * pagetable_ops;
> >
>
> please make it at least const, those things have no business ever being
> written to right? And by making them const the compiler helps catch
> that, and as bonus the data gets moved to rodata so that it won't share
> cachelines with anything that gets dirty
Yep I agree. Changed.
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
On Mon, 2007-02-19 at 19:43 +0100, Arjan van de Ven wrote:
> On Mon, 2007-02-19 at 10:31 -0800, Adam Litke wrote:
> > The page tables for hugetlb mappings are handled differently than page tables
> > for normal pages. Rather than integrating multiple page size support into the
> > main VM (which would tremendously complicate the code) some hooks were created.
> > This allows hugetlb special cases to be handled "out of line" by a separate
> > interface.
>
> ok it makes sense to clean this up.. what I don't like is that there
> STILL are all the double cases... for this to work and be worth it both
> the common case and the hugetlb case should be using the ops structure
> always! Anything else and you're just replacing bad code with bad
> code ;(
Hmm. Do you think everyone would support an extra pointer indirection
for every handle_pte_fault() call? If not, then I definitely wouldn't
mind creating a default_pagetable_ops and calling into that.
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
On Mon, Feb 19, 2007 at 10:31:34AM -0800, Adam Litke wrote:
> +struct pagetable_operations_struct {
> + int (*fault)(struct mm_struct *mm,
> + struct vm_area_struct *vma,
> + unsigned long address, int write_access);
> + int (*copy_vma)(struct mm_struct *dst, struct mm_struct *src,
> + struct vm_area_struct *vma);
> + int (*pin_pages)(struct mm_struct *mm, struct vm_area_struct *vma,
> + struct page **pages, struct vm_area_struct **vmas,
> + unsigned long *position, int *length, int i);
> + void (*change_protection)(struct vm_area_struct *vma,
> + unsigned long address, unsigned long end, pgprot_t newprot);
> + unsigned long (*unmap_page_range)(struct vm_area_struct *vma,
> + unsigned long address, unsigned long end, long *zap_work);
> + void (*free_pgtable_range)(struct mmu_gather **tlb,
> + unsigned long addr, unsigned long end,
> + unsigned long floor, unsigned long ceiling);
> +};
I very very strongly approve of the approach this operations structure
entails.
-- wli
On Mon, 2007-02-19 at 13:34 -0600, Adam Litke wrote:
> On Mon, 2007-02-19 at 19:43 +0100, Arjan van de Ven wrote:
> > On Mon, 2007-02-19 at 10:31 -0800, Adam Litke wrote:
> > > The page tables for hugetlb mappings are handled differently than page tables
> > > for normal pages. Rather than integrating multiple page size support into the
> > > main VM (which would tremendously complicate the code) some hooks were created.
> > > This allows hugetlb special cases to be handled "out of line" by a separate
> > > interface.
> >
> > ok it makes sense to clean this up.. what I don't like is that there
> > STILL are all the double cases... for this to work and be worth it both
> > the common case and the hugetlb case should be using the ops structure
> > always! Anything else and you're just replacing bad code with bad
> > code ;(
>
> Hmm. Do you think everyone would support an extra pointer indirection
> for every handle_pte_fault() call?
maybe. I'm not entirely convinced... (I like the cleanup potential a lot
code wise.. but if it costs performance, then... well I'd hate to see
linux get slower for hugetlbfs)
> If not, then I definitely wouldn't
> mind creating a default_pagetable_ops and calling into that.
... but without it to be honest, your patch adds nothing real.. there's
ONE user of your code, and there's no real cleanup unless you get rid of
all the special casing.... since the special casing is the really ugly
part of hugetlbfs, not the actual code inside the special case..
--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org
On Mon, Feb 19, 2007 at 10:31:34AM -0800, Adam Litke wrote:
> Signed-off-by: Adam Litke <[email protected]>
> ---
>
> include/linux/mm.h | 25 +++++++++++++++++++++++++
> 1 files changed, 25 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2d2c08d..a2fa66d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -98,6 +98,7 @@ struct vm_area_struct {
>
> /* Function pointers to deal with this struct. */
> struct vm_operations_struct * vm_ops;
> + struct pagetable_operations_struct * pagetable_ops;
>
> /* Information about our backing store: */
> unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
> @@ -218,6 +219,30 @@ struct vm_operations_struct {
> };
>
> struct mmu_gather;
> +
> +struct pagetable_operations_struct {
> + int (*fault)(struct mm_struct *mm,
> + struct vm_area_struct *vma,
> + unsigned long address, int write_access);
> + int (*copy_vma)(struct mm_struct *dst, struct mm_struct *src,
> + struct vm_area_struct *vma);
> + int (*pin_pages)(struct mm_struct *mm, struct vm_area_struct *vma,
> + struct page **pages, struct vm_area_struct **vmas,
> + unsigned long *position, int *length, int i);
> + void (*change_protection)(struct vm_area_struct *vma,
> + unsigned long address, unsigned long end, pgprot_t newprot);
> + unsigned long (*unmap_page_range)(struct vm_area_struct *vma,
> + unsigned long address, unsigned long end, long *zap_work);
> + void (*free_pgtable_range)(struct mmu_gather **tlb,
> + unsigned long addr, unsigned long end,
> + unsigned long floor, unsigned long ceiling);
> +};
I don't think adding another operation vector is a good idea. But I'd
rather extend the vma operations vector to deal with all nessecary
buts ubstead if addubg a second one.
On (19/02/07 22:29), Christoph Hellwig didst pronounce:
> On Mon, Feb 19, 2007 at 10:31:34AM -0800, Adam Litke wrote:
> > Signed-off-by: Adam Litke <[email protected]>
> > ---
> >
> > include/linux/mm.h | 25 +++++++++++++++++++++++++
> > 1 files changed, 25 insertions(+), 0 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 2d2c08d..a2fa66d 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -98,6 +98,7 @@ struct vm_area_struct {
> >
> > /* Function pointers to deal with this struct. */
> > struct vm_operations_struct * vm_ops;
> > + struct pagetable_operations_struct * pagetable_ops;
> >
> > /* Information about our backing store: */
> > unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
> > @@ -218,6 +219,30 @@ struct vm_operations_struct {
> > };
> >
> > struct mmu_gather;
> > +
> > +struct pagetable_operations_struct {
> > + int (*fault)(struct mm_struct *mm,
> > + struct vm_area_struct *vma,
> > + unsigned long address, int write_access);
> > + int (*copy_vma)(struct mm_struct *dst, struct mm_struct *src,
> > + struct vm_area_struct *vma);
> > + int (*pin_pages)(struct mm_struct *mm, struct vm_area_struct *vma,
> > + struct page **pages, struct vm_area_struct **vmas,
> > + unsigned long *position, int *length, int i);
> > + void (*change_protection)(struct vm_area_struct *vma,
> > + unsigned long address, unsigned long end, pgprot_t newprot);
> > + unsigned long (*unmap_page_range)(struct vm_area_struct *vma,
> > + unsigned long address, unsigned long end, long *zap_work);
> > + void (*free_pgtable_range)(struct mmu_gather **tlb,
> > + unsigned long addr, unsigned long end,
> > + unsigned long floor, unsigned long ceiling);
> > +};
>
> I don't think adding another operation vector is a good idea. But I'd
> rather extend the vma operations vector to deal with all nessecary
> buts ubstead if addubg a second one.
Well, there are a lot of users of vm_operations_struct that have no interest in
the operations in pagetable_operations_struct. Expanding vm_operations_struct
would increase the size of all VMAs by more than is necessary.
Also, having the pagetable ops in vm_operations_struct might lead device
drivers to believe they should be doing something entertaining there. In
reality, we would only want drivers playing with pagetable_operations when
they really know what they are doing and why. Having the pagetable_ops
set is similar to VM_HUGETLB set as a strong sign that something unusual is
going on that is fairly easy to check for.
I prefer the additional struct to extending VMAs anyway.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Mon, 2007-02-19 at 19:43 +0100, Arjan van de Ven wrote:
> On Mon, 2007-02-19 at 10:31 -0800, Adam Litke wrote:
> > The page tables for hugetlb mappings are handled differently than page tables
> > for normal pages. Rather than integrating multiple page size support into the
> > main VM (which would tremendously complicate the code) some hooks were created.
> > This allows hugetlb special cases to be handled "out of line" by a separate
> > interface.
>
> ok it makes sense to clean this up.. what I don't like is that there
> STILL are all the double cases... for this to work and be worth it both
> the common case and the hugetlb case should be using the ops structure
> always! Anything else and you're just replacing bad code with bad
> code ;(
I don't fully agree. I think it makes sense to have the "special" case
be a function pointer and the "normal" case stay where it is for
performances. You don't want to pay the cost of the function pointer
call in the normal case do you ?
Ben.
> maybe. I'm not entirely convinced... (I like the cleanup potential a lot
> code wise.. but if it costs performance, then... well I'd hate to see
> linux get slower for hugetlbfs)
>
> > If not, then I definitely wouldn't
> > mind creating a default_pagetable_ops and calling into that.
>
> ... but without it to be honest, your patch adds nothing real.. there's
> ONE user of your code, and there's no real cleanup unless you get rid of
> all the special casing.... since the special casing is the really ugly
> part of hugetlbfs, not the actual code inside the special case..
Well... I disagree there too :-)
I've been working recently for example on some spufs improvements that
require similar tweaking of the user address space as hugetlbfs. The
problem I have is that while there are hooks in the generic code pretty
much everywhere I need.... they are all hugetlb specific, that is they
call directly into the hugetlb code.
For now, I found ways of doing my stuff without hooking all over the
page table operations (well, I had no real choices) but I can imagine it
making sense to allow something (hugetlb being one of them) to take over
part of the user address space.
Ben.