2008-11-12 23:23:20

by Pallipadi, Venkatesh

Subject: [patch 5/8] x86 PAT: Implement track/untrack of pfnmap regions for x86

Hook up remap_pfn_range and vm_insert_pfn, and the corresponding copy and free
routines, with reserve and free tracking.

The reserve and free here only take care of non-RAM region mappings. For RAM
regions, the driver should use set_memory_[uc|wc|wb] to set the cache type and
then set up the user pte mapping itself; the reserve/free below is bypassed in
that case.
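
For example, a driver mapping RAM pages would do something along these lines
(illustrative sketch only, not part of this patch; page, nr_pages, vma and
vaddr stand in for the driver's own state, and error handling is omitted):

	/* RAM case: set the cache attribute on the kernel mapping first */
	set_memory_wc((unsigned long)page_address(page), nr_pages);

	/* then install the user pte; reserve_pfn_range() is not involved */
	err = vm_insert_pfn(vma, vaddr, page_to_pfn(page));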

Signed-off-by: Venkatesh Pallipadi <[email protected]>
Signed-off-by: Suresh Siddha <[email protected]>

---
arch/x86/include/asm/pgtable.h | 10 ++
arch/x86/mm/pat.c | 201 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 211 insertions(+)

Index: tip/arch/x86/mm/pat.c
===================================================================
--- tip.orig/arch/x86/mm/pat.c 2008-11-12 09:55:55.000000000 -0800
+++ tip/arch/x86/mm/pat.c 2008-11-12 12:06:50.000000000 -0800
@@ -596,6 +596,207 @@ void unmap_devmem(unsigned long pfn, uns
free_memtype(addr, addr + size);
}

+int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot)
+{
+ unsigned long flags;
+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
+
+ int is_ram = 0;
+ int id_sz, ret;
+
+ is_ram = pagerange_is_ram(paddr, paddr + size);
+
+ if (is_ram != 0) {
+ /*
+ * For mapping RAM pages, drivers need to call
+ * set_memory_[uc|wc|wb] directly, for reserve and free, before
+ * setting up the PTE.
+ */
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
+ if (ret)
+ return ret;
+
+ if (flags != want_flags) {
+ free_memtype(paddr, paddr + size);
+ printk(KERN_INFO
+ "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ paddr, (unsigned long long)(paddr + size),
+ cattr_name(flags));
+ return -EINVAL;
+ }
+
+ /* Need to keep identity mapping in sync */
+ if (paddr >= __pa(high_memory))
+ return 0;
+
+ id_sz = (__pa(high_memory) < paddr + size) ?
+ __pa(high_memory) - paddr :
+ size;
+
+ if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
+ free_memtype(paddr, paddr + size);
+ printk(KERN_INFO
+ "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
+ "for %Lx-%Lx\n",
+ current->comm, current->pid,
+ cattr_name(flags),
+ paddr, (unsigned long long)(paddr + size));
+ return -EINVAL;
+ }
+ return 0;
+}
+
+void free_pfn_range(u64 paddr, unsigned long size)
+{
+ int is_ram;
+
+ is_ram = pagerange_is_ram(paddr, paddr + size);
+ if (is_ram == 0)
+ free_memtype(paddr, paddr + size);
+}
+
+int track_pfn_vma_copy(struct vm_area_struct *vma)
+{
+ int retval = 0;
+ unsigned long i, j;
+ u64 paddr;
+ pgprot_t prot;
+ pte_t pte;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+
+ if (!pat_enabled)
+ return 0;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /*
+ * Reserve the whole chunk starting from vm_pgoff,
+ * but we have to get the protection from the pte.
+ */
+ if (follow_pfnmap_pte(vma, vma_start, &pte)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ prot = pte_pgprot(pte);
+ paddr = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ return reserve_pfn_range(paddr, vma_size, prot);
+ }
+
+ /* reserve entire vma page by page, using pfn and prot from pte */
+ for (i = 0; i < vma_size; i += PAGE_SIZE) {
+ if (follow_pfnmap_pte(vma, vma_start + i, &pte))
+ continue;
+
+ paddr = pte_pa(pte);
+ prot = pte_pgprot(pte);
+ retval = reserve_pfn_range(paddr, PAGE_SIZE, prot);
+ if (retval)
+ goto cleanup_ret;
+ }
+ return 0;
+
+cleanup_ret:
+ /* Reserve error: Cleanup partial reservation and return error */
+ for (j = 0; j < i; j += PAGE_SIZE) {
+ if (follow_pfnmap_pte(vma, vma_start + j, &pte))
+ continue;
+
+ paddr = pte_pa(pte);
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+
+ return retval;
+}
+
+int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
+ unsigned long pfn, unsigned long size)
+{
+ int retval = 0;
+ unsigned long i, j;
+ u64 base_paddr;
+ u64 paddr;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+
+ if (!pat_enabled)
+ return 0;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /* reserve the whole chunk starting from vm_pgoff */
+ paddr = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ return reserve_pfn_range(paddr, vma_size, prot);
+ }
+
+ /* reserve page by page using pfn and size */
+ base_paddr = (u64)pfn << PAGE_SHIFT;
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ paddr = base_paddr + i;
+ retval = reserve_pfn_range(paddr, PAGE_SIZE, prot);
+ if (retval)
+ goto cleanup_ret;
+ }
+ return 0;
+
+cleanup_ret:
+ /* Reserve error: Cleanup partial reservation and return error */
+ for (j = 0; j < i; j += PAGE_SIZE) {
+ paddr = base_paddr + j;
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+
+ return retval;
+}
+
+void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size)
+{
+ unsigned long i;
+ u64 paddr;
+ unsigned long vma_start = vma->vm_start;
+ unsigned long vma_end = vma->vm_end;
+ unsigned long vma_size = vma_end - vma_start;
+
+ if (!pat_enabled)
+ return;
+
+ if (is_linear_pfn_mapping(vma)) {
+ /* free the whole chunk starting from vm_pgoff */
+ paddr = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ free_pfn_range(paddr, vma_size);
+ return;
+ }
+
+ if (size != 0 && size != vma_size) {
+ /* free page by page, using pfn and size */
+
+ paddr = (u64)pfn << PAGE_SHIFT;
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ /* offset from the base pfn; do not accumulate into paddr */
+ free_pfn_range(paddr + i, PAGE_SIZE);
+ }
+ } else {
+ /* free entire vma, page by page, using the pfn from pte */
+
+ for (i = 0; i < vma_size; i += PAGE_SIZE) {
+ pte_t pte;
+
+ if (follow_pfnmap_pte(vma, vma_start + i, &pte))
+ continue;
+
+ paddr = pte_pa(pte);
+ free_pfn_range(paddr, PAGE_SIZE);
+ }
+ }
+}
+
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/* get Nth element of the linked list */
Index: tip/arch/x86/include/asm/pgtable.h
===================================================================
--- tip.orig/arch/x86/include/asm/pgtable.h 2008-11-12 09:55:55.000000000 -0800
+++ tip/arch/x86/include/asm/pgtable.h 2008-11-12 12:03:43.000000000 -0800
@@ -219,6 +219,11 @@ static inline unsigned long pte_pfn(pte_
return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
}

+static inline u64 pte_pa(pte_t pte)
+{
+ return pte_val(pte) & PTE_PFN_MASK;
+}
+
#define pte_page(pte) pfn_to_page(pte_pfn(pte))

static inline int pmd_large(pmd_t pte)
@@ -328,6 +333,11 @@ static inline pgprot_t pgprot_modify(pgp

#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)

+/* Indicate that x86 has its own track and untrack pfn vma functions */
+#define track_pfn_vma_new track_pfn_vma_new
+#define track_pfn_vma_copy track_pfn_vma_copy
+#define untrack_pfn_vma untrack_pfn_vma
+
#ifndef __ASSEMBLY__
#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;

--


2008-12-16 20:09:16

by Andrew Morton

Subject: Re: [patch 5/8] x86 PAT: Implement track/untrack of pfnmap regions for x86

On Wed, 12 Nov 2008 13:26:52 -0800
Venkatesh Pallipadi <[email protected]> wrote:

> Hook up remap_pfn_range and vm_insert_pfn, and the corresponding copy and free
> routines, with reserve and free tracking.
>
> The reserve and free here only take care of non-RAM region mappings. For RAM
> regions, the driver should use set_memory_[uc|wc|wb] to set the cache type and
> then set up the user pte mapping itself; the reserve/free below is bypassed in
> that case.
>
> Signed-off-by: Venkatesh Pallipadi <[email protected]>
> Signed-off-by: Suresh Siddha <[email protected]>
>
> ---
> arch/x86/include/asm/pgtable.h | 10 ++
> arch/x86/mm/pat.c | 201 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 211 insertions(+)
>
> Index: tip/arch/x86/mm/pat.c
> ===================================================================
> --- tip.orig/arch/x86/mm/pat.c 2008-11-12 09:55:55.000000000 -0800
> +++ tip/arch/x86/mm/pat.c 2008-11-12 12:06:50.000000000 -0800
> @@ -596,6 +596,207 @@ void unmap_devmem(unsigned long pfn, uns
> free_memtype(addr, addr + size);
> }
>
> +int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot)

I don't think the kernel is improved by leaving all these things
undocumented.

These are interfaces which other architectures might choose to
implement, so a few words explaining why they exist, what they are
supposed to do, etc wouldn't hurt.

It would certainly help code reviewers.

> +{
> + unsigned long flags;
> + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
> +
> + int is_ram = 0;
> + int id_sz, ret;

The newline in the middle of the definition of locals makes no sense.

> + is_ram = pagerange_is_ram(paddr, paddr + size);
> +
> + if (is_ram != 0) {
> + /*
> + * For mapping RAM pages, drivers need to call
> + * set_memory_[uc|wc|wb] directly, for reserve and free, before
> + * setting up the PTE.
> + */
> + WARN_ON_ONCE(1);
> + return 0;
> + }
> +
> + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
> + if (ret)
> + return ret;
> +
> + if (flags != want_flags) {
> + free_memtype(paddr, paddr + size);
> + printk(KERN_INFO

Would KERN_ERR be more appropriate?

> + "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n",
> + current->comm, current->pid,

get_task_comm() is more reliable, but it hardly matters here.
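
(ie, something like

	char comm[TASK_COMM_LEN];

	get_task_comm(comm, current);

which takes task_lock() and so cannot race with prctl(PR_SET_NAME).)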

> + cattr_name(want_flags),
> + paddr, (unsigned long long)(paddr + size),

Printing the u64 paddr without a cast is OK in arch code (we know that
u64 is implemented as unsigned long long). But one might choose to
cast it anyway, to set a good example.

Or one could not bother. But this code does it both ways!
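
ie, either both operands get the cast:

	(unsigned long long)paddr, (unsigned long long)(paddr + size),

or neither does.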

> + cattr_name(flags));
> + return -EINVAL;
> + }
> +
> + /* Need to keep identity mapping in sync */
> + if (paddr >= __pa(high_memory))
> + return 0;
> +
> + id_sz = (__pa(high_memory) < paddr + size) ?
> + __pa(high_memory) - paddr :
> + size;
> +
> + if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
> + free_memtype(paddr, paddr + size);
> + printk(KERN_INFO

KERN_ERR?

> + "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
> + "for %Lx-%Lx\n",
> + current->comm, current->pid,
> + cattr_name(flags),
> + paddr, (unsigned long long)(paddr + size));

ditto

> + return -EINVAL;
> + }
> + return 0;
> +}
> +
> +void free_pfn_range(u64 paddr, unsigned long size)
> +{
> + int is_ram;
> +
> + is_ram = pagerange_is_ram(paddr, paddr + size);
> + if (is_ram == 0)
> + free_memtype(paddr, paddr + size);
> +}
>
> ...
>
> +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
> + unsigned long pfn, unsigned long size)
> +{
> + int retval = 0;
> + unsigned long i, j;
> + u64 base_paddr;
> + u64 paddr;
> + unsigned long vma_start = vma->vm_start;
> + unsigned long vma_end = vma->vm_end;
> + unsigned long vma_size = vma_end - vma_start;
> +
> + if (!pat_enabled)
> + return 0;
> +
> + if (is_linear_pfn_mapping(vma)) {
> + /* reserve the whole chunk starting from vm_pgoff */
> + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT;
> + return reserve_pfn_range(paddr, vma_size, prot);
> + }
> +
> + /* reserve page by page using pfn and size */
> + base_paddr = (u64)pfn << PAGE_SHIFT;
> + for (i = 0; i < size; i += PAGE_SIZE) {
> + paddr = base_paddr + i;
> + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot);
> + if (retval)
> + goto cleanup_ret;
> + }
> + return 0;
> +
> +cleanup_ret:
> + /* Reserve error: Cleanup partial reservation and return error */
> + for (j = 0; j < i; j += PAGE_SIZE) {
> + paddr = base_paddr + j;
> + free_pfn_range(paddr, PAGE_SIZE);

Could we simply do

free_pfn_range(base_paddr, i * PAGE_SIZE);

here?

If not, then perhaps add a helper function to do this, because that
loop gets repeated in several places.

> + }
> +
> + return retval;
> +}
> +
>
> ...
>

2008-12-16 23:19:33

by Pallipadi, Venkatesh

Subject: Re: [patch 5/8] x86 PAT: Implement track/untrack of pfnmap regions for x86

On Tue, Dec 16, 2008 at 12:07:59PM -0800, Andrew Morton wrote:
> Venkatesh Pallipadi <[email protected]> wrote:
>
> > 2 files changed, 211 insertions(+)
> >
> > Index: tip/arch/x86/mm/pat.c
> > ===================================================================
> > --- tip.orig/arch/x86/mm/pat.c 2008-11-12 09:55:55.000000000 -0800
> > +++ tip/arch/x86/mm/pat.c 2008-11-12 12:06:50.000000000 -0800
> > @@ -596,6 +596,207 @@ void unmap_devmem(unsigned long pfn, uns
> > free_memtype(addr, addr + size);
> > }
> >
> > +int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot)
>
> I don't think the kernel is improved by leaving all these things
> undocumented.
>
> These are interfaces which other architectures might choose to
> implement, so a few words explaining why they exist, what they are
> supposed to do, etc wouldn't hurt.
>
> It would certainly help code reviewers.

Agreed. Will add documentation to the new functions here and update the patch.
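
Roughly along these lines (exact wording to be refined in the refreshed
patch):

/*
 * reserve_pfn_range() - reserve a given pfn range with the memory type
 * requested in vma_prot, keeping the kernel identity mapping in sync.
 * Called when a driver wants to map the range to userspace through
 * remap_pfn_range()/vm_insert_pfn(). Returns 0 on success and -EINVAL
 * if the range cannot be reserved with the requested memory type.
 */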

>
> > +{
> > + unsigned long flags;
> > + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
> > +
> > + int is_ram = 0;
> > + int id_sz, ret;
>
> The newline in the middle of the definition of locals makes no sense.

Will change.

> > + is_ram = pagerange_is_ram(paddr, paddr + size);
> > +
> > + if (is_ram != 0) {
> > + /*
> > + * For mapping RAM pages, drivers need to call
> > + * set_memory_[uc|wc|wb] directly, for reserve and free, before
> > + * setting up the PTE.
> > + */
> > + WARN_ON_ONCE(1);
> > + return 0;
> > + }
> > +
> > + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
> > + if (ret)
> > + return ret;
> > +
> > + if (flags != want_flags) {
> > + free_memtype(paddr, paddr + size);
> > + printk(KERN_INFO
>
> Would KERN_ERR be more appropriate?

OK.

>
> > + "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n",
> > + current->comm, current->pid,
>
> get_task_comm() is more reliable, but it hardly matters here.

current->comm is used in other places in this file as well. I should probably
change all of them in a separate cleanup patch.

>
> > + cattr_name(want_flags),
> > + paddr, (unsigned long long)(paddr + size),
>
> Printing the u64 paddr without a cast is OK in arch code (we know that
> u64 is implemented as unsigned long long). But one might choose to
> cast it anyway, to set a good example.
>
> Or one could not bother. But this code does it both ways!
>

Will change.

> > + cattr_name(flags));
> > + return -EINVAL;
> > + }
> > +
> > + /* Need to keep identity mapping in sync */
> > + if (paddr >= __pa(high_memory))
> > + return 0;
> > +
> > + id_sz = (__pa(high_memory) < paddr + size) ?
> > + __pa(high_memory) - paddr :
> > + size;
> > +
> > + if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) {
> > + free_memtype(paddr, paddr + size);
> > + printk(KERN_INFO
>
> KERN_ERR?

OK.

>
> > + "%s:%d reserve_pfn_range ioremap_change_attr failed %s "
> > + "for %Lx-%Lx\n",
> > + current->comm, current->pid,
> > + cattr_name(flags),
> > + paddr, (unsigned long long)(paddr + size));
>
> ditto

Will do.

>
> > + return -EINVAL;
> > + }
> > + return 0;
> > +}
> > +
> > +void free_pfn_range(u64 paddr, unsigned long size)
> > +{
> > + int is_ram;
> > +
> > + is_ram = pagerange_is_ram(paddr, paddr + size);
> > + if (is_ram == 0)
> > + free_memtype(paddr, paddr + size);
> > +}
> >
> > ...
> >
> > +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
> > + unsigned long pfn, unsigned long size)
> > +{
> > + int retval = 0;
> > + unsigned long i, j;
> > + u64 base_paddr;
> > + u64 paddr;
> > + unsigned long vma_start = vma->vm_start;
> > + unsigned long vma_end = vma->vm_end;
> > + unsigned long vma_size = vma_end - vma_start;
> > +
> > + if (!pat_enabled)
> > + return 0;
> > +
> > + if (is_linear_pfn_mapping(vma)) {
> > + /* reserve the whole chunk starting from vm_pgoff */
> > + paddr = (u64)vma->vm_pgoff << PAGE_SHIFT;
> > + return reserve_pfn_range(paddr, vma_size, prot);
> > + }
> > +
> > + /* reserve page by page using pfn and size */
> > + base_paddr = (u64)pfn << PAGE_SHIFT;
> > + for (i = 0; i < size; i += PAGE_SIZE) {
> > + paddr = base_paddr + i;
> > + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot);
> > + if (retval)
> > + goto cleanup_ret;
> > + }
> > + return 0;
> > +
> > +cleanup_ret:
> > + /* Reserve error: Cleanup partial reservation and return error */
> > + for (j = 0; j < i; j += PAGE_SIZE) {
> > + paddr = base_paddr + j;
> > + free_pfn_range(paddr, PAGE_SIZE);
>
> Could we simply do
>
> free_pfn_range(base_paddr, i * PAGE_SIZE);
>
> here?
>
> If not, then perhaps add a helper function to do this, because that
> loop gets repeated in several places.
>

free_pfn_range(base_paddr, i * PAGE_SIZE);

will not work, as each free has to match the granularity of the corresponding
reserve. I agree that making this loop a helper function would be cleaner.
Will do that in the refresh of this patch.
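
Something like this, perhaps (untested sketch; the name is just a placeholder):

static void free_pfn_range_pages(u64 base_paddr, unsigned long size)
{
	unsigned long i;

	/* free in the same PAGE_SIZE chunks the reserve used */
	for (i = 0; i < size; i += PAGE_SIZE)
		free_pfn_range(base_paddr + i, PAGE_SIZE);
}

so the cleanup paths in track_pfn_vma_new and track_pfn_vma_copy can share it.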

Thanks,
Venki