When cpa was refactored to the new set_memory_ interfaces, it removed
a special-case fast path for AGP systems, which did a lot of page-by-page
attribute changing and held the flushes until they were finished. The
DRM memory manager also required this to get usable speeds.
This introduces a new interface, which accepts an array of memory addresses
whose attributes are to be changed, and flushes once it is finished.
Further changes to the AGP stack to actually use this interface will be
published later.
Signed-off-by: Dave Airlie <[email protected]>
---
arch/x86/mm/pageattr-test.c | 12 ++-
arch/x86/mm/pageattr.c | 164 +++++++++++++++++++++++++++++++-----------
include/asm-x86/cacheflush.h | 3 +
3 files changed, 132 insertions(+), 47 deletions(-)
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 75f1b10..22c1496 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -111,6 +111,7 @@ static int pageattr_test(void)
unsigned int level;
int i, k;
int err;
+ unsigned long test_addr;
if (print)
printk(KERN_INFO "CPA self-test:\n");
@@ -165,8 +166,10 @@ static int pageattr_test(void)
continue;
}
- err = change_page_attr_clear(addr[i], len[i],
- __pgprot(_PAGE_GLOBAL));
+
+ test_addr = addr[i];
+ err = change_page_attr_clear(&test_addr, len[i],
+ __pgprot(_PAGE_GLOBAL), 0);
if (err < 0) {
printk(KERN_ERR "CPA %d failed %d\n", i, err);
failed++;
@@ -198,8 +201,9 @@ static int pageattr_test(void)
failed++;
continue;
}
- err = change_page_attr_set(addr[i], len[i],
- __pgprot(_PAGE_GLOBAL));
+ test_addr = addr[i];
+ err = change_page_attr_set(&test_addr, len[i],
+ __pgprot(_PAGE_GLOBAL), 0);
if (err < 0) {
printk(KERN_ERR "CPA reverting failed: %d\n", err);
failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7b79f6b..6124726 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -22,14 +22,18 @@
* The current flushing context - we pass it instead of 5 arguments:
*/
struct cpa_data {
- unsigned long vaddr;
+ unsigned long *vaddr;
pgprot_t mask_set;
pgprot_t mask_clr;
int numpages;
- int flushtlb;
+ int flags;
unsigned long pfn;
+ int curaddr;
};
+#define CPA_FLUSHTLB 1
+#define CPA_ARRAY 2
+
#ifdef CONFIG_X86_64
static inline unsigned long highmap_start_pfn(void)
@@ -145,6 +149,35 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
}
+static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+{
+ unsigned int i, level;
+ unsigned long *addr;
+
+ BUG_ON(irqs_disabled());
+
+ on_each_cpu(__cpa_flush_range, NULL, 1, 1);
+
+ if (!cache)
+ return;
+
+ /*
+ * We only need to flush on one CPU,
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cachelines:
+ */
+ for (i = 0, addr = start; i < numpages; i++, addr++) {
+ pte_t *pte = lookup_address(*addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+ clflush_cache_range((void *) *addr, PAGE_SIZE);
+ }
+}
+
/*
* Certain areas of memory on x86 require very specific protection flags,
* for example the BIOS area or kernel text. Callers don't always get this
@@ -349,7 +382,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
*/
new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
__set_pmd_pte(kpte, address, new_pte);
- cpa->flushtlb = 1;
+ cpa->flags |= CPA_FLUSHTLB;
do_split = 0;
}
@@ -527,11 +560,13 @@ out_unlock:
static int __change_page_attr(struct cpa_data *cpa, int primary)
{
- unsigned long address = cpa->vaddr;
+ unsigned long address;
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ address = cpa->vaddr[cpa->curaddr];
+
repeat:
kpte = lookup_address(address, &level);
if (!kpte)
@@ -543,7 +578,7 @@ repeat:
return 0;
printk(KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", address,
- cpa->vaddr);
+ *cpa->vaddr);
WARN_ON(1);
return -EINVAL;
}
@@ -570,7 +605,7 @@ repeat:
*/
if (pte_val(old_pte) != pte_val(new_pte)) {
set_pte_atomic(kpte, new_pte);
- cpa->flushtlb = 1;
+ cpa->flags |= CPA_FLUSHTLB;
}
cpa->numpages = 1;
return 0;
@@ -594,7 +629,7 @@ repeat:
*/
err = split_large_page(kpte, address);
if (!err) {
- cpa->flushtlb = 1;
+ cpa->flags |= CPA_FLUSHTLB;
goto repeat;
}
@@ -607,6 +642,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
int ret = 0;
+ unsigned long temp_cpa_vaddr, vaddr;
if (cpa->pfn > max_pfn_mapped)
return 0;
@@ -615,11 +651,16 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (!within(cpa->vaddr, PAGE_OFFSET,
+ vaddr = cpa->vaddr[cpa->curaddr];
+
+ if (!within(vaddr, PAGE_OFFSET,
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
alias_cpa = *cpa;
- alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+ temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.curaddr = 0;
+ alias_cpa.flags &= ~CPA_ARRAY;
ret = __change_page_attr_set_clr(&alias_cpa, 0);
}
@@ -631,7 +672,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the high
* mapping already:
*/
- if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end))
+ if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
return 0;
/*
@@ -642,8 +683,11 @@ static int cpa_process_alias(struct cpa_data *cpa)
return 0;
alias_cpa = *cpa;
- alias_cpa.vaddr =
- (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
+ temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
+ __START_KERNEL_map - phys_base;
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.curaddr = 0;
+ alias_cpa.flags &= ~CPA_ARRAY;
/*
* The high mapping range is imprecise, so ignore the return value.
@@ -681,7 +725,10 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
*/
BUG_ON(cpa->numpages > numpages);
numpages -= cpa->numpages;
- cpa->vaddr += cpa->numpages * PAGE_SIZE;
+ if (cpa->flags & CPA_ARRAY)
+ cpa->curaddr++;
+ else
+ *cpa->vaddr += cpa->numpages * PAGE_SIZE;
}
return 0;
}
@@ -692,8 +739,9 @@ static inline int cache_attr(pgprot_t attr)
(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
}
-static int change_page_attr_set_clr(unsigned long addr, int numpages,
- pgprot_t mask_set, pgprot_t mask_clr)
+static int change_page_attr_set_clr(unsigned long *addr, int numpages,
+ pgprot_t mask_set, pgprot_t mask_clr,
+ int array)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -708,19 +756,25 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
return 0;
/* Ensure we are PAGE_SIZE aligned */
- if (addr & ~PAGE_MASK) {
- addr &= PAGE_MASK;
- /*
- * People should not be passing in unaligned addresses:
- */
- WARN_ON_ONCE(1);
+ if (!array) {
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
}
cpa.vaddr = addr;
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
- cpa.flushtlb = 0;
+ cpa.flags = 0;
+ cpa.curaddr = 0;
+
+ if (array)
+ cpa.flags |= CPA_ARRAY;
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -730,7 +784,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
/*
* Check whether we really changed something:
*/
- if (!cpa.flushtlb)
+ if (!(cpa.flags & CPA_FLUSHTLB))
goto out;
/*
@@ -746,7 +800,10 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
* wbindv):
*/
if (!ret && cpu_has_clflush)
- cpa_flush_range(addr, numpages, cache);
+ if (cpa.flags & CPA_ARRAY)
+ cpa_flush_array(addr, numpages, cache);
+ else
+ cpa_flush_range(*addr, numpages, cache);
else
cpa_flush_all(cache);
@@ -756,57 +813,74 @@ out:
return ret;
}
-static inline int change_page_attr_set(unsigned long addr, int numpages,
- pgprot_t mask)
+static inline int change_page_attr_set(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
{
- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0),
+ array);
}
-static inline int change_page_attr_clear(unsigned long addr, int numpages,
- pgprot_t mask)
+static inline int change_page_attr_clear(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask,
+ array);
}
int set_memory_uc(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_PCD));
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_PCD), 0);
}
EXPORT_SYMBOL(set_memory_uc);
+int set_memory_array_uc(unsigned long *addr, int addrinarray)
+{
+ return change_page_attr_set(addr, addrinarray,
+ __pgprot(_PAGE_PCD), 1);
+}
+EXPORT_SYMBOL(set_memory_array_uc);
+
int set_memory_wb(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages,
- __pgprot(_PAGE_PCD | _PAGE_PWT));
+ return change_page_attr_clear(&addr, numpages,
+ __pgprot(_PAGE_PCD | _PAGE_PWT), 0);
}
EXPORT_SYMBOL(set_memory_wb);
+int set_memory_array_wb(unsigned long *addr, int addrinarray)
+{
+ return change_page_attr_clear(addr, addrinarray,
+ __pgprot(_PAGE_PCD | _PAGE_PWT), 1);
+}
+EXPORT_SYMBOL(set_memory_array_wb);
+
int set_memory_x(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);
int set_memory_nx(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);
int set_memory_ro(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
int set_memory_rw(unsigned long addr, int numpages)
{
- return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
int set_memory_np(unsigned long addr, int numpages)
{
- return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
+ return change_page_attr_clear(&addr, numpages,
+ __pgprot(_PAGE_PRESENT), 0);
}
int set_pages_uc(struct page *page, int numpages)
@@ -859,20 +933,24 @@ int set_pages_rw(struct page *page, int numpages)
static int __set_pages_p(struct page *page, int numpages)
{
- struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ unsigned long tempaddr = (unsigned long)page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .mask_clr = __pgprot(0)};
+ .mask_clr = __pgprot(0),
+ .flags = 0};
return __change_page_attr_set_clr(&cpa, 1);
}
static int __set_pages_np(struct page *page, int numpages)
{
- struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page),
+ unsigned long tempaddr = (unsigned long)page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)};
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .flags = 0};
return __change_page_attr_set_clr(&cpa, 1);
}
diff --git a/include/asm-x86/cacheflush.h b/include/asm-x86/cacheflush.h
index 5396c21..5646584 100644
--- a/include/asm-x86/cacheflush.h
+++ b/include/asm-x86/cacheflush.h
@@ -42,6 +42,9 @@ int set_memory_ro(unsigned long addr, int numpages);
int set_memory_rw(unsigned long addr, int numpages);
int set_memory_np(unsigned long addr, int numpages);
+int set_memory_array_uc(unsigned long *addr, int addrinarray);
+int set_memory_array_wb(unsigned long *addr, int addrinarray);
+
void clflush_cache_range(void *addr, unsigned int size);
void cpa_init(void);
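For illustration, a minimal caller-side sketch of how a driver might use
the new array interface (hypothetical code, not part of the patch;
NR_PAGES and the addrs array are made up for the example):

#include <asm/cacheflush.h>

#define NR_PAGES 16

/* filled with page_address() values for the pages in question */
static unsigned long addrs[NR_PAGES];

static int example_bind(void)
{
	/* switch all pages to uncached in one pass, flushing once */
	return set_memory_array_uc(addrs, NR_PAGES);
}

static void example_unbind(void)
{
	/* switch them back to write-back, again with a single flush */
	set_memory_array_wb(addrs, NR_PAGES);
}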
Dave Airlie wrote:
> When cpa was refactored to the new set_memory_ interfaces, it removed
> a special-case fast path for AGP systems, which did a lot of page-by-page
> attribute changing and held the flushes until they were finished. The
> DRM memory manager also required this to get usable speeds.
>
> This introduces a new interface, which accepts an array of memory addresses
> whose attributes are to be changed, and flushes once it is finished.
>
> Further changes to the AGP stack to actually use this interface will be
> published later.
>
> Signed-off-by: Dave Airlie <[email protected]>
> ---
> arch/x86/mm/pageattr-test.c | 12 ++-
> arch/x86/mm/pageattr.c | 164 +++++++++++++++++++++++++++++++-----------
> include/asm-x86/cacheflush.h | 3 +
> 3 files changed, 132 insertions(+), 47 deletions(-)
>
...
Dave,
Nice work, but how do we handle highmem pages? I know that they don't
need any attribute change since they're not in the kernel map, but both
the AGP module and the DRM memory manager typically hold highmem
addresses in their arrays, so the code should presumably detect those
and avoid them?
Since this is an AGPgart and DRM fastpath, the interface should ideally
be adapted to match the data structures used by those callers. The
AGPgart module uses an array of addresses, which effectively stops it
from using pages beyond the DMA32 limit. The DRM memory manager uses an
array of struct page pointers, but using that was, if I understand
things correctly, rejected.
So, if we at some point want to have an AGPgart module capable of
using anything beyond the DMA32 limit, we will end up with an interface
that matches neither AGPgart nor DRM, the very users the fastpath
was originally intended for.
/Thomas
Dave Airlie <[email protected]> writes:
>
> +#define CPA_FLUSHTLB 1
> +#define CPA_ARRAY 2
I don't think CPA_ARRAY should be a separate case. Rather, single
page flushing should be an array with only a single entry. pageattr
is already very complex, no need to add more special cases.
> +
> + /*
> + * Only flush present addresses:
> + */
> + if (pte && (pte_val(*pte) & _PAGE_PRESENT))
> + clflush_cache_range((void *) *addr, PAGE_SIZE);
Also it is doubtful clflush really makes sense on a large array. Just
doing wbinvd might be faster in that case. Or perhaps support for Self-Snoop
should be revisited; that would at least eliminate the flush on most Intel
CPUs.
-Andi
Andi Kleen wrote:
> Dave Airlie <[email protected]> writes:
>
>>
>> +#define CPA_FLUSHTLB 1
>> +#define CPA_ARRAY 2
>>
>
> I don't think CPA_ARRAY should be a separate case. Rather, single
> page flushing should be an array with only a single entry. pageattr
> is already very complex, no need to add more special cases.
>
>> +
>> + /*
>> + * Only flush present addresses:
>> + */
>> + if (pte && (pte_val(*pte) & _PAGE_PRESENT))
>> + clflush_cache_range((void *) *addr, PAGE_SIZE);
>>
>
> Also it is doubtful clflush really makes sense on a large array. Just
> doing wbinvd might be faster in that case. Or perhaps support for Self-Snoop
> should be revisited; that would at least eliminate the flush on most Intel
> CPUs.
>
>
I agree that wbinvd() seems to be faster on large arrays on the
processors I've tested. But isn't there a severe latency problem with
that instruction that makes people really want to avoid it in all
possible cases?
Also I think we need to clarify the semantics of the c_p_a
functionality. Right now both AGP and DRM rely on c_p_a doing an
explicit cache flush. Otherwise the data won't appear on the device side
of the aperture.
If we use self-snoop, the AGP and DRM drivers can't rely on this flush
being performed, and they have to do the flush themselves, and for
non-self-snooping processors, the flush needs to be done twice?
/Thomas
> -Andi
>
> Also I think we need to clarify the semantics of the c_p_a
> functionality. Right now both AGP and DRM rely on c_p_a doing an
> explicit cache flush. Otherwise the data won't appear on the device side
> of the aperture.
But surely not in cpa, I hope? Or are you saying you first write data
and then do cpa? If so, that would be quite an abuse of CPA,
I would say, and you should fix it ASAP.
> If we use self-snoop, the AGP and DRM drivers can't rely on this flush
> being performed, and they have to do the flush themselves, and for
They definitely should flush themselves if they want data to reach
the device. That is obviously required any time they reuse a page
too.
-Andi
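A minimal sketch of such a driver-side flush, using clflush_cache_range()
from asm/cacheflush.h (a hypothetical helper; it assumes the CPU has
CLFLUSH and that the buffer's kernel mapping is the one being flushed):

#include <asm/cacheflush.h>

/* flush one page's cachelines before the device reads the page */
static void flush_page_for_device(void *kaddr)
{
	clflush_cache_range(kaddr, PAGE_SIZE);
}

On CPUs without CLFLUSH a driver would have to fall back to something
heavier, such as wbinvd, which is part of the tradeoff discussed below.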
Andi Kleen wrote:
>> Also I think we need to clarify the semantics of the c_p_a
>> functionality. Right now both AGP and DRM rely on c_p_a doing an
>> explicit cache flush. Otherwise the data won't appear on the device side
>> of the aperture.
>>
>
> But surely not in cpa, I hope? Or are you saying you first write data
> and then do cpa? If so, that would be quite an abuse of CPA,
> I would say, and you should fix it ASAP.
>
>
As AGP buffers are moved in and out of AGP, the caching policy changes,
so yes, there may be writes to cache-coherent memory that need to be
flushed at some point. Since CPA has been doing that up to now, and the
codepaths involved are quite time-critical, a double cache-flush is a
no-no, so if this is left to the caller, we must be able to tell CPA
that any needed cache-flush has already been performed.
>> If we use self-snoop, the AGP and DRM drivers can't rely on this flush
>> being performed, and they have to do the flush themselves, and for
>>
>
> They definitely should flush themselves if they want data to reach
> the device. That is obviously required any time they reuse a page
> too.
>
Understood,
but then we *must* really find a way to say "don't flush the cache
again", perhaps part of Dave's array function?
/Thomas
> -Andi
>
On Mon, Mar 31, 2008 at 11:06:16AM +0200, Thomas Hellström wrote:
> Andi Kleen wrote:
> >>Also I think we need to clarify the semantics of the c_p_a
> >>functionality. Right now both AGP and DRM rely on c_p_a doing an
> >>explicit cache flush. Otherwise the data won't appear on the device side
> >>of the aperture.
> >>
> >
> >But surely not in cpa, I hope? Or are you saying you first write data
> >and then do cpa? If so, that would be quite an abuse of CPA,
> >I would say, and you should fix it ASAP.
> >
> >
> As AGP buffers are moved in and out of AGP, the caching policy changes,
> so yes, there may be writes to cache-coherent memory that need to be
> flushed at some point. Since CPA has been doing that up to now, and the
> codepaths involved are quite time-critical, a double cache-flush is a
That doesn't make sense. You shouldn't be using CPA in any
time-critical code path. It will always be slow.
For anything time-critical you should keep a pool of uncached pages
once set up using CPA and reuse them.
CPA really should only be used at initialization or for
larger setup changes which are OK to go somewhat slower.
> no-no, so if this is left to the caller, we must be able to tell CPA
> that any needed cache-flush has already been performed.
Sounds like a bad design.
> >>If we use self-snoop, the AGP and DRM drivers can't rely on this flush
> >>being performed, and they have to do the flush themselves, and for
> >>
> >
> >They definitely should flush themselves if they want data to reach
> >the device. That is obviously required any time they reuse a page
> >too.
> >
> Understood,
> but then we *must* really find a way to say "don't flush the cache
> again", perhaps part of Dave's array function?
The cache must be flushed in CPA; there is no way around it.
If you write data into the buffers before you do CPA on them
you could avoid it, but I don't think you should do that anywhere
near time-critical code, so it actually shouldn't matter.
-Andi
Thomas Hellström wrote:
> Dave Airlie wrote:
>> When cpa was refactored to the new set_memory_ interfaces, it removed
>> a special-case fast path for AGP systems, which did a lot of page-by-page
>> attribute changing and held the flushes until they were finished. The
>> DRM memory manager also required this to get usable speeds.
>>
>> This introduces a new interface, which accepts an array of memory
>> addresses whose attributes are to be changed, and flushes once it is
>> finished.
>>
>> Further changes to the AGP stack to actually use this interface will be
>> published later.
>>
>> Signed-off-by: Dave Airlie <[email protected]>
>> ---
>> arch/x86/mm/pageattr-test.c | 12 ++-
>> arch/x86/mm/pageattr.c | 164
>> +++++++++++++++++++++++++++++++-----------
>> include/asm-x86/cacheflush.h | 3 +
>> 3 files changed, 132 insertions(+), 47 deletions(-)
>>
> ...
> Dave,
> Nice work, but how do we handle highmem pages?
Cache attributes fundamentally work on a mapping and not on physical memory.
(MTRRs are special there: they do work on physical memory, but that's a special case and not relevant here.)
So to be honest, your question doesn't quite make sense, because all I can do is ask "which mapping of these pages?".
Even the old interfaces prior to 2.6.24 only managed to deal with SOME of the mappings of a page.
And if/when future CPUs don't require all mappings to be in sync, clearly the kernel will only change
the mapping that is requested as well.
> Since this is an AGPgart and DRM fastpath, the interface should ideally
> be adapted to match the data structures used by those callers.
Actually, the interface has to make sense conceptually and convey the right information;
the implementation should not have to second-guess the internals of AGP/DRM, because that
would just be a recipe for disaster.
> The
> AGPgart module uses an array of addresses, which effectively stops it
> from using pages beyond the DMA32 limit. The DRM memory manager uses an
> array of struct page pointers, but using that was, if I understand
> things correctly, rejected.
Yes, because simply put, if you pass a struct page to such a function, you're not telling it which
mapping or mappings you want changed....
(And if you say "only the 1:1 mapping, so why doesn't the other side calculate that": there's no speed gain in doing
that calculation on the other side of an interface, which leaves zero reason to misdesign the interface
to leave the "which mapping" information implicit.)
Andi Kleen wrote:
> Also it is doubtful clflush really makes sense on a large array. Just
> doing wbinvd might be faster in that case.
wbinvd is rarely a good idea; think about it... it'll flush 12MB of cache *per socket* in one instruction
(on a modern Intel consumer-grade CPU; more on the enterprise ones).
This doesn't only impact the current logical thread but ALL of the threads in the system: since cache
coherency needs to be preserved, all caches have to go empty at the same time.
Forget real time... this can take really long even for non-realtime users ;)
At least clflush breaks this up into smaller pieces, so total latency won't suck entirely.
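As a rough worked estimate (with assumed, illustrative numbers): writing
back 12MB of dirty cache at, say, ~6 GB/s of memory bandwidth is on the
order of 2 ms, during which every core on the socket is disturbed and
restarts with an empty cache; a clflush loop over a 64-page array only
touches 256KB of address space and can be interleaved with other work.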
Arjan van de Ven wrote:
> Thomas Hellström wrote:
>> Dave Airlie wrote:
>>> When cpa was refactored to the new set_memory_ interfaces, it removed
>>> a special-case fast path for AGP systems, which did a lot of
>>> page-by-page attribute changing and held the flushes until they were
>>> finished. The DRM memory manager also required this to get usable speeds.
>>>
>>> This introduces a new interface, which accepts an array of memory
>>> addresses whose attributes are to be changed, and flushes once it is
>>> finished.
>>>
>>> Further changes to the AGP stack to actually use this interface will be
>>> published later.
>>>
>>> Signed-off-by: Dave Airlie <[email protected]>
>>> ---
>>> arch/x86/mm/pageattr-test.c | 12 ++-
>>> arch/x86/mm/pageattr.c | 164
>>> +++++++++++++++++++++++++++++++-----------
>>> include/asm-x86/cacheflush.h | 3 +
>>> 3 files changed, 132 insertions(+), 47 deletions(-)
>>>
>> ...
>> Dave,
>> Nice work, but how do we handle highmem pages?
>
> Cache attributes fundamentally work on a mapping and not on physical
> memory.
> (MTRRs are special there: they do work on physical memory, but that's
> a special case and not relevant here.)
>
> So to be honest, your question doesn't quite make sense, because all I can
> do is ask "which mapping of these pages?".
>
> Even the old interfaces prior to 2.6.24 only managed to deal with SOME
> of the mappings of a page.
> And if/when future CPUs don't require all mappings to be in sync,
> clearly the kernel will only change
> the mapping that is requested as well.
>
>
>
>> Since this is an AGPgart and DRM fastpath, the interface should
>> ideally be adapted to match the data structures used by those callers.
>
> Actually, the interface has to make sense conceptually and convey the
> right information;
> the implementation should not have to second-guess the internals of
> AGP/DRM, because that
> would just be a recipe for disaster.
>> The AGPgart module uses an array of addresses, which effectively
>> stops it from using pages beyond the DMA32 limit. The DRM memory
>> manager uses an array of struct page pointers, but using that was, if
>> I understand things correctly, rejected.
>
> Yes, because simply put, if you pass a struct page to such a function,
> you're not telling it which
> mapping or mappings you want changed....
> (And if you say "only the 1:1 mapping, so why doesn't the other side
> calculate that": there's no speed gain in doing
> that calculation on the other side of an interface, which leaves
> zero reason to misdesign the interface
> to leave the "which mapping" information implicit.)
>
Hmm. I get the point. There should be ways to do reasonably efficient
workarounds in the drivers.
/Thomas
Andi Kleen wrote:
> On Mon, Mar 31, 2008 at 11:06:16AM +0200, Thomas Hellström wrote:
>
>> Andi Kleen wrote:
>>
>>>> Also I think we need to clarify the semantics of the c_p_a
>>>> functionality. Right now both AGP and DRM rely on c_p_a doing an
>>>> explicit cache flush. Otherwise the data won't appear on the device side
>>>> of the aperture.
>>>>
>>>>
>>> But surely not in cpa, I hope? Or are you saying you first write data
>>> and then do cpa? If so, that would be quite an abuse of CPA,
>>> I would say, and you should fix it ASAP.
>>>
>>>
>>>
>> As AGP buffers are moved in and out of AGP, the caching policy changes,
>> so yes, there may be writes to cache-coherent memory that need to be
>> flushed at some point. Since CPA has been doing that up to now, and the
>> codepaths involved are quite time-critical, a double cache-flush is a
>>
>
> That doesn't make sense. You shouldn't be using CPA in any
> time-critical code path. It will always be slow.
>
> For anything time-critical you should keep a pool of uncached pages
> once set up using CPA and reuse them.
>
> CPA really should only be used at initialization or for
> larger setup changes which are OK to go somewhat slower.
>
Let me rephrase. It's not really time-critical, but it is of some importance
that CPA is done quickly.
We're dealing with the tradeoff of reading from uncached device memory
versus taking the pages out of
AGP, setting up a cache-coherent mapping, reading, and then changing back.
What we'd really like to set up is a pool of completely unmapped
(like highmem) pages. Then we could, to a large extent, avoid the CPA calls.
>
>
> The cache must be flushed in CPA; there is no way around it.
>
> If you write data into the buffers before you do CPA on them
> you could avoid it, but I don't think you should do that anywhere
> near time-critical code, so it actually shouldn't matter.
>
> -Andi
>
But then we wouldn't really be discussing self-snoop either? For DRM purposes,
the more performance we can squeeze out of CPA, the better.
/Thomas
On Mon, Mar 31, 2008 at 5:25 PM, Andi Kleen <[email protected]> wrote:
> Dave Airlie <[email protected]> writes:
> >
> > +#define CPA_FLUSHTLB 1
> > +#define CPA_ARRAY 2
>
> > I don't think CPA_ARRAY should be a separate case. Rather, single
> > page flushing should be an array with only a single entry. pageattr
> > is already very complex, no need to add more special cases.
I thought about this, but the current interface passes cpa a start address
and a number of pages from that point, while
the array interface takes an array of single-page addresses.
I don't really think we need to generate an array in the first case
with all the pages in it.
Dave.
On Mon, Mar 31, 2008 at 09:21:19PM +1000, Dave Airlie wrote:
> On Mon, Mar 31, 2008 at 5:25 PM, Andi Kleen <[email protected]> wrote:
> > Dave Airlie <[email protected]> writes:
> > >
> > > +#define CPA_FLUSHTLB 1
> > > +#define CPA_ARRAY 2
> >
> > I don't think CPA_ARRAY should be a separate case. Rather, single
> > page flushing should be an array with only a single entry. pageattr
> > is already very complex, no need to add more special cases.
>
> I thought about this, but the current interface passes cpa a start address
> and a number of pages from that point, while
> the array interface takes an array of single-page addresses.
>
> I don't really think we need to generate an array in the first case
> with all the pages in it.
Just put the length into the array members too.
-Andi
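A rough sketch of what that could look like (hypothetical; struct
cpa_entry and the signatures here are made up for illustration, not
taken from the patch):

#include <asm/pgtable.h>

/*
 * Hypothetical variant of Andi's suggestion: give each array entry its
 * own length, so the old linear-range call becomes a one-entry array
 * and CPA_ARRAY stops being a special case inside pageattr.c. The
 * prototype stands in for the reworked pageattr.c internals.
 */
struct cpa_entry {
	unsigned long vaddr;	/* page-aligned start address */
	int numpages;		/* contiguous pages from vaddr */
};

static int change_page_attr_set_clr(struct cpa_entry *entries, int nentries,
				    pgprot_t mask_set, pgprot_t mask_clr);

int set_memory_uc(unsigned long addr, int numpages)
{
	/* the single-range case is just a one-entry array */
	struct cpa_entry e = { .vaddr = addr, .numpages = numpages };

	return change_page_attr_set_clr(&e, 1, __pgprot(_PAGE_PCD),
					__pgprot(0));
}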
Thomas Hellström wrote:
> Let me rephrase. It's not really time-critical, but it is of some importance
> that CPA is done quickly.
> We're dealing with the tradeoff of reading from uncached device memory
uncached or write combining?
> versus taking the pages out of
> AGP, setting up a cache-coherent mapping, reading, and then changing back.
> What we'd really like to set up is a pool of completely unmapped
> (like highmem) pages. Then we could, to a large extent, avoid the CPA
> calls.
changing attributes by nature means a TLB flush and a bunch of expensive cache work.
That's never going to be cheap; I guess it all depends on how much work you do
on the memory for it to pay off or not...
Arjan van de Ven wrote:
> Thomas Hellström wrote:
>
>> Let me rephrase. It's not really time-critical, but it is of some
>> importance that CPA is done quickly.
>> We're dealing with the tradeoff of reading from uncached device memory
>
> uncached or write combining?
The user-space mappings (the ones that we really use) are usually
write-combined, whereas the kernel mappings are uncached. (I think this
is OK since both mapping types imply no cache coherency.) Even if
(IIRC) write combining is theoretically prefetchable, some devices give
read speeds around 9MB/s.
>
>> versus taking the pages out of
>> AGP, setting up a cache-coherent mapping, reading, and then changing back.
>> What we'd really like to set up is a pool of completely
>> unmapped (like highmem) pages. Then we could, to a large extent,
>> avoid the CPA calls.
>
>> changing attributes by nature means a TLB flush and a bunch of
>> expensive cache work.
>> That's never going to be cheap; I guess it all depends on how much
>> work you do
>> on the memory for it to pay off or not...
Indeed. Actually, with the new non-wbinvd() CPA, we seem to benefit
already if the buffer is a single page, though it's probably hard to
measure the impact of repopulating the TLB.
/Thomas
Thomas Hellström wrote:
> Arjan van de Ven wrote:
>> Thomas Hellström wrote:
>>
>>> Let me rephrase. It's not really time-critical, but it is of some
>>> importance that CPA is done quickly.
>>> We're dealing with the tradeoff of reading from uncached device memory
>>
>> uncached or write combining?
> The user-space mappings (the ones that we really use) are usually
> write-combined, whereas the kernel mappings are uncached. (I think this
> is OK since both mapping types imply no cache coherency.)
This is not officially allowed and may triple-fault your CPU.
To comply with the spec one needs to have ALL mappings the same, unfortunately.
(And yes, this is a hard problem.)
> Even if
> (IIRC) write combining is theoretically prefetchable, some devices give
> read speeds around 9MB/s.
>>
>>> versus taking the pages out of
>>> AGP, setting up a cache-coherent mapping, reading, and then changing back.
>>> What we'd really like to set up is a pool of completely
>>> unmapped (like highmem) pages. Then we could, to a large extent,
>>> avoid the CPA calls.
>>
>> changing attributes by nature means a TLB flush and a bunch of
>> expensive cache work.
>> That's never going to be cheap; I guess it all depends on how much
>> work you do
>> on the memory for it to pay off or not...
> Indeed. Actually, with the new non-wbinvd() CPA, we seem to benefit
> already if the buffer is a single page, though it's probably hard to
> measure the impact of repopulating the TLB.
>
> /Thomas
>
>
>
Arjan van de Ven wrote:
> Thomas Hellström wrote:
>> Arjan van de Ven wrote:
>>> Thomas Hellström wrote:
>>>
>>>> Let me rephrase. It's not really time-critical, but it is of some
>>>> importance that CPA is done quickly.
>>>> We're dealing with the tradeoff of reading from uncached device memory
>>>
>>> uncached or write combining?
>> The user-space mappings (the ones that we really use) are usually
>> write-combined, whereas the kernel mappings are uncached. (I think
>> this is OK since both mapping types imply no cache coherency.)
>
> This is not officially allowed and may triple-fault your CPU.
> To comply with the spec one needs to have ALL mappings the same,
> unfortunately.
> (And yes, this is a hard problem.)
>
Hmm...
Given this problem, the previously mentioned use-case, and the fact that
we mostly really use user-space mappings,
is there a possibility we could add the following functions to Dave's
patch (provided they would work as intended, of course, namely
invalidating / bringing back the kernel mapping)?
Then we can set up a pool of unmapped pages, avoid frequent use of CPA,
and everybody's happy.
int set_memory_array_np(unsigned long *addr, int addrinarray)
{
	return change_page_attr_clear(addr, addrinarray,
				      __pgprot(_PAGE_PRESENT | _PAGE_RW), 1);
}
EXPORT_SYMBOL(set_memory_array_np);

int set_memory_array_p(unsigned long *addr, int addrinarray)
{
	return change_page_attr_set(addr, addrinarray,
				    __pgprot(_PAGE_PRESENT | _PAGE_RW), 1);
}
EXPORT_SYMBOL(set_memory_array_p);
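If something like that went in, a driver-side pool might, very roughly,
look like the sketch below (hypothetical and untested; POOL_PAGES and the
helpers are made up, and it assumes the array functions above work as
intended):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>

#define POOL_PAGES 64

/* kernel virtual addresses of the pooled pages */
static unsigned long pool_addrs[POOL_PAGES];

static int pool_create(void)
{
	int i;

	for (i = 0; i < POOL_PAGES; i++) {
		struct page *page = alloc_page(GFP_KERNEL | GFP_DMA32);

		if (!page)
			goto out_free;
		pool_addrs[i] = (unsigned long)page_address(page);
	}
	/* tear down the kernel mapping once for the whole pool */
	return set_memory_array_np(pool_addrs, POOL_PAGES);

out_free:
	while (--i >= 0)
		__free_page(virt_to_page((void *)pool_addrs[i]));
	return -ENOMEM;
}

static void pool_destroy(void)
{
	int i;

	/* restore the kernel mapping before freeing the pages */
	set_memory_array_p(pool_addrs, POOL_PAGES);
	for (i = 0; i < POOL_PAGES; i++)
		__free_page(virt_to_page((void *)pool_addrs[i]));
}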
/Thomas