copy_{from,to}_user() uaccess helpers are implemented by user page
pinning, followed by a temporary kernel mapping & then memcpy(). This
makes user page copies possible when the CPU's current virtual address
mapping excludes user pages.
Performance-wise, results are not encouraging. 'dd' on tmpfs results:
ARM Cortex-A8, BeagleBone White (256MiB RAM):
w/o series - ~29.5 MB/s
w/ series - ~20.5 MB/s
w/ series & highmem disabled - ~21.2 MB/s
On Cortex-A15(2GiB RAM) in QEMU:
w/o series - ~4 MB/s
w/ series - ~2.6 MB/s
Roughly a one-third drop in performance. Disabling highmem improves
performance only slightly.
'hackbench' also showed a similar pattern.
uaccess routines using page pinning & temporary kernel mapping are not
something new; it was done long ago by Ingo [1] as part of the 4G/4G
user/kernel mapping implementation on x86, though never merged in
mainline.
[1] https://lore.kernel.org/lkml/[email protected]/
Signed-off-by: afzal mohammed <[email protected]>
---
lib/Kconfig | 4 +
lib/Makefile | 3 +
lib/uaccess_gup_kmap_memcpy.c | 162 ++++++++++++++++++++++++++++++++++
3 files changed, 169 insertions(+)
create mode 100644 lib/uaccess_gup_kmap_memcpy.c
diff --git a/lib/Kconfig b/lib/Kconfig
index 5d53f9609c252..dadf4f6cc391d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -622,6 +622,10 @@ config ARCH_HAS_MEMREMAP_COMPAT_ALIGN
config UACCESS_MEMCPY
bool
+# pin page + kmap_atomic + memcpy for user copies, intended for vmsplit 4g/4g
+config UACCESS_GUP_KMAP_MEMCPY
+ bool
+
config ARCH_HAS_UACCESS_FLUSHCACHE
bool
diff --git a/lib/Makefile b/lib/Makefile
index 685aee60de1d5..bc457f85e391a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -309,3 +309,6 @@ obj-$(CONFIG_OBJAGG) += objagg.o
# KUnit tests
obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o
+
+# uaccess
+obj-$(CONFIG_UACCESS_GUP_KMAP_MEMCPY) += uaccess_gup_kmap_memcpy.o
diff --git a/lib/uaccess_gup_kmap_memcpy.c b/lib/uaccess_gup_kmap_memcpy.c
new file mode 100644
index 0000000000000..1536762df1fd5
--- /dev/null
+++ b/lib/uaccess_gup_kmap_memcpy.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+// Started from arch/um/kernel/skas/uaccess.c
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+static int do_op_one_page(unsigned long addr, int len,
+ int (*op)(unsigned long addr, int len, void *arg), void *arg,
+ struct page *page)
+{
+ int n;
+
+ addr = (unsigned long) kmap_atomic(page) + (addr & ~PAGE_MASK);
+ n = (*op)(addr, len, arg);
+ kunmap_atomic((void *)addr);
+
+ return n;
+}
+
+static long buffer_op(unsigned long addr, int len,
+ int (*op)(unsigned long, int, void *), void *arg,
+ struct page **pages)
+{
+ long size, remain, n;
+
+ size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
+ remain = len;
+ if (size == 0)
+ goto page_boundary;
+
+ n = do_op_one_page(addr, size, op, arg, *pages);
+ if (n != 0) {
+ remain = (n < 0 ? remain : 0);
+ goto out;
+ }
+
+ pages++;
+ addr += size;
+ remain -= size;
+
+page_boundary:
+ if (remain == 0)
+ goto out;
+ while (addr < ((addr + remain) & PAGE_MASK)) {
+ n = do_op_one_page(addr, PAGE_SIZE, op, arg, *pages);
+ if (n != 0) {
+ remain = (n < 0 ? remain : 0);
+ goto out;
+ }
+
+ pages++;
+ addr += PAGE_SIZE;
+ remain -= PAGE_SIZE;
+ }
+ if (remain == 0)
+ goto out;
+
+ n = do_op_one_page(addr, remain, op, arg, *pages);
+ if (n != 0) {
+ remain = (n < 0 ? remain : 0);
+ goto out;
+ }
+
+ return 0;
+out:
+ return remain;
+}
+
+static int copy_chunk_from_user(unsigned long from, int len, void *arg)
+{
+ unsigned long *to_ptr = arg, to = *to_ptr;
+
+ memcpy((void *) to, (void *) from, len);
+ *to_ptr += len;
+ return 0;
+}
+
+static int copy_chunk_to_user(unsigned long to, int len, void *arg)
+{
+ unsigned long *from_ptr = arg, from = *from_ptr;
+
+ memcpy((void *) to, (void *) from, len);
+ *from_ptr += len;
+ return 0;
+}
+
+unsigned long gup_kmap_copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+ struct page **pages;
+ int num_pages, ret, i;
+
+ if (uaccess_kernel()) {
+ memcpy(to, (__force void *)from, n);
+ return 0;
+ }
+
+ num_pages = DIV_ROUND_UP((unsigned long)from + n, PAGE_SIZE) -
+ (unsigned long)from / PAGE_SIZE;
+ pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL | __GFP_ZERO);
+ if (!pages)
+ goto end;
+
+ ret = get_user_pages_fast((unsigned long)from, num_pages, 0, pages);
+ if (ret < 0)
+ goto free_pages;
+
+ if (ret != num_pages) {
+ num_pages = ret;
+ goto put_pages;
+ }
+
+ n = buffer_op((unsigned long) from, n, copy_chunk_from_user, &to, pages);
+
+put_pages:
+ for (i = 0; i < num_pages; i++)
+ put_page(pages[i]);
+free_pages:
+ kfree(pages);
+end:
+ return n;
+}
+
+unsigned long gup_kmap_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+ struct page **pages;
+ int num_pages, ret, i;
+
+ if (uaccess_kernel()) {
+ memcpy((__force void *) to, from, n);
+ return 0;
+ }
+
+ num_pages = DIV_ROUND_UP((unsigned long)to + n, PAGE_SIZE) - (unsigned long)to / PAGE_SIZE;
+ pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL | __GFP_ZERO);
+ if (!pages)
+ goto end;
+
+ ret = get_user_pages_fast((unsigned long)to, num_pages, FOLL_WRITE, pages);
+ if (ret < 0)
+ goto free_pages;
+
+ if (ret != num_pages) {
+ num_pages = ret;
+ goto put_pages;
+ }
+
+
+ n = buffer_op((unsigned long) to, n, copy_chunk_to_user, &from, pages);
+
+put_pages:
+ for (i = 0; i < num_pages; i++)
+ put_page(pages[i]);
+free_pages:
+ kfree(pages);
+end:
+ return n;
+}
--
2.26.2
On Fri, Jun 12, 2020 at 12:18 PM afzal mohammed <[email protected]> wrote:
>
> copy_{from,to}_user() uaccess helpers are implemented by user page
> pinning, followed by temporary kernel mapping & then memcpy(). This
> helps to achieve user page copy when current virtual address mapping
> of the CPU excludes user pages.
>
> Performance wise, results are not encouraging, 'dd' on tmpfs results,
>
> ARM Cortex-A8, BeagleBone White (256MiB RAM):
> w/o series - ~29.5 MB/s
> w/ series - ~20.5 MB/s
> w/ series & highmem disabled - ~21.2 MB/s
>
> On Cortex-A15(2GiB RAM) in QEMU:
> w/o series - ~4 MB/s
> w/ series - ~2.6 MB/s
>
> Roughly a one-third drop in performance. Disabling highmem improves
> performance only slightly.
>
> 'hackbench' also showed a similar pattern.
>
> uaccess routines using page pinning & temporary kernel mapping is not
> something new, it has been done long long ago by Ingo [1] as part of
> 4G/4G user/kernel mapping implementation on x86, though not merged in
> mainline.
>
> [1] https://lore.kernel.org/lkml/[email protected]/
Nice changelog text! I agree the performance drop is not great.
There are probably some things that can be done to optimize it,
but I guess most of the overhead is from the page table operations
and cannot be avoided.
What was the exact 'dd' command you used, in particular the block size?
Note that by default, 'dd' will request 512 bytes at a time, so you usually
only access a single page. It would be interesting to see the overhead with
other typical or extreme block sizes, e.g. '1', '64', '4K', '64K' or '1M'.
If you want to drill down into where exactly the overhead is (i.e.
get_user_pages or kmap_atomic, or something different), using
'perf record dd ..', and 'perf report' may be helpful.
> +static int copy_chunk_from_user(unsigned long from, int len, void *arg)
> +{
> + unsigned long *to_ptr = arg, to = *to_ptr;
> +
> + memcpy((void *) to, (void *) from, len);
> + *to_ptr += len;
> + return 0;
> +}
> +
> +static int copy_chunk_to_user(unsigned long to, int len, void *arg)
> +{
> + unsigned long *from_ptr = arg, from = *from_ptr;
> +
> + memcpy((void *) to, (void *) from, len);
> + *from_ptr += len;
> + return 0;
> +}
Will gcc optimize away the indirect function call and inline everything?
If not, that would be a small part of the overhead.
> +unsigned long gup_kmap_copy_from_user(void *to, const void __user *from, unsigned long n)
> +{
> + struct page **pages;
> + int num_pages, ret, i;
> +
> + if (uaccess_kernel()) {
> + memcpy(to, (__force void *)from, n);
> + return 0;
> + }
> +
> + num_pages = DIV_ROUND_UP((unsigned long)from + n, PAGE_SIZE) -
> + (unsigned long)from / PAGE_SIZE;
Make sure this doesn't turn into actual division operations but uses shifts.
It might even be clearer here to open-code the shift operation so readers
can see what this is meant to compile into.
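For illustration, the open-coded form could look something like this (just
a sketch of the same computation the DIV_ROUND_UP() above performs):

	num_pages = (((unsigned long)from + n + PAGE_SIZE - 1) >> PAGE_SHIFT) -
		    ((unsigned long)from >> PAGE_SHIFT);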
> + pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL | __GFP_ZERO);
> + if (!pages)
> + goto end;
Another micro-optimization may be to avoid the kmalloc for the common case,
e.g. anything with "num_pages <= 64", using an array on the stack.
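A rough sketch of that idea (the 64-entry threshold and the 'stack_pages'
name are only placeholders, not something from the patch):

	struct page *stack_pages[64];
	struct page **pages = stack_pages;

	if (num_pages > ARRAY_SIZE(stack_pages)) {
		pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
		if (!pages)
			return n;
	}

	/* ... pin the pages and do the copy as before ... */

	if (pages != stack_pages)
		kfree(pages);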
> + ret = get_user_pages_fast((unsigned long)from, num_pages, 0, pages);
> + if (ret < 0)
> + goto free_pages;
> +
> + if (ret != num_pages) {
> + num_pages = ret;
> + goto put_pages;
> + }
I think this is technically incorrect: if get_user_pages_fast() only gets
some of the pages, you should continue with the short buffer and return
the number of remaining bytes rather than not copying anything. I think
you did that correctly for a failed kmap_atomic(), but this has to use
the same logic.
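One possible shape of that fix, reusing the locals of
gup_kmap_copy_from_user() (only a sketch, untested):

	if (ret == 0)
		goto free_pages;		/* nothing pinned, nothing copied */

	if (ret != num_pages) {
		/* only 'ret' pages were pinned: copy what they cover and
		 * report the rest as uncopied */
		unsigned long covered = ret * PAGE_SIZE - offset_in_page(from);

		n = n - covered + buffer_op((unsigned long)from, covered,
					    copy_chunk_from_user, &to, pages);
		num_pages = ret;
		goto put_pages;
	}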
Arnd
Hi,
On Fri, Jun 12, 2020 at 02:02:13PM +0200, Arnd Bergmann wrote:
> On Fri, Jun 12, 2020 at 12:18 PM afzal mohammed <[email protected]> wrote:
> > Roughly a one-third drop in performance. Disabling highmem improves
> > performance only slightly.
> There are probably some things that can be done to optimize it,
> but I guess most of the overhead is from the page table operations
> and cannot be avoided.
Ingo's series did a follow_page() first, then as a fallback invoked
get_user_pages(); i will try that way as well.
Yes, i too feel the get_user_pages_fast() path is the most time consuming,
will instrument & check.
> What was the exact 'dd' command you used, in particular the block size?
> Note that by default, 'dd' will request 512 bytes at a time, so you usually
> only access a single page. It would be interesting to see the overhead with
> other typical or extreme block sizes, e.g. '1', '64', '4K', '64K' or '1M'.
It was the default (512); more test results follow (in MB/s),
               512    1K     4K     16K    32K    64K    1M
w/o series     30     46     89     95     90     85     65
w/ series      22     36     72     79     78     75     61
perf drop      26%    21%    19%    16%    13%    12%    6%
Hmm, results ain't that bad :)
> If you want to drill down into where exactly the overhead is (i.e.
> get_user_pages or kmap_atomic, or something different), using
> 'perf record dd ..', and 'perf report' may be helpful.
Let me dig deeper & try to find out where the major overhead is, and try
to figure out ways to reduce it.
One reason to disable highmem & test (results mentioned earlier) was
to make kmap_atomic() very lightweight; that was not making much
difference, only around 3%.
> > +static int copy_chunk_from_user(unsigned long from, int len, void *arg)
> > +{
> > + unsigned long *to_ptr = arg, to = *to_ptr;
> > +
> > + memcpy((void *) to, (void *) from, len);
> > + *to_ptr += len;
> > + return 0;
> > +}
> > +
> > +static int copy_chunk_to_user(unsigned long to, int len, void *arg)
> > +{
> > + unsigned long *from_ptr = arg, from = *from_ptr;
> > +
> > + memcpy((void *) to, (void *) from, len);
> > + *from_ptr += len;
> > + return 0;
> > +}
>
> Will gcc optimize away the indirect function call and inline everything?
> If not, that would be a small part of the overhead.
i think not, based on objdump; i will make these (& other places where
possible) inline & see the difference.
> > + num_pages = DIV_ROUND_UP((unsigned long)from + n, PAGE_SIZE) -
> > + (unsigned long)from / PAGE_SIZE;
>
> Make sure this doesn't turn into actual division operations but uses shifts.
> It might even be clearer here to open-code the shift operation so readers
> can see what this is meant to compile into.
Okay
>
> > + pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL | __GFP_ZERO);
> > + if (!pages)
> > + goto end;
>
> Another micro-optimization may be to avoid the kmalloc for the common case,
> e.g. anything with "num_pages <= 64", using an array on the stack.
Okay
> > + ret = get_user_pages_fast((unsigned long)from, num_pages, 0, pages);
> > + if (ret < 0)
> > + goto free_pages;
> > +
> > + if (ret != num_pages) {
> > + num_pages = ret;
> > + goto put_pages;
> > + }
>
> I think this is technically incorrect: if get_user_pages_fast() only
> gets some of the
> pages, you should continue with the short buffer and return the number
> of remaining
> bytes rather than not copying anything. I think you did that correctly
> for a failed
> kmap_atomic(), but this has to use the same logic.
yes, will fix that.
Regards
afzal
On Fri, Jun 12, 2020 at 3:55 PM afzal mohammed <[email protected]> wrote:
> On Fri, Jun 12, 2020 at 02:02:13PM +0200, Arnd Bergmann wrote:
> > On Fri, Jun 12, 2020 at 12:18 PM afzal mohammed <[email protected]> wrote:
>
> > > Roughly a one-third drop in performance. Disabling highmem improves
> > > performance only slightly.
>
> > There are probably some things that can be done to optimize it,
> > but I guess most of the overhead is from the page table operations
> > and cannot be avoided.
>
> Ingo's series did a follow_page() first, then as a fallback did it
> invoke get_user_pages(), i will try that way as well.
Right, that could help, in particular for the small copies. I think a lot
of usercopy calls are only for a few bytes, though this is of course
highly workload dependent and you might only care about the large
ones.
> Yes, i too feel get_user_pages_fast() path is the most time consuming,
> will instrument & check.
>
> > What was the exact 'dd' command you used, in particular the block size?
> > Note that by default, 'dd' will request 512 bytes at a time, so you usually
> > only access a single page. It would be interesting to see the overhead with
> > other typical or extreme block sizes, e.g. '1', '64', '4K', '64K' or '1M'.
>
> It was the default(512), more test results follows (in MB/s),
>
> 512 1K 4K 16K 32K 64K 1M
>
> w/o series 30 46 89 95 90 85 65
>
> w/ series 22 36 72 79 78 75 61
>
> perf drop 26% 21% 19% 16% 13% 12% 6%
>
> Hmm, results ain't that bad :)
There is also still hope of optimizing small aligned copies like
set_ttbr0(user_ttbr);
ldm();
set_ttbr0(kernel_ttbr);
stm();
which could do e.g. 32 bytes at a time, but with more overhead
if you have to loop around it.
Arnd
On Fri, Jun 12, 2020 at 1:20 PM afzal mohammed <[email protected]> wrote:
>
> copy_{from,to}_user() uaccess helpers are implemented by user page
> pinning, followed by temporary kernel mapping & then memcpy(). This
> helps to achieve user page copy when current virtual address mapping
> of the CPU excludes user pages.
>
> Performance wise, results are not encouraging, 'dd' on tmpfs results,
>
> ARM Cortex-A8, BeagleBone White (256MiB RAM):
> w/o series - ~29.5 MB/s
> w/ series - ~20.5 MB/s
> w/ series & highmem disabled - ~21.2 MB/s
>
> On Cortex-A15(2GiB RAM) in QEMU:
> w/o series - ~4 MB/s
> w/ series - ~2.6 MB/s
>
> Roughly a one-third drop in performance. Disabling highmem improves
> performance only slightly.
>
> 'hackbench' also showed a similar pattern.
>
> uaccess routines using page pinning & temporary kernel mapping is not
> something new, it has been done long long ago by Ingo [1] as part of
> 4G/4G user/kernel mapping implementation on x86, though not merged in
> mainline.
>
> [1] https://lore.kernel.org/lkml/[email protected]/
Some comments (more related to generic things).
...
> +// Started from arch/um/kernel/skas/uaccess.c
Does it mean you will deduplicate it there?
...
> +#include <linux/err.h>
> +#include <linux/slab.h>
> +#include <linux/highmem.h>
> +#include <linux/mm.h>
Perhaps ordered?
> +static int do_op_one_page(unsigned long addr, int len,
> + int (*op)(unsigned long addr, int len, void *arg), void *arg,
> + struct page *page)
Maybe typedef for the func() ?
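e.g. something like this (a sketch; the typedef name is made up):

typedef int (*uaccess_chunk_op_t)(unsigned long addr, int len, void *arg);

static int do_op_one_page(unsigned long addr, int len,
			  uaccess_chunk_op_t op, void *arg, struct page *page);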
> +{
> + int n;
> +
> + addr = (unsigned long) kmap_atomic(page) + (addr & ~PAGE_MASK);
I don't remember about this one...
> + n = (*op)(addr, len, arg);
> + kunmap_atomic((void *)addr);
> +
> + return n;
> +}
> +
> +static long buffer_op(unsigned long addr, int len,
> + int (*op)(unsigned long, int, void *), void *arg,
> + struct page **pages)
> +{
> + long size, remain, n;
> +
> + size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
...but here it seems to me you can use helpers (offset_in_page() or
whatever it's called).
Also consider using macros like PFN_DOWN(), PFN_UP(), etc. in your code.
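For example, the 'size' calculation quoted above could become something
like this (a sketch that keeps the original behaviour of 0 bytes when addr
is already page aligned):

	size = offset_in_page(addr) ?
	       min_t(unsigned long, PAGE_SIZE - offset_in_page(addr), len) : 0;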
> + remain = len;
> + if (size == 0)
> + goto page_boundary;
> +
> + n = do_op_one_page(addr, size, op, arg, *pages);
> + if (n != 0) {
> + remain = (n < 0 ? remain : 0);
Why duplicate this line three times (!) when you can move it under 'out'?
> + goto out;
> + }
> +
> + pages++;
> + addr += size;
> + remain -= size;
> +
> +page_boundary:
> + if (remain == 0)
> + goto out;
> + while (addr < ((addr + remain) & PAGE_MASK)) {
> + n = do_op_one_page(addr, PAGE_SIZE, op, arg, *pages);
> + if (n != 0) {
> + remain = (n < 0 ? remain : 0);
> + goto out;
> + }
> +
> + pages++;
> + addr += PAGE_SIZE;
> + remain -= PAGE_SIZE;
> + }
Sounds like this can be refactored to iterate over pages rather than addresses.
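A sketch of such a per-page loop (untested; it also folds the repeated
'remain = (n < 0 ? remain : 0)' handling into a single place):

static long buffer_op(unsigned long addr, int len,
		      int (*op)(unsigned long, int, void *), void *arg,
		      struct page **pages)
{
	long remain = len;

	while (remain > 0) {
		/* bytes of the current page touched by this copy */
		int chunk = min_t(long, remain,
				  PAGE_SIZE - offset_in_page(addr));
		int n = do_op_one_page(addr, chunk, op, arg, *pages);

		if (n != 0)
			return n < 0 ? remain : 0;

		pages++;
		addr += chunk;
		remain -= chunk;
	}

	return 0;
}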
> + if (remain == 0)
> + goto out;
> +
> + n = do_op_one_page(addr, remain, op, arg, *pages);
> + if (n != 0) {
> + remain = (n < 0 ? remain : 0);
> + goto out;
> + }
> +
> + return 0;
> +out:
> + return remain;
> +}
...
> +static int copy_chunk_from_user(unsigned long from, int len, void *arg)
> +{
> + unsigned long *to_ptr = arg, to = *to_ptr;
> +
> + memcpy((void *) to, (void *) from, len);
What is the point of the cast to void *?
> + *to_ptr += len;
> + return 0;
> +}
> +
> +static int copy_chunk_to_user(unsigned long to, int len, void *arg)
> +{
> + unsigned long *from_ptr = arg, from = *from_ptr;
> +
> + memcpy((void *) to, (void *) from, len);
> + *from_ptr += len;
Ditto.
> + return 0;
> +}
> +
> +unsigned long gup_kmap_copy_from_user(void *to, const void __user *from, unsigned long n)
> +{
> + struct page **pages;
> + int num_pages, ret, i;
> +
> + if (uaccess_kernel()) {
> + memcpy(to, (__force void *)from, n);
> + return 0;
> + }
> +
> + num_pages = DIV_ROUND_UP((unsigned long)from + n, PAGE_SIZE) -
> + (unsigned long)from / PAGE_SIZE;
PFN_UP() ?
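i.e. something like this (a sketch; PFN_UP()/PFN_DOWN() come from
<linux/pfn.h>):

	num_pages = PFN_UP((unsigned long)from + n) -
		    PFN_DOWN((unsigned long)from);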
> + pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL | __GFP_ZERO);
> + if (!pages)
> + goto end;
> +
> + ret = get_user_pages_fast((unsigned long)from, num_pages, 0, pages);
> + if (ret < 0)
> + goto free_pages;
> +
> + if (ret != num_pages) {
> + num_pages = ret;
> + goto put_pages;
> + }
> +
> + n = buffer_op((unsigned long) from, n, copy_chunk_from_user, &to, pages);
> +
> +put_pages:
> + for (i = 0; i < num_pages; i++)
> + put_page(pages[i]);
> +free_pages:
> + kfree(pages);
> +end:
> + return n;
> +}
...
I think you can clean up the code a bit after you get the main
functionality working.
--
With Best Regards,
Andy Shevchenko
Hi,
On Fri, Jun 12, 2020 at 10:07:28PM +0200, Arnd Bergmann wrote:
> I think a lot
> of usercopy calls are only for a few bytes, though this is of course
> highly workload dependent and you might only care about the large
> ones.
Observation is that the max. number of pages reaching
copy_{from,to}_user() is 2, with the observed maximum of n (number of
bytes) being 1 page size. i think the C library cuts any read/write size
down to page size (if it exceeds it) & invokes the system call. Max.
pages reaching 2 happens when 'n' crosses a page boundary; this has been
observed w/ small size requests as well as w/ ones of exact page size
(but not page aligned).
Even w/ dd of various sizes >4K, the number of pages required to be
mapped never goes above 2 (even w/ 'dd' 'bs=1M').
i have a worry (don't know whether it is an unnecessary one): even if
we improve performance w/ large copy sizes, it might end up as
sluggishness w.r.t. user experience, since most (hence a high number) of
the user copy calls are for a few bytes & there the penalty is higher.
And a benchmark would not be able to detect anything abnormal since
usercopy is being tested on large sizes.
Quickly comparing boot time on BeagleBone White, boot time increases
by only 4%; perhaps this worry is irrelevant, but i just thought i would
put it across.
> There is also still hope of optimizing small aligned copies like
>
> set_ttbr0(user_ttbr);
> ldm();
> set_ttbr0(kernel_ttbr);
> stm();
Hmm, more needs to be done to be in a position to test it.
Regards
afzal
On Sat, Jun 13, 2020 at 05:34:32PM +0530, afzal mohammed wrote:
> Observation is that max. pages reaching copy_{from,to}_user() is 2,
> observed maximum of n (number of bytes) being 1 page size. i think C
> library cuts any size read, write to page size (if it exceeds) &
> invokes the system call. Max. pages reaching 2, happens when 'n'
> crosses page boundary, this has been observed w/ small size request
> as well w/ ones of exact page size (but not page aligned).
>
> Even w/ dd of various size >4K, never is the number of pages required
> to be mapped going greater than 2 (even w/ 'dd' 'bs=1M')
>
> i have a worry (don't know whether it is an unnecessary one): even
> if we improve performance w/ large copy sizes, it might end up in a
> sluggishness w.r.t user experience due to most (hence a high amount)
> of user copy calls being few bytes & there the penalty being higher.
> And benchmark would not be able to detect anything abnormal since
> usercopy are being tested on large sizes.
>
> Quickly comparing boot-time on Beagle Bone White, boot time increases
> by only 4%, perhaps this worry is irrelevant, but just thought will
> put it across.
Do stat(2) of the same tmpfs file in a loop (on tmpfs, to eliminate
the filesystem playing silly buggers). And I wouldn't expect anything
good there...
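For reference, a minimal userspace loop for that test could look like this
(a sketch; the file path and iteration count are arbitrary):

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;
	long i;

	for (i = 0; i < 1000000; i++) {
		if (stat("/tmp/testfile", &st)) {	/* tmpfs-backed file */
			perror("stat");
			return 1;
		}
	}

	return 0;
}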
On Sat, Jun 13, 2020 at 01:51:26PM +0100, Al Viro wrote:
> On Sat, Jun 13, 2020 at 05:34:32PM +0530, afzal mohammed wrote:
>
> > Observation is that max. pages reaching copy_{from,to}_user() is 2,
> > observed maximum of n (number of bytes) being 1 page size. i think C
> > library cuts any size read, write to page size (if it exceeds) &
> > invokes the system call. Max. pages reaching 2, happens when 'n'
> > crosses page boundary, this has been observed w/ small size request
> > as well w/ ones of exact page size (but not page aligned).
> >
> > Even w/ dd of various size >4K, never is the number of pages required
> > to be mapped going greater than 2 (even w/ 'dd' 'bs=1M')
> >
> > i have a worry (don't know whether it is an unnecessary one): even
> > if we improve performance w/ large copy sizes, it might end up in a
> > sluggishness w.r.t user experience due to most (hence a high amount)
> > of user copy calls being few bytes & there the penalty being higher.
> > And benchmark would not be able to detect anything abnormal since
> > usercopy are being tested on large sizes.
> >
> > Quickly comparing boot-time on Beagle Bone White, boot time increases
> > by only 4%, perhaps this worry is irrelevant, but just thought will
> > put it across.
>
> Do stat(2) of the same tmpfs file in a loop (on tmpfs, to eliminate
> the filesystem playing silly buggers). And I wouldn't expect anything
> good there...
Incidentally, what about get_user()/put_user()? _That_ is where it's
going to really hurt...
On Sat, Jun 13, 2020 at 05:34:32PM +0530, afzal mohammed wrote:
> Hi,
>
> On Fri, Jun 12, 2020 at 10:07:28PM +0200, Arnd Bergmann wrote:
>
> > I think a lot
> > of usercopy calls are only for a few bytes, though this is of course
> > highly workload dependent and you might only care about the large
> > ones.
>
> Observation is that max. pages reaching copy_{from,to}_user() is 2,
> observed maximum of n (number of bytes) being 1 page size. i think C
> library cuts any size read, write to page size (if it exceeds) &
> invokes the system call. Max. pages reaching 2, happens when 'n'
> crosses page boundary, this has been observed w/ small size request
> as well w/ ones of exact page size (but not page aligned).
You can't make that assumption about read(2). stdio in the C library
may read a page size of data at a time, but programs are allowed to
call read(2) directly, and the C library will pass such a call straight
through to the kernel. So, if userspace requests a 16k read via
read(2), then read(2) will be invoked covering 16k.
As an extreme case, for example:
$ strace -e read dd if=/dev/zero of=/dev/null bs=1048576 count=1
read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
Hi,
On Sat, Jun 13, 2020 at 02:08:11PM +0300, Andy Shevchenko wrote:
> On Fri, Jun 12, 2020 at 1:20 PM afzal mohammed <[email protected]> wrote:
> > +// Started from arch/um/kernel/skas/uaccess.c
>
> Does it mean you will deduplicate it there?
What i meant was that that file was taken as a template & nothing more;
at the same time i wanted to give credit to that file. i will mention it
explicitly next time.
It is not meant to deduplicate it. Functionality here is completely
different.
In the case here, there would be a different virtual address mapping
that the CPU will see once in the Kernel as compared to user mode.
Here a facility is provided to access the user page when the current
virtual address mapping of the CPU excludes it. This is for providing a
full 4G virtual address space to both user & kernel on 32-bit ARM, to
avoid using highmem or to reduce the impact of highmem, i.e. so that the
Kernel can address up to (almost) 4GiB as lowmem.
Here the assumption is that the user mapping is not a subset of the
virtual addresses mapped by the CPU, but a separate one. Upon Kernel
entry ttbr0 is changed to Kernel lowmem, while upon Kernel exit it is
changed back to user pages (ttbrx in ARM is, iiuc, equivalent to cr3 on
x86).
Now i realize that i am unable to coherently put across the problem
being attempted here to a person not familiar w/ the issue w/o taking
considerable time. If the above explanation is not enough, i will try to
explain it better later.
> > +#include <linux/err.h>
> > +#include <linux/slab.h>
> > +#include <linux/highmem.h>
> > +#include <linux/mm.h>
>
> Perhaps ordered?
will take care
> > +static int do_op_one_page(unsigned long addr, int len,
> > + int (*op)(unsigned long addr, int len, void *arg), void *arg,
> > + struct page *page)
>
> Maybe typedef for the func() ?
will take care
> > +{
> > + int n;
> > +
> > + addr = (unsigned long) kmap_atomic(page) + (addr & ~PAGE_MASK);
>
> I don't remember about this one...
i am not following you here; for my case only the !CONFIG_64BIT case in
that file was required, hence only that was picked (or rather, not
deleted).
> > + size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
>
> ...but here seems to me you can use helpers (offset_in_page() or how
> it's called).
i was not aware of it, will use it as required.
>
> Also consider to use macros like PFN_DOWN(), PFN_UP(), etc in your code.
Okay
>
> > + remain = len;
> > + if (size == 0)
> > + goto page_boundary;
> > +
> > + n = do_op_one_page(addr, size, op, arg, *pages);
> > + if (n != 0) {
>
> > + remain = (n < 0 ? remain : 0);
>
> Why duplicate three times (!) this line, if you can move it to under 'out'?
yes better to move there
>
> > + goto out;
> > + }
> > +
> > + pages++;
> > + addr += size;
> > + remain -= size;
> > +
> > +page_boundary:
> > + if (remain == 0)
> > + goto out;
> > + while (addr < ((addr + remain) & PAGE_MASK)) {
> > + n = do_op_one_page(addr, PAGE_SIZE, op, arg, *pages);
> > + if (n != 0) {
> > + remain = (n < 0 ? remain : 0);
> > + goto out;
> > + }
> > +
> > + pages++;
> > + addr += PAGE_SIZE;
> > + remain -= PAGE_SIZE;
> > + }
>
> Sounds like this can be refactored to iterate over pages rather than addresses.
Okay, i will check
> > +static int copy_chunk_from_user(unsigned long from, int len, void *arg)
> > +{
> > + unsigned long *to_ptr = arg, to = *to_ptr;
> > +
> > + memcpy((void *) to, (void *) from, len);
>
> What is the point in the casting to void *?
The reason it was there was copy-paste :). Passing an unsigned long as
'void *' or 'const void *' requires a cast, right? Or did you mean
something else?
now i checked removing the cast, and the compiler is abusing me :), it
says 'makes pointer from integer without a cast'.
> > + num_pages = DIV_ROUND_UP((unsigned long)from + n, PAGE_SIZE) -
> > + (unsigned long)from / PAGE_SIZE;
>
> PFN_UP() ?
Okay
> I think you can clean up the code a bit after you will get the main
> functionality working.
Yes, surely; the intention was to post a proof-of-concept ASAP. Perhaps
the contents will change drastically in the next version, so that any
resemblance to arch/um/kernel/skas/uaccess.c might not be there.
Regards
afzal
Hi,
On Sat, Jun 13, 2020 at 01:56:15PM +0100, Al Viro wrote:
> Incidentally, what about get_user()/put_user()? _That_ is where it's
> going to really hurt...
All other uaccess routines are also planned to be added, posting only
copy_{from,to}_user() was to get early feedback (mentioned in the
cover letter)
Regards
afzal
On Sat, Jun 13, 2020 at 07:12:36PM +0530, afzal mohammed wrote:
> Hi,
>
> On Sat, Jun 13, 2020 at 01:56:15PM +0100, Al Viro wrote:
>
> > Incidentally, what about get_user()/put_user()? _That_ is where it's
> > going to really hurt...
>
> All other uaccess routines are also planned to be added, posting only
> copy_{from,to}_user() was to get early feedback (mentioned in the
> cover letter)
Sure, but what I mean is that I'd expect the performance loss to be
dominated by that, not by copy_from_user/copy_to_user on large amounts
of data. Especially on the loads like kernel builds - a lot of stat()
and getdents() calls there.
On Sat, Jun 13, 2020 at 04:31:02PM +0100, Al Viro wrote:
> On Sat, Jun 13, 2020 at 07:12:36PM +0530, afzal mohammed wrote:
> > Hi,
> >
> > On Sat, Jun 13, 2020 at 01:56:15PM +0100, Al Viro wrote:
> >
> > > Incidentally, what about get_user()/put_user()? _That_ is where it's
> > > going to really hurt...
> >
> > All other uaccess routines are also planned to be added, posting only
> > copy_{from,to}_user() was to get early feedback (mentioned in the
> > cover letter)
>
> Sure, but what I mean is that I'd expect the performance loss to be
> dominated by that, not by copy_from_user/copy_to_user on large amounts
> of data. Especially on the loads like kernel builds - a lot of stat()
> and getdents() calls there.
To clarify: stat() means small copy_to_user(), getdents() - a mix of
put_user() and small copy_to_user(). I would be very surprised if it
does not hurt a lot.
On Sat, Jun 13, 2020 at 04:41:18PM +0100, Al Viro wrote:
> On Sat, Jun 13, 2020 at 04:31:02PM +0100, Al Viro wrote:
> > On Sat, Jun 13, 2020 at 07:12:36PM +0530, afzal mohammed wrote:
> > > Hi,
> > >
> > > On Sat, Jun 13, 2020 at 01:56:15PM +0100, Al Viro wrote:
> > >
> > > > Incidentally, what about get_user()/put_user()? _That_ is where it's
> > > > going to really hurt...
> > >
> > > All other uaccess routines are also planned to be added, posting only
> > > copy_{from,to}_user() was to get early feedback (mentioned in the
> > > cover letter)
> >
> > Sure, but what I mean is that I'd expect the performance loss to be
> > dominated by that, not by copy_from_user/copy_to_user on large amounts
> > of data. Especially on the loads like kernel builds - a lot of stat()
> > and getdents() calls there.
>
> To clarify: stat() means small copy_to_user(), getdents() - a mix of
> put_user() and small copy_to_user(). I would be very surprised if it
> does not hurt a lot.
PS: there's another fun issue here:
fill a file with zeroes
mmap that file in two areas, MAP_SHARED
thread 1:
munmap() the first area
fill the second one with 'X'
thread 2:
write() from the first area into pipe
One could expect that nothing but zeroes gets written into the
pipe - it might be a short write() (or -EFAULT), but finding
any 'X' there would be a bug.
Your patches allow for a possibility of write() doing
get_user_pages_fast(), getting the first page just as
munmap() is about to remove it from page tables and bugger
off. Then thread 1 proceeds with the store (via the
second area). And then thread 2 does memcpy() from that
thing via a kmap_atomic()-created alias, observing the
effect of the store.
That might or might not be a POSIX violation, but it does
look like a QoI issue...
On Sat, Jun 13, 2020 at 6:00 PM Al Viro <[email protected]> wrote:
> On Sat, Jun 13, 2020 at 04:41:18PM +0100, Al Viro wrote:
> > On Sat, Jun 13, 2020 at 04:31:02PM +0100, Al Viro wrote:
> > > On Sat, Jun 13, 2020 at 07:12:36PM +0530, afzal mohammed wrote:
> > > > Hi,
> > > >
> > > > On Sat, Jun 13, 2020 at 01:56:15PM +0100, Al Viro wrote:
> > > >
> > > > > Incidentally, what about get_user()/put_user()? _That_ is where it's
> > > > > going to really hurt...
> > > >
> > > > All other uaccess routines are also planned to be added, posting only
> > > > copy_{from,to}_user() was to get early feedback (mentioned in the
> > > > cover letter)
> > >
> > > Sure, but what I mean is that I'd expect the performance loss to be
> > > dominated by that, not by copy_from_user/copy_to_user on large amounts
> > > of data. Especially on the loads like kernel builds - a lot of stat()
> > > and getdents() calls there.
> >
> > To clarify: stat() means small copy_to_user(), getdents() - a mix of
> > put_user() and small copy_to_user(). I would be very surprised if it
> > does not hurt a lot.
>
> PS: there's another fun issue here:
>
> fill a file with zeroes
> mmap that file in two areas, MAP_SHARED
> thread 1:
> munmap() the first area
> fill the second one with 'X'
> thread 2:
> write() from the first area into pipe
>
> One could expect that nothing but zeroes gets written into
> pipe - it might be a short write() (or -EFAULT), but finding
> any 'X' there would be a bug.
>
> Your patches allow for a possibility of write() doing
> get_user_pages_fast(), getting the first page just as
> munmap() is about to remove it from page tables and bugger
> off. Then thread 1 proceeds with the store (via the
> second area). And then thread 2 does memcpy() from that
> thing via a kmap_atomic()-created alias, observing the
> effect of the store.
>
> That might or might not be a POSIX violation, but it does
> look like a QoI issue...
I assume this problem exists in arch/um/kernel/skas/uaccess.c
and in Ingo's old x86 VMSPLIT_4G_4G patch as well, right?
I guess holding mmap_read_lock() would prevent it but make
it even more expensive.
Arnd
On Sat, Jun 13, 2020 at 2:04 PM afzal mohammed <[email protected]> wrote:
> On Fri, Jun 12, 2020 at 10:07:28PM +0200, Arnd Bergmann wrote:
>
> > I think a lot
> > of usercopy calls are only for a few bytes, though this is of course
> > highly workload dependent and you might only care about the large
> > ones.
>
> Observation is that max. pages reaching copy_{from,to}_user() is 2,
> observed maximum of n (number of bytes) being 1 page size. i think C
> library cuts any size read, write to page size (if it exceeds) &
> invokes the system call. Max. pages reaching 2, happens when 'n'
> crosses page boundary, this has been observed w/ small size request
> as well w/ ones of exact page size (but not page aligned).
Right, this is apparently because tmpfs uses shmem_file_read_iter() to
copy the file pages one at a time. generic_file_buffered_read() seems
similar, so copying between an aligned kernel page and an address in
user space that is not page aligned would be an important case to
optimize for.
> Quickly comparing boot-time on Beagle Bone White, boot time increases
> by only 4%, perhaps this worry is irrelevant, but just thought will
> put it across.
4% boot time increase sounds like a lot, especially if that is only for
copy_from_user/copy_to_user. In the end it really depends on how well
get_user()/put_user() and small copies can be optimized in the end.
> > There is also still hope of optimizing small aligned copies like
> >
> > set_ttbr0(user_ttbr);
> > ldm();
> > set_ttbr0(kernel_ttbr);
> > stm();
>
> Hmm, more needs to be done to be in a position to test it.
This is going to be highly microarchitecture specific, so anything you test
on the Beaglebone's Cortex-A8 might not apply to A7/A15/A17 systems,
but if you want to test what the overhead is, you could try changing
/dev/zero (or a different chardev like it) to use a series of
put_user(0, u32uptr++) in place of whatever it has, and then replace the
'str' instruction with dummy writes to ttbr0 using the value it already
has, like:
mcr p15, 0, %0, c2, c0, 0 /* set_ttbr0() */
isb /* prevent speculative access to kernel table */
str %1, [%2],0 /* write 32 bit to user space */
mcr p15, 0, %0, c2, c0, 0 /* set_ttbr0() */
isb /* prevent speculative access to user table */
This is obviously going to be very slow compared to the simple store
there is today but maybe cheaper than the
CONFIG_ARM64_SW_TTBR0_PAN uaccess_en/disable()
on arm64 on a single get_user()/put_user().
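For reference, a measurement-only wrapper around that sequence might look
like this (purely a sketch: the ttbr0 argument is whatever value TTBR0
already holds, there is no fault handling, and this is not a real
put_user() implementation):

static inline void put_user_zero_ttbr0_dummy(u32 __user *p, unsigned long ttbr0)
{
	u32 zero = 0;

	asm volatile(
	"	mcr	p15, 0, %1, c2, c0, 0\n"	/* set_ttbr0() */
	"	isb\n"					/* prevent speculative access to kernel table */
	"	str	%2, [%0], #0\n"			/* 32-bit store to user space */
	"	mcr	p15, 0, %1, c2, c0, 0\n"	/* set_ttbr0() */
	"	isb\n"					/* prevent speculative access to user table */
	: "+r" (p)
	: "r" (ttbr0), "r" (zero)
	: "memory");
}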
It would be interesting to compare it to the overhead of a
get_user_pages_fast() based implementation. From the numbers you
measured, it seems the beaglebone currently needs an extra ~6µs or
3µs per copy_to/from_user() call with your patch, depending on what
your benchmark was (MB/s for just reading or writing vs MB/s for
copying from one file to another through a user space buffer).
Arnd
On Sat, Jun 13, 2020 at 10:45:33PM +0200, Arnd Bergmann wrote:
> On Sat, Jun 13, 2020 at 2:04 PM afzal mohammed <[email protected]> wrote:
> > Observation is that max. pages reaching copy_{from,to}_user() is 2,
> > observed maximum of n (number of bytes) being 1 page size. i think C
> > library cuts any size read, write to page size (if it exceeds) &
> > invokes the system call. Max. pages reaching 2, happens when 'n'
> > crosses page boundary, this has been observed w/ small size request
> > as well w/ ones of exact page size (but not page aligned).
>
> Right, this is apparently because tmpfs uses shmem_file_read_iter() to
> copy the file pages one at a time. generic_file_buffered_read() seems
> similar, to copying between an aligned kernel page and address in
> user space that is not page aligned would be an important case to
> optimize for.
This is kind of the nature of the page cache. The kernel doesn't
necessarily have contiguous memory in the page cache, so it's going to
be split on page boundaries. This is going to change with my THP series
(I haven't actually changed generic_file_buffered_read(), but it'll
come later).
> > Quickly comparing boot-time on Beagle Bone White, boot time increases
> > by only 4%, perhaps this worry is irrelevant, but just thought will
> > put it across.
>
> 4% boot time increase sounds like a lot, especially if that is only for
> copy_from_user/copy_to_user. In the end it really depends on how well
> get_user()/put_user() and small copies can be optimized in the end.
The write path should also be paid attention to. Look at
generic_perform_write() which first calls iov_iter_fault_in_readable()
(for the entire length of the write) and then actually does the copy
later with iov_iter_copy_from_user_atomic(). So you're going to want
to optimise the case where you access the same pages multiple times.
Hi,
On Sat, Jun 13, 2020 at 02:15:52PM +0100, Russell King - ARM Linux admin wrote:
> On Sat, Jun 13, 2020 at 05:34:32PM +0530, afzal mohammed wrote:
> > i think C
> > library cuts any size read, write to page size (if it exceeds) &
> > invokes the system call.
> You can't make that assumption about read(2). stdio in the C library
> may read a page size of data at a time, but programs are allowed to
> call read(2) directly, and the C library will pass such a call straight
> through to the kernel. So, if userspace requests a 16k read via
> read(2), then read(2) will be invoked covering 16k.
>
> As an extreme case, for example:
>
> $ strace -e read dd if=/dev/zero of=/dev/null bs=1048576 count=1
> read(0, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
Okay. Yes, observed that dd is passing whatever 'bs' is to the Kernel;
from the 'dd' sources (of busybox), it invokes the read system call
directly, passing 'bs', so it is the tmpfs read that is splitting it to
page size, as mentioned by Arnd.
Regards
afzal
Hi,
On Sat, Jun 13, 2020 at 10:45:33PM +0200, Arnd Bergmann wrote:
> 4% boot time increase sounds like a lot, especially if that is only for
> copy_from_user/copy_to_user. In the end it really depends on how well
> get_user()/put_user() and small copies can be optimized in the end.
i mentioned the worst case (it happened only once); normally it was in
the range of 2-3%.
> From the numbers you
> measured, it seems the beaglebone currently needs an extra ~6µs or
> 3µs per copy_to/from_user() call with your patch, depending on what
> your benchmark was (MB/s for just reading or writing vs MB/s for
> copying from one file to another through a user space buffer).
It is MB/s for copying one file to another via a user space buffer, i.e.
the value coreutils 'dd' shows w/ status=progress (here it was busybox
'dd', so it instead meant enabling a compile-time option).
> but if you want to test what the overhead is, you could try changing
> /dev/zero (or a different chardev like it) to use a series of
> put_user(0, u32uptr++) in place of whatever it has, and then replace the
> 'str' instruction with dummy writes to ttbr0 using the value it already
> has, like:
>
> mcr p15, 0, %0, c2, c0, 0 /* set_ttbr0() */
> isb /* prevent speculative access to kernel table */
> str %1, [%2],0 /* write 32 bit to user space */
> mcr p15, 0, %0, c2, c0, 0 /* set_ttbr0() */
> isb /* prevent speculative access to user table */
> It would be interesting to compare it to the overhead of a
> get_user_page_fast() based implementation.
i have to relocate & be in quarantine for a couple of weeks, so i will
temporarily stop here, otherwise i might end up on the roadside.
Reading the feedback from everyone, some of it i could grasp only in bits
& pieces; familiarizing myself more w/ mm & vfs would help me add value
better to the goal/discussion. Linus Walleij, if you wish to explore
things, feel free; right now i don't know how my connectivity will be for
the next 3 weeks.
Regards
afzal
Hi,
On Sun, Jun 14, 2020 at 06:51:43PM +0530, afzal mohammed wrote:
> It is MB/s for copying one file to another via user space buffer, i.e.
> the value coreutils 'dd' shows w/ status=progress (here it was busybox
> 'dd', so instead it was enabling a compile time option)
Just for correctness: status=progress is not required, it's there in
the default 3rd line of coreutils 'dd' output.
Regards
afzal
From: Al Viro
> Sent: 13 June 2020 16:31
> On Sat, Jun 13, 2020 at 07:12:36PM +0530, afzal mohammed wrote:
> > Hi,
> >
> > On Sat, Jun 13, 2020 at 01:56:15PM +0100, Al Viro wrote:
> >
> > > Incidentally, what about get_user()/put_user()? _That_ is where it's
> > > going to really hurt...
> >
> > All other uaccess routines are also planned to be added, posting only
> > copy_{from,to}_user() was to get early feedback (mentioned in the
> > cover letter)
>
> Sure, but what I mean is that I'd expect the performance loss to be
> dominated by that, not by copy_from_user/copy_to_user on large amounts
> of data. Especially on the loads like kernel builds - a lot of stat()
> and getdents() calls there.
Or any network traffic where the number of usercopies involved in,
for example, sendmsg() gives a measurable performance decrease when
HARDENED_USERCOPY is enabled.
Do you have issues with cache aliasing?
(Is aliasing the right term?)
Where potentially the temporary kernel map doesn't use the same
cache lines as the user process's map.
I'm not sure what problem you are trying to solve, but for 64bit
systems it may be possible to map all of physical memory into the
kernel address map, then you (loosely) only have to find the KVA
that matches the user-VA to do the copy.
IIRC our SYSV kernels used to do that - until we had 384MB of physical
memory and ran out of kernel address space.
David