This patch allows the kernel to hold atomic kmaps across copy_*_user.
From an idea by Linus and/or Martin Bligh and/or Andrea.
The basic idea is: when the kernel takes an atomic kmap via the new
kmap_copy_user() function it records state about that kmap in
current->copy_user_state. If a pagefault is taken then the page fault
handler will fix up the copy_*_user state prior to returning to
copy_*_user.
An optimisation to this (Andrea) is to use a sequence number to detect
whether the copy_*_user's fixmap slot was reused during the processing
of the pagefault. If not, and we're on the same CPU then no fixup is
needed.
The fixup code in the pagefault path will rewrite the CPU's ESI or EDI
register to point at the fixed up kmap. This means that the caller of
kmap_copy_user() MUST be using a copy function which uses ESI or EDI in
the normal manner.
The interfaces are designed so that non-x86 architectures which are
using highmem can implement the same trick.
If a different copy_*_user implementation is written then new fixup
code will be needed.
The only new copy_*_user implementation of which I am aware is the
"efficient copy_*_user routines" from Mala Anand and colleagues. They
use ESI/EDI as well - this code has been successfully tested against
those patches.
This patch uses kmap_copy_user() in file_read_actor().
This patch breaks the ramdisk driver when it is used as a module,
unless you've applied Rusty's patch which exports __per_cpu_data.
arch/i386/kernel/i386_ksyms.c | 5 ++
arch/i386/lib/usercopy.c | 10 +++++
arch/i386/mm/fault.c | 71 +++++++++++++++++++++++++++++++++++
include/asm-i386/highmem.h | 5 ++
include/asm-i386/kmap_types.h | 3 +
include/asm-i386/processor.h | 2 +
include/asm-ppc/kmap_types.h | 1
include/asm-sparc/kmap_types.h | 1
include/asm-x86_64/kmap_types.h | 1
include/linux/highmem.h | 80 ++++++++++++++++++++++++++++++++++++++++
include/linux/sched.h | 5 ++
mm/filemap.c | 11 +++--
12 files changed, 189 insertions, 6 deletions
--- 2.5.30/arch/i386/kernel/i386_ksyms.c~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/arch/i386/kernel/i386_ksyms.c Fri Aug 9 17:36:42 2002
@@ -14,6 +14,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/tty.h>
+#include <linux/highmem.h>
#include <asm/semaphore.h>
#include <asm/processor.h>
@@ -74,6 +75,10 @@ EXPORT_SYMBOL(pm_idle);
EXPORT_SYMBOL(pm_power_off);
EXPORT_SYMBOL(get_cmos_time);
EXPORT_SYMBOL(apm_info);
+
+#ifdef CONFIG_HIGHMEM
+EXPORT_SYMBOL(kmap_atomic_seq);
+#endif
#ifdef CONFIG_DEBUG_IOVIRT
EXPORT_SYMBOL(__io_virt_debug);
--- 2.5.30/arch/i386/lib/usercopy.c~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/arch/i386/lib/usercopy.c Fri Aug 9 17:36:42 2002
@@ -11,6 +11,16 @@
#ifdef CONFIG_X86_USE_3DNOW_AND_WORKS
+/*
+ * We cannot use the mmx functions here with the kmap_atomic fixup
+ * code.
+ *
+ * But CONFIG_X86_USE_3DNOW_AND_WORKS never gets defined anywhere.
+ * Maybe kill this code?
+ */
+
+#error this will not work
+
unsigned long
__generic_copy_to_user(void *to, const void *from, unsigned long n)
{
--- 2.5.30/arch/i386/mm/fault.c~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/arch/i386/mm/fault.c Fri Aug 9 17:36:42 2002
@@ -13,6 +13,7 @@
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
@@ -129,6 +130,70 @@ void bust_spinlocks(int yes)
console_loglevel = loglevel_save;
}
+#ifdef CONFIG_HIGHMEM
+
+/*
+ * per-cpu, per-atomic-kmap sequence numbers. Incremented in kmap_atomic.
+ * If these change, we know that an atomic kmap slot has been reused.
+ */
+int kmap_atomic_seq[KM_TYPE_NR] __per_cpu_data = {0};
+
+/*
+ * Note the CPU ID and the currently-held atomic kmap's sequence number
+ */
+static inline void note_atomic_kmap(struct pt_regs *regs)
+{
+ struct copy_user_state *cus = current->copy_user_state;
+
+ if (cus) {
+ cus->cpu = smp_processor_id();
+ cus->seq = this_cpu(kmap_atomic_seq[cus->type]);
+ }
+}
+
+/*
+ * After processing the fault, look to see whether we have switched CPUs
+ * or whether the fault handler has used the same kmap slot (it must have
+ * scheduled to another task). If so, drop the kmap and get a new one.
+ * And then fix up the machine register which copy_*_user() is using so
+ * that it gets the correct address relative to the new kmap.
+ */
+static void
+__check_atomic_kmap(struct copy_user_state *cus, struct pt_regs *regs)
+{
+ const int cpu = smp_processor_id();
+
+ if (cus->seq != per_cpu(kmap_atomic_seq[cus->type], cpu) ||
+ cus->cpu != cpu) {
+ long *reg;
+ unsigned offset;
+
+ kunmap_atomic(cus->kaddr, cus->type);
+ cus->kaddr = kmap_atomic(cus->page, cus->type);
+ if (cus->src)
+ reg = &regs->esi;
+ else
+ reg = &regs->edi;
+ offset = *reg & (PAGE_SIZE - 1);
+ *reg = ((long)cus->kaddr) | offset;
+ }
+}
+
+static inline void check_atomic_kmap(struct pt_regs *regs)
+{
+ struct copy_user_state *cus = current->copy_user_state;
+
+ if (cus)
+ __check_atomic_kmap(cus, regs);
+}
+
+#else
+static inline void note_atomic_kmap(struct pt_regs *regs)
+{}
+static inline void check_atomic_kmap(struct pt_regs *regs)
+{}
+#endif
+
asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);
/*
@@ -187,6 +252,8 @@ asmlinkage void do_page_fault(struct pt_
if (in_interrupt() || !mm)
goto no_context;
+ note_atomic_kmap(regs);
+
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
@@ -248,8 +315,10 @@ good_area:
tsk->maj_flt++;
break;
case VM_FAULT_SIGBUS:
+ check_atomic_kmap(regs);
goto do_sigbus;
case VM_FAULT_OOM:
+ check_atomic_kmap(regs);
goto out_of_memory;
default:
BUG();
@@ -264,6 +333,7 @@ good_area:
tsk->thread.screen_bitmap |= 1 << bit;
}
up_read(&mm->mmap_sem);
+ check_atomic_kmap(regs);
return;
/*
@@ -272,6 +342,7 @@ good_area:
*/
bad_area:
up_read(&mm->mmap_sem);
+ check_atomic_kmap(regs);
/* User mode accesses just cause a SIGSEGV */
if (error_code & 4) {
--- 2.5.30/include/asm-i386/highmem.h~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/include/asm-i386/highmem.h Fri Aug 9 17:36:42 2002
@@ -22,6 +22,7 @@
#include <linux/config.h>
#include <linux/interrupt.h>
+#include <linux/percpu.h>
#include <asm/kmap_types.h>
#include <asm/tlbflush.h>
@@ -76,6 +77,8 @@ static inline void kunmap(struct page *p
* be used in IRQ contexts, so in some (very limited) cases we need
* it.
*/
+extern int kmap_atomic_seq[KM_TYPE_NR] __per_cpu_data;
+
static inline void *kmap_atomic(struct page *page, enum km_type type)
{
enum fixed_addresses idx;
@@ -93,7 +96,7 @@ static inline void *kmap_atomic(struct p
#endif
set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
__flush_tlb_one(vaddr);
-
+ this_cpu(kmap_atomic_seq[type])++;
return (void*) vaddr;
}
--- 2.5.30/include/asm-i386/kmap_types.h~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/include/asm-i386/kmap_types.h Fri Aug 9 17:36:42 2002
@@ -19,7 +19,8 @@ D(5) KM_BIO_SRC_IRQ,
D(6) KM_BIO_DST_IRQ,
D(7) KM_PTE0,
D(8) KM_PTE1,
-D(9) KM_TYPE_NR
+D(9) KM_FILEMAP,
+D(10) KM_TYPE_NR
};
#undef D
--- 2.5.30/include/asm-i386/processor.h~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/include/asm-i386/processor.h Fri Aug 9 17:36:42 2002
@@ -488,4 +488,6 @@ extern inline void prefetchw(const void
#endif
+#define ARCH_HAS_KMAP_FIXUP
+
#endif /* __ASM_I386_PROCESSOR_H */
--- 2.5.30/include/asm-ppc/kmap_types.h~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/include/asm-ppc/kmap_types.h Fri Aug 9 17:36:42 2002
@@ -15,6 +15,7 @@ enum km_type {
KM_BIO_DST_IRQ,
KM_PTE0,
KM_PTE1,
+ KM_FILEMAP,
KM_TYPE_NR
};
--- 2.5.30/include/asm-sparc/kmap_types.h~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/include/asm-sparc/kmap_types.h Fri Aug 9 17:36:42 2002
@@ -9,6 +9,7 @@ enum km_type {
KM_USER1,
KM_BIO_SRC_IRQ,
KM_BIO_DST_IRQ,
+ KM_FILEMAP,
KM_TYPE_NR
};
--- 2.5.30/include/asm-x86_64/kmap_types.h~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/include/asm-x86_64/kmap_types.h Fri Aug 9 17:36:42 2002
@@ -9,6 +9,7 @@ enum km_type {
KM_USER1,
KM_BIO_SRC_IRQ,
KM_BIO_DST_IRQ,
+ KM_FILEMAP,
KM_TYPE_NR
};
--- 2.5.30/include/linux/highmem.h~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/include/linux/highmem.h Fri Aug 9 17:36:42 2002
@@ -3,6 +3,7 @@
#include <linux/config.h>
#include <linux/fs.h>
+#include <asm/processor.h>
#include <asm/cacheflush.h>
#ifdef CONFIG_HIGHMEM
@@ -10,6 +11,7 @@
extern struct page *highmem_start_page;
#include <asm/highmem.h>
+#include <asm/kmap_types.h>
/* declarations for linux/mm/highmem.c */
unsigned int nr_free_highpages(void);
@@ -71,5 +73,83 @@ static inline void copy_user_highpage(st
kunmap_atomic(vfrom, KM_USER0);
kunmap_atomic(vto, KM_USER1);
}
+
+#if defined(CONFIG_HIGHMEM) && defined(ARCH_HAS_KMAP_FIXUP)
+/*
+ * Used when performing a copy_*_user while holding an atomic kmap
+ */
+struct copy_user_state {
+ struct page *page; /* The page which is kmap_atomiced */
+ void *kaddr; /* Its mapping */
+ enum km_type type; /* Its kmap slot type */
+ int src; /* 1: fixup ESI. 0: Fixup EDI */
+ int cpu; /* CPU which the kmap was taken on */
+ int seq; /* The kmap's sequence number */
+};
+
+/*
+ * `src' is true if the kmap_atomic virtual address is the source of the copy.
+ */
+static inline void *
+kmap_copy_user(struct copy_user_state *cus, struct page *page,
+ enum km_type type, int src)
+{
+ cus->page = page;
+ cus->kaddr = kmap_atomic(page, type);
+ if (PageHighMem(page)) {
+ cus->type = type;
+ cus->src = src;
+ BUG_ON(current->copy_user_state != NULL);
+ current->copy_user_state = cus;
+ }
+ return cus->kaddr;
+}
+
+static inline void kunmap_copy_user(struct copy_user_state *cus)
+{
+ if (PageHighMem(cus->page)) {
+ BUG_ON(current->copy_user_state != cus);
+ kunmap_atomic(cus->kaddr, cus->type);
+ current->copy_user_state = NULL;
+ cus->page = NULL; /* debug */
+ }
+}
+
+/*
+ * After a copy_*_user, the kernel virtual address may be different. So
+ * use kmap_copy_user_addr() to get the new value.
+ */
+static inline void *kmap_copy_user_addr(struct copy_user_state *cus)
+{
+ return cus->kaddr;
+}
+
+#else
+
+struct copy_user_state {
+ struct page *page;
+};
+
+/*
+ * This must be a macro because `type' may be undefined
+ */
+
+#define kmap_copy_user(cus, page, type, src) \
+ ({ \
+ (cus)->page = (page); \
+ kmap(page); \
+ })
+
+static inline void kunmap_copy_user(struct copy_user_state *cus)
+{
+ kunmap(cus->page);
+}
+
+static inline void *kmap_copy_user_addr(struct copy_user_state *cus)
+{
+ return page_address(cus->page);
+}
+
+#endif
#endif /* _LINUX_HIGHMEM_H */
--- 2.5.30/include/linux/sched.h~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/include/linux/sched.h Fri Aug 9 17:36:42 2002
@@ -245,6 +245,8 @@ extern struct user_struct root_user;
typedef struct prio_array prio_array_t;
+struct copy_user_state;
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
struct thread_info *thread_info;
@@ -366,6 +368,9 @@ struct task_struct {
/* journalling filesystem info */
void *journal_info;
struct dentry *proc_dentry;
+#ifdef CONFIG_HIGHMEM
+ struct copy_user_state *copy_user_state;
+#endif
};
extern void __put_task_struct(struct task_struct *tsk);
--- 2.5.30/mm/filemap.c~kmap_atomic_reads Fri Aug 9 17:36:42 2002
+++ 2.5.30-akpm/mm/filemap.c Fri Aug 9 17:37:02 2002
@@ -16,6 +16,7 @@
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/iobuf.h>
@@ -1020,18 +1021,20 @@ no_cached_page:
UPDATE_ATIME(inode);
}
-int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
+int file_read_actor(read_descriptor_t *desc, struct page *page,
+ unsigned long offset, unsigned long size)
{
char *kaddr;
+ struct copy_user_state copy_user_state;
unsigned long left, count = desc->count;
if (size > count)
size = count;
- kaddr = kmap(page);
+ kaddr = kmap_copy_user(&copy_user_state, page, KM_FILEMAP, 1);
left = __copy_to_user(desc->buf, kaddr + offset, size);
- kunmap(page);
-
+ kunmap_copy_user(&copy_user_state);
+
if (left) {
size -= left;
desc->error = -EFAULT;
.
On Fri, 9 Aug 2002, Andrew Morton wrote:
>
> This patch allows the kernel to hold atomic kmaps across copy_*_user.
> From an idea by Linus and/or Martin Bligh and/or Andrea.
Argh.
I've come to hate this approach, I should have told you. That magic
%esi/%edi thing disturbs me, even if I was one of the people responsible
for polluting your virgin brain with the idea. It just makes me squirm,
not just because there may be memcopies that would prefer to use other
registers, but because I just think it's too damn fragile to play with
register contents from exceptions.
So I would suggest instead:
- do_page_fault() already does an
if (in_interrupt() || !mm)
goto no_context;
and the fact is, the "in_interrupt()" should really be an
"preempt_count()", since it's illegal to take a page fault not just in
interrupts, but while non-preemptible in general.
- now, if we do the copy_to/from_user() from a preempt-safe area, the
_existing_ code (with the above one-liner fix) already returns a
partial error (ie no new code-paths - copy_to/from_user() already has
to handle the EFAULT case)
- which means that we can do the kmap_copy_to_user() with _zero_ new
code, by just wrapping it something like this:
repeat:
kmap_atomic(..); // this increments preempt count
nr = copy_from_user(..);
kunmap_atomic(..);
/* bytes uncopied? */
if (nr) {
if (!get_user(dummy, start_addr) &&
!get_user(dummy, end_addr))
goto repeat;
.. handle EFAULT ..
}
Yes, the above requires some care about getting the details right, but
notice how it requires absolutely no magic new code, and how it actually
uses existing well-documented (and has-to-work-anyway) features.
And notice how it works as a _much_ more generic fix - the above actually
allows the true anti-deadlock thing where you can basically "test" whether
the page is already mapped with zero cost, and if it isn't mapped (and you
worry about deadlocking because you've already locked the page that we're
writing into), you can make the slow path do a careful "look up the page
tables by hand" thing.
In other words, you can use the above trick to get rid of that horrible
"__get_user(dummy..)" thing that is one huge big hack right now in
generic_file_write().
(And yes, it requires incrementing the preempt count in kmap/kunmap even
if preemption is otherwise disabled, big deal).
Linus
Linus Torvalds wrote:
>
> ...
> repeat:
> kmap_atomic(..); // this increments preempt count
> nr = copy_from_user(..);
> kunmap_atomic(..);
>
> /* bytes uncopied? */
> if (nr) {
> if (!get_user(dummy, start_addr) &&
> !get_user(dummy, end_addr))
> goto repeat;
> .. handle EFAULT ..
> }
>
> Yes, the above requires some care about getting the details right, but
> notice how it requires absolutely no magic new code, and how it actually
> uses existing well-documented (and has-to-work-anyway) features.
>
OK. The kunmap_atomic() could happen on a different CPU, which will
die with CONFIG_DEBUG_HIGHMEM but apart from that, looks much saner.
We'll need to manually fault in the user page on the
generic_file_read() path before taking the kmap, because reading
into an unmapped page is a common case: malloc/read.
Actually, p = malloc(lots); write(fd, p, lots); isn't totally
uncommon either, so the prefault on the write path would help
highmem machines (in which case it'd be best to leave it there
for all machines).
> And notice how it works as a _much_ more generic fix - the above actually
> allows the true anti-deadlock thing where you can basically "test" whether
> the page is already mapped with zero cost, and if it isn't mapped (and you
> worry about deadlocking because you've already locked the page that we're
> writing into), you can make the slow path do a careful "look up the page
> tables by hand" thing.
I don't understand what the pagetable walk is here for?
The kernel will sometimes need to read the page from disk to service
the fault, but it's locked...
We could drop the page lock before the __get_user, but that may
break the expectations of some filesystem's prepare/commit pair.
So I'm not clear on how we can lose the (racy, especially with
preemption) "one huge big hack".
The implicit use of preempt_count to mean "in kmap_copy_user" may
turn ugly. But if so another tsk->flags bit can be created. We'll
see...
On Fri, 9 Aug 2002, Andrew Morton wrote:
> Linus Torvalds wrote:
> >
> > ...
> > repeat:
> > kmap_atomic(..); // this increments preempt count
> > nr = copy_from_user(..);
> > kunmap_atomic(..);
> >
> > /* bytes uncopied? */
> > if (nr) {
> > if (!get_user(dummy, start_addr) &&
> > !get_user(dummy, end_addr))
> > goto repeat;
> > .. handle EFAULT ..
> > }
> >
> > Yes, the above requires some care about getting the details right, but
> > notice how it requires absolutely no magic new code, and how it actually
> > uses existing well-documented (and has-to-work-anyway) features.
> >
>
> OK. The kunmap_atomic() could happen on a different CPU, which will
> die with CONFIG_DEBUG_HIGHMEM but apart from that, looks much saner.
No no no.
It cannot happen on another CPU, since even if we take a page fault, we
will all be inside a preempt-safe region (the first thing kmap_atomic()
does is to increment the preempt count, the last thing the kunmap does is
to decrement it).
There's nothing that sleeps anywhere, there's nothing that can cause a
schedule. Exactly because the page fault handler will _see_ that we're in
a critical region, and will do the "fixup()" thing for us.
> We'll need to manually fault in the user page on the
> generic_file_read() path before taking the kmap, because reading
> into an unmapped page is a common case: malloc/read.
I actually suspect that most reads are fairly small, and the page already
exists. But who knows.. pre-loading is certainly easy (a single
instruction).
> > And notice how it works as a _much_ more generic fix - the above actually
> > allows the true anti-deadlock thing where you can basically "test" whether
> > the page is already mapped with zero cost, and if it isn't mapped (and you
> > worry about deadlocking because you've already locked the page that we're
> > writing into), you can make the slow path do a careful "look up the page
> > tables by hand" thing.
>
> I don't understand what the pagetable walk is here for?
>
> The kernel will sometimes need to read the page from disk to service
> the fault, but it's locked...
>
> We could drop the page lock before the __get_user, but that may
> break the expectations of some filesystem's prepare/commit pair.
The thing is, we can _notice_ when the bad case happens (same page), and
we can for that special case do special logic.
We couldn't do that before, simply because we can't afford to do the page
table walk all the time. But we _can_ afford to do it for the rare cases
that would trap (the deadlock being one of them).
Linus
Linus Torvalds wrote:
>
> ...
> > We'll need to manually fault in the user page on the
> > generic_file_read() path before taking the kmap, because reading
> > into an unmapped page is a common case: malloc/read.
>
> I actually suspect that most reads are fairly small, and the page already
> exists. But who knows.. pre-loading is certainly easy (a single
> instruction).
These things can be measured ;)
Across a `make -j6 bzImage' the kernel reads 166,000 pages via
file_read_actor(). And 31,000 of those generated a fault in
the copy_*_user. (It wrote 14,400 pages and, of course, none
of those faulted).
And if gcc is getting a fault 20% of the time, some other apps will
get many more. Which implies that we must prefault the page in
file_read_actor to get full benefit.
And if we do that, I'll bet you Victor's dollar that the fixup path
is never executed. I'd have to disable the prefault even to be able
to test it.
What would be nice is a way of formalising the prefault, to pin
the mm's pages across the copy_*_user() in some manner, perhaps?
On Fri, 9 Aug 2002, Andrew Morton wrote:
>
> What would be nice is a way of formalising the prefault, to pin
> the mm's pages across the copy_*_user() in some manner, perhaps?
Too easy to create a DoS-type attack with any trivial implementation.
However, I don't think pinning is worthwhile, since even if the page goes
away, the prefaulting was just a performance optimization. The code should
work fine without it. In fact, it would probably be good to _not_ prefault
for a development kernel, and verify that the code works without it. That
way we can sleep safe in the knowledge that there isn't some race through
code that requires the prefaulting..
I agree that if you could guarantee pinning the out-of-line code would be
a bit simpler, but since we have to handle the EFAULT case anyway, I doubt
that it is _that_ much simpler.
Also, there are actually advantages to doing it the "hard" way. If we ever
want to, we can actually play clever tricks that avoid doing the copy at
all with the slow path.
Example tricks: we can, if we want to, do a read() with no copy for a
common case by adding a COW-bit to the page cache, and if you do aligned
reads into a page that will fault on write, you can just map in the page
cache page directly, mark it COW in the page cache (assuming the page
count tells us we're the only user, of course), and mark it COW in the
mapping.
The nice thing is, this actually works correctly even if the user re-uses
the area for reading multiple times (because the read() will trap not
because the page isn't mapped, but because it is mapped COW on something
that will write to user space). The unmapped case is better, though, since
we don't need to do TLB invalidates for that case (which makes this
potentially worthwhile even on SMP).
I don't know if this is common, but it _would_ make read() have definite
advantages over mmap() on files that are seldom written to or mmap'ed in a
process (which is most of them, gut feel). In particular, once you fault
for _one_ page, you can just map in as many pages as the read() tried to
read in one go - so you can avoid any future work as well.
Imagine doing a
fstat(fd..)
buf = aligned_malloc(st->st_size)
read(fd, buf, st->st_size);
and having it magically populate the VM directly with the whole file
mapping, with _one_ failed page fault. And the above is actually a fairly
common thing. See how many people have tried to optimize using mmap vs
read, and what they _all_ really wanted was this "populate the pages in
one go" thing.
Is it a good idea? I don't know. But it would seem to fall very cleanly
out of the atomic kmap path - without affecting the fast path at _all_. It
would be a very specific and localized optimization, with no impact on the
rest of the system, since it's using the same fixup() logic that we have
to have anyway.
(Yeah, the COW bit on the page cache is special, and it would need page
mapping and obviously file writing to do something like
..
if (page->flags & PAGE_COW) {
page->flags &= ~PAGE_COW;
if (page->count > 1) {
remove-and-reinsert-new-page();
}
}
..
by hand before mapping it writable or writing to it. And the read()
optimization would _only_ work if nobody is using mmap() on the file at
the same time for those pages).
This would definitely be 2.7.x material, I'm just explaining why I like
the flexibility of the approach (as opposed to a very static "memcpy-only-
special-case" thing).
Linus
Linus Torvalds wrote:
>
> On Fri, 9 Aug 2002, Andrew Morton wrote:
> >
> > What would be nice is a way of formalising the prefault, to pin
> > the mm's pages across the copy_*_user() in some manner, perhaps?
>
> Too easy to create a DoS-type attack with any trivial implementation.
hmm, yes. The pin has to be held across ->prepare_write. That
tears it.
> However, I don't think pinning is worthwhile, since even if the page goes
> away, the prefaulting was just a performance optimization. The code should
> work fine without it. In fact, it would probably be good to _not_ prefault
> for a development kernel, and verify that the code works without it. That
> way we can sleep safe in the knowledge that there isn't some race through
> code that requires the prefaulting..
OK. That covers reads. But we need to do something short-term to get
these large performance benefits, and I don't know how to properly fix
the write deadlock. The choices here are:
- live with the current __get_user thing
- make filemap_nopage aware of the problem, via a new `struct page *'
in task_struct (this would be very messy on the reader side).
- or?
(Of course, the write deadlock is a different and longstanding
problem, and I don't _have_ to fix it here, weasel, weasel)
> I agree that if you could guarantee pinning the out-of-line code would be
> a bit simpler, but since we have to handle the EFAULT case anyway, I doubt
> that it is _that_ much simpler.
>
> Also, there are actually advantages to doing it the "hard" way. If we ever
> want to, we can actually play clever tricks that avoid doing the copy at
> all with the slow path.
>
> Example tricks: we can, if we want to, do a read() with no copy for a
> common case by adding a COW-bit to the page cache, and if you do aligned
> reads into a page that will fault on write, you can just map in the page
> cache page directly, mark it COW in the page cache (assuming the page
> count tells us we're the only user, of course), and mark it COW in the
> mapping.
glibc malloc currently returns well-aligned-address + 8. If
it were taught to return well-aligned-address+0 then presumably a
lot of applications would automatically benefit from these
zero-copy reads.
On Saturday 10 August 2002 09:25, Linus Torvalds wrote:
> Example tricks: we can, if we want to, do a read() with no copy for a
> common case by adding a COW-bit to the page cache, and if you do aligned
> reads into a page that will fault on write, you can just map in the page
> cache page directly, mark it COW in the page cache (assuming the page
> count tells us we're the only user, of course), and mark it COW in the
> mapping.
>
> The nice thing is, this actually works correctly even if the user re-uses
> the area for reading multiple times (because the read() will trap not
> because the page isn't mapped, but because it is mapped COW on something
> that will write to user space). The unmapped case is better, though, since
> we don't need to do TLB invalidates for that case (which makes this
> potentially worthwhile even on SMP).
>
> I don't know if this is common, but it _would_ make read() have definite
> advantages over mmap() on files that are seldom written to or mmap'ed in a
> process (which is most of them, gut feel). In particular, once you fault
> for _one_ page, you can just map in as many pages as the read() tried to
> read in one go - so you can avoid any future work as well.
>
> Imagine doing a
>
> fstat(fd..)
> buf = aligned_malloc(st->st_size)
> read(fd, buf, st->st_size);
>
> and having it magically populate the VM directly with the whole file
> mapping, with _one_ failed page fault. And the above is actually a fairly
> common thing. See how many people have tried to optimize using mmap vs
> read, and what they _all_ really wanted was this "populate the pages in
> one go" thing.
>
> Is it a good idea? I don't know. But it would seem to fall very cleanly
> out of the atomic kmap path - without affecting the fast path at _all_.
Sorry, this connection is too subtle for me. I see why we want to do
this, and in fact I've been researching how to do it for the last few
weeks, but I don't see how it's related to the atomic kmap path. Could
you please explain, in words of one syllable?
While I'm feeling disoriented, what exactly is the deadlock path for a
write from a mmaped, not uptodate page, to the same page? And why does
__get_user need to touch the page in *two* places to instantiate it?
Also, how do we know the page won't get evicted before grab_cache_page
gets to it?
--
Daniel
On Sat, 10 Aug 2002, Linus Torvalds wrote:
> and having it magically populate the VM directly with the whole file
> mapping, with _one_ failed page fault. And the above is actually a fairly
> common thing. See how many people have tried to optimize using mmap vs
> read, and what they _all_ really wanted was this "populate the pages in
> one go" thing.
If this is worth it, chances are prefaulting at mmap() time
could also be worth trying ... hmmm ;)
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
On Sat, 10 Aug 2002, Daniel Phillips wrote:
> Sorry, this connection is too subtle for me. I see why we want to do
> this, and in fact I've been researching how to do it for the last few
> weeks, but I don't see how it's related to the atomic kmap path. Could
> you please explain, in words of one syllable?
We cannot do that optimization generally. I'll give you two reasons, both
of which are sufficient on their own:
- doing the page table walk is simply slower than doing the memcpy if the
page is just there. So you have to have a good heuristic on when it
might be worthwhile to do page table tricks. That heuristic should
include "is the page directly accessible". Which is exactly what you
get if you have a "atomic copy_to_user() that returns failure if it
cannot be done atomically".
- Even if walking the page tables were to be fast (ie ignoring #1),
replacing a page in virtual memory is absolutely not. Especially not on
SMP, where replacing a page in memory implies doing CPU crosscalls in
order to invalidate the TLB on other CPU's for the old page. So before
you do the "clever VM stuff", you had better have a heuristic that says
"this page isn't mapped, so it doesn't need the expensive cross-calls".
Again: guess what gives you pretty much exactly that heuristic?
See? The fact is, "memcpy()" is damned fast for a lot of cases, because it
natively uses the TLB and existing caches. It's slow for other cases, but
you want to have a good _heuristic_ for when you might want to try to
avoid the slow case without avoiding the fast case. Without that heuristic
you can't do the optimization sanely.
And obviously the heuristic should be a really fast one. The atomic
copy_to_user() is the _perfect_ heuristic, because if it just does the
memcpy there is absolutely zero overhead (it just does it). The overhead
comes in only in the case where we're going to be slowed down by the fault
anyway, _and_ where we want to do the clever tricks.
> While I'm feeling disoriented, what exactly is the deadlock path for a
> write from a mmaped, not uptodate page, to the same page? And why does
> __get_user need to touch the page in *two* places to instantiate it?
It doesn't touch it twice. It touches _both_ of the potential pages that
will be involved in the memcpy - since the copy may well not be
page-aligned in user space.
The deadlock is when you do a write of a page into a mapping of the very
same page that isn't yet mapped. What happens is:
- the write has gotten the page lock. Since the write knows that the whole
page is going to be overwritten, it is _not_ marked uptodate, and the
old contents (garbage from the allocation) are left alone.
- the copy_from_user() pagefaults and tries to bring in the _same_ page
into user land.
- that involves reading in the page and making sure it is up-to-date
- but since the write has already locked the page, you now have a
deadlock. The write cannot continue, since it needs the old contents,
and the old contents cannot be read in since the write holds the page
lock.
The "copy_from_user() atomically" solves the problem quite nicely. If the
atomic copy fails, we can afford to do the things that we cannot afford to
do normally (because the thing never triggers under real load, and real
load absolutely _needs_ to not try to get the page up-to-date before the
write).
So with the atomic copy-from-user, we can trap the problem only when it is
a problem, and go full speed normally.
Linus
On Sat, 10 Aug 2002, Rik van Riel wrote:
> On Sat, 10 Aug 2002, Linus Torvalds wrote:
>
> > and having it magically populate the VM directly with the whole file
> > mapping, with _one_ failed page fault. And the above is actually a fairly
> > common thing. See how many people have tried to optimize using mmap vs
> > read, and what they _all_ really wanted was this "populate the pages in
> > one go" thing.
>
> If this is worth it, chances are prefaulting at mmap() time
> could also be worth trying ... hmmm ;)
Maybe, maybe not.
The advantage of read() is that it contains an implicit "madvise()", since
the read _tells_ us that it wants X pages.
A page fault does not tell us, and prefaulting can hurt us.
Linus
Linus Torvalds wrote:
> Imagine doing a
>
> fstat(fd..)
> buf = aligned_malloc(st->st_size)
> read(fd, buf, st->st_size);
>
> and having it magically populate the VM directly with the whole file
> mapping, with _one_ failed page fault. And the above is actually a fairly
> common thing. See how many people have tried to optimize using mmap vs
> read, and what they _all_ really wanted was this "populate the pages in
> one go" thing.
This will only provide the performance benefit when `aligned_malloc'
returns "fresh" memory, i.e. memory that has never been written to.
Assuming most programs use plain old `malloc', which could be taught to
align nicely, then the optimisation might occur when a program starts
up, but later on it's more likely to return memory which has been
written to and previously freed. So the performance becomes unpredictable.
But it's a nice way to optimise if you are _deliberately_ optimising a
user space program. First call mmap() to get some fresh pages, then
call read() to fill them. Slower on kernels without the optimisation,
fast on kernels with it. :-)
-- Jamie
On Sat, 10 Aug 2002, Jamie Lokier wrote:
>
> This will only provide the performance benefit when `aligned_malloc'
> returns "fresh" memory, i.e. memory that has never been written to.
Absolutely.
Think of the optimization as a way to give application writers a new way
of being efficient.
In particular, I remember when the gcc people were worried about the most
efficient way to read in a file for preprocessing (Neil Booth, mainly).
Neil did all these timings on where the cut-off point was for using mmap
vs just using read().
For people like that, wouldn't it be nice to just be able to tell them: if
you do X, we guarantee that you'll get optimal zero-copy performance for
reading a file.
> But it's a nice way to optimise if you are _deliberately_ optimising a
> user space program.
Exactly.
Linus
Linus Torvalds wrote:
> For people like that, wouldn't it be nice to just be able to tell them: if
> you do X, we guarantee that you'll get optimal zero-copy performance for
> reading a file.
Don't forget to include the need for mmap(... MAP_ANON ...) prior to the
read.
Given the user will need to establish a new mapping anyway, why pussy
foot around with subtleties? Just add a MAP_PREFAULT flag to mmap(),
which reads the whole file and maps it before returning.
-- Jamie
On Saturday 10 August 2002 19:01, Linus Torvalds wrote:
> On Sat, 10 Aug 2002, Daniel Phillips wrote:
> > Sorry, this connection is too subtle for me. I see why we want to do
> > this, and in fact I've been researching how to do it for the last few
> > weeks, but I don't see how it's related to the atomic kmap path. Could
> > you please explain, in words of one syllable?
>
> We cannot do that optimization generally. I'll give you two reasons, both
> of which are sufficient on their own:
>
> - doing the page table walk is simply slower than doing the memcpy if the
> page is just there. So you have to have a good heuristic on when it
> might be worthwhile to do page table tricks. That heuristic should
> include "is the page directly accessible". Which is exactly what you
> get if you have a "atomic copy_to_user() that returns failure if it
> cannot be done atomically".
>
> - Even if walking the page tables were to be fast (ie ignoring #1),
> replacing a page in virtual memory is absolutely not. Especially not on
> SMP, where replacing a page in memory implies doing CPU crosscalls in
> order to invalidate the TLB on other CPU's for the old page. So before
> you do the "clever VM stuff", you had better have a heuristic that says
> "this page isn't mapped, so it doesn't need the expensive cross-calls".
>
> Again: guess what gives you pretty much exactly that heuristic?
>
> See?
Yes, I see. Easy, when you put it that way.
> The fact is, "memcpy()" is damned fast for a lot of cases, because it
> natively uses the TLB and existing caches. It's slow for other cases, but
> you want to have a good _heuristic_ for when you might want to try to
> avoid the slow case without avoiding the fast case. Without that heuristic
> you can't do the optimization sanely.
>
> And obviously the heuristic should be a really fast one. The atomic
> copy_to_user() is the _perfect_ heuristic, because if it just does the
> memcpy there is absolutely zero overhead (it just does it). The overhead
> comes in only in the case where we're going to be slowed down by the fault
> anyway, _and_ where we want to do the clever tricks.
So the overhead consists of inc/deccing preempt_count around the
copy_*_user, which fakes do_page_fault into forcing an early return.
> > While I'm feeling disoriented, what exactly is the deadlock path for a
> > write from a mmaped, not uptodate page, to the same page? And why does
> > __get_user need to touch the page in *two* places to instantiate it?
>
> It doesn't touch it twice. It touches _both_ of the potential pages that
> will be involved in the memcpy - since the copy may well not be
> page-aligned in user space.
Oh duh. I stared at that for the longest time, without realizing there's no
alignment requirement.
> The deadlock is when you do a write of a page into a mapping of the very
> same page that isn't yet mapped. What happens is:
>
> - the write has gotten the page lock. Since the write knows that the whole
> page is going to be overwritten, it is _not_ marked uptodate, and the
> old contents (garbage from the allocation) are left alone.
>
> - the copy_from_user() pagefaults and tries to bring in the _same_ page
> into user land.
>
> - that involves reading in the page and making sure it is up-to-date
>
> - but since the write has already locked the page, you now have a
> deadlock. The write cannot continue, since it needs the old contents,
> and the old contents cannot be read in since the write holds the page
> lock.
>
> The "copy_from_user() atomically" solves the problem quite nicely. If the
> atomic copy fails, we can afford to do the things that we cannot afford to
> do normally (because the thing never triggers under real load, and real
> load absolutely _needs_ to not try to get the page up-to-date before the
> write).
>
> So with the atomic copy-from-user, we can trap the problem only when it is
> a problem, and go full speed normally.
That's all crystal clear now. (Though the way do_page_fault finesses
copy_from_user into returning early is a little - how should I put it -
opaque. Yes, I see it, but...)
I'm sure you're aware there's a lot more you can do with these tricks
than just zero-copy read - there's zero-copy write as well, and there
are both of the above, except a full pte page at a time. There could
even be a file to file copy if there were an interface for it.
I don't see what prevents the read optimization even with a mmapped
page, the page just becomes CoW in all of the mapped region, the read
destination and the page cache.
--
Daniel
On Sat, 10 Aug 2002, Daniel Phillips wrote:
> >
> > And obviously the heuristic should be a really fast one. The atomic
> > copy_to_user() is the _perfect_ heuristic, because if it just does the
> > memcpy there is absolutely zero overhead (it just does it). The overhead
> > comes in only in the case where we're going to be slowed down by the fault
> > anyway, _and_ where we want to do the clever tricks.
>
> So the overhead consists of inc/deccing preempt_count around the
> copy_*_user, which fakes do_page_fault into forcing an early return.
Well, I'm actually expecting that preempt will at some day be the normal
thing to do, so the inc/dec is not so much an overhead of the heuristic,
but a direct result of using "kmap_atomic()" in the first place.
But yes, for the non-preempters, there would be the overhead of doing the
preempt count thing.
That is a nice per-cpu non-atomic thing, and in a cacheline that has been
brought in as part of the system call logic anyway. It will dirty it,
though - and I don't know if that is the "normal" state of that line
otherwise.
[ Side note - one of the reasons I'd potentially like to move the
thread_info thing to the _top_ of the stack (instead of the bottom) is
that that way it could share the cacheline with the kernel stack that
gets dirtied on every kernel entry anyway. Dunno if it matters. ]
> > It doesn't touch it twice. It touches _both_ of the potential pages that
> > will be involved in the memcpy - since the copy may well not be
> > page-aligned in user space.
>
> Oh duh. I stared at that for the longest time, without realizing there's no
> alignment requirement.
Well, I will not claim that that code is very pretty or obvious.
Also, as-is, nobody has ever been able to prove that the pre-fetching as
it stands now really fixes the race, although it certainly makes it
practically speaking impossible to trigger.
> > So with the atomic copy-from-user, we can trap the problem only when it is
> > a problem, and go full speed normally.
>
> That's all crystal clear now. (Though the way do_page_fault finesses
> copy_from_user into returning early is a little - how should I put it -
> opaque. Yes, I see it, but...)
Well, yes. The whole "fixup" thing is certainly not the most obvious thing
ever written (and you can thank Richard Henderson for the approach), but
it has turned out to be a very useful thing to have. It removed all the
races we had between checking whether an area was properly mapped and
actually accessing that area (ie the old "verify_area()" approach), and
it's extremely efficient for the fast path (the fault path is a bit less
so, but ..)
> I'm sure you're aware there's a lot more you can do with these tricks
> than just zero-copy read - there's zero-copy write as well, and there
> are both of the above, except a full pte page at a time. There could
> even be a file to file copy if there were an interface for it.
The file-to-file copy is really nasty to do, for the simple reason that
one page really wants to have just one "owner". So while doing a
file-to-file copy is certainly possible, it tends to imply removing the
cached page from the source and inserting it into the destination.
Which is the right thing to do for streaming copies, but the _wrong_ thing
to do if the source is then used again.
Linus
On Sat, 10 Aug 2002, Jamie Lokier wrote:
>
> Don't forget to include the need for mmap(... MAP_ANON ...) prior to the
> read.
Ahhah! But I _don't_.
Yes, with read() you have to do a brk() or mmap(MAP_ANON) (and brk() is
the _much_ faster of the two).
But with mmap() you need to do a fstat() and a munmap() (while with read
you just re-use the area, and we'd do the right thing thanks to the
COW-ness of the pages).
So I don't think the MAP_ANON thing is a loss for the read.
And read() is often the much nicer interface, simply because you don't
need to worry about the size of the file up-front etc.
Also, because of the delayed nature of mmap()/fault, it has some strange
behaviour if somebody is editing your file in the middle of the compile -
with read() you might get strange syntax errors if somebody changes the
file half-way, but with mmap() your preprocessor may get a SIGSEGV in the
middle just because the file was truncated..
In general, I think read() tends to be the right (and simpler) interface
to use if you don't explicitly want to take advantage of the things mmap
offers (on-demand mappings, no-write-back pageouts, VM coherency etc).
Linus
On Saturday 10 August 2002 20:32, Linus Torvalds wrote:
> On Sat, 10 Aug 2002, Daniel Phillips wrote:
> > I'm sure you're aware there's a lot more you can do with these tricks
> > than just zero-copy read - there's zero-copy write as well, and there
> > are both of the above, except a full pte page at a time. There could
> > even be a file to file copy if there were an interface for it.
>
> The file-to-file copy is really nasty to do, for the simple reason that
> one page really wants to have just one "owner". So while doing a
> file-to-file copy is certainly possible, it tends to imply removing the
> cached page from the source and inserting it into the destination.
>
> Which is the right thing to do for streaming copies, but the _wrong_ thing
> to do if the source is then used again.
If the source is only used for reading it's fine, and you'd know that in
advance if the file is opened r/o.
I will admit that this one is pretty far out there, there is just a ton of
meat and potatoes cleanup work to do before these deathray-type features get
to the top of the stack. But when they do, it's going to be fun.
--
Daniel
Linus Torvalds wrote:
> And read() is often the much nicer interface, simply because you don't
> need to worry about the size of the file up-front etc.
>
> Also, because of the delayed nature of mmap()/fault, it has some strange
> behaviour if somebody is editing your file in the middle of the compile -
> with read() you might get strange syntax errors if somebody changes the
> file half-way, but with mmap() your preprocessor may get a SIGSEGV in the
> middle just because the file was truncated..
>
> In general, I think read() tends to be the right (and simpler) interface
> to use if you don't explicitly want to take advantage of the things mmap
> offers (on-demand mappings, no-write-back pageouts, VM coherency etc).
While working on a race-free rewrite of cp/mv/rm (suggested by Al), I
did overall-time benchmarks on read+write versus sendfile/stat versus
mmap/stat, and found that pretty much the fastest way under Linux 2.2,
2.4, and solaris was read+write of PAGE_SIZE, or PAGE_SIZE*2 chunks.
[obviously, 2.2 and solaris didn't do sendfile test]
The overhead of the extra stat and mmap/munmap syscalls seemed to be the
thing that slowed things down. sendfile was pretty fast, but still an
extra syscall, with an annoyingly large error handling case [only
certain files can be sendfile'd]
I sure would like an O_STREAMING flag, though... let a user app hint to
the system that the pages it is reading or writing are perhaps less
likely to be reused, or access randomly.... A copy-file syscall would
be nice, too, but that's just laziness talking....
Jeff
Christoph Hellwig wrote:
> On Sat, Aug 10, 2002 at 02:52:49PM -0400, Jeff Garzik wrote:
>
>>While working on a race-free rewrite of cp/mv/rm (suggested by Al), I
>>did overall-time benchmarks on read+write versus sendfile/stat versus
>>mmap/stat, and found that pretty much the fastest way under Linux 2.2,
>>2.4, and solaris was read+write of PAGE_SIZE, or PAGE_SIZE*2 chunks.
>>[obviously, 2.2 and solaris didn't do sendfile test]
>
>
> Solaris 9 (and Solaris 8 with a certain patch) support Linux-style
> sendfile(). Linux 2.5 on the other hand doesn't support sendfile to
> files anymore..
Really? Bummer :) That was a useful hack for some cases...
On Sat, Aug 10, 2002 at 02:52:49PM -0400, Jeff Garzik wrote:
> While working on a race-free rewrite of cp/mv/rm (suggested by Al), I
> did overall-time benchmarks on read+write versus sendfile/stat versus
> mmap/stat, and found that pretty much the fastest way under Linux 2.2,
> 2.4, and solaris was read+write of PAGE_SIZE, or PAGE_SIZE*2 chunks.
> [obviously, 2.2 and solaris didn't do sendfile test]
Solaris 9 (and Solaris 8 with a certain patch) support Linux-style
sendfile(). Linux 2.5 on the other hand doesn't support sendfile to
files anymore..
Linus Torvalds wrote:
> Yes, with read() you have to do a brk() or mmap(MAP_ANON) (and brk() is
> the _much_ faster of the two).
Ouch, that means a typical user-space program/library that wants to use
this technique has to have an intimate relationship with its malloc()
implementation: it's not in general safe to call brk() unless you are
the memory allocator. (Yes, I know you can call brk() with Glibc's
malloc(), but... dependencies upon dependencies!) And even when it is
safe to allocate with brk(), there's no safe way to free that memory.
So this would be fine for the stdio built in to Glibc, perhaps.
> But with mmap() you need to do a fstat() and a munmap() (while with read
> you just re-use the area, and we'd do the right thing thanks to the
> COW-ness of the pages).
Granted, you might re-use the area if you're doing block reads like
stdio, compiler, XML parser etc. But not a few programs want to:
1. Allocate enough memory to hold whole file.
2. Load file into memory.
> Also, because of the delayed nature of mmap()/fault, it has some strange
> behaviour if somebody is editing your file in the middle of the compile -
> with read() you might get strange syntax errors if somebody changes the
> file half-way, but with mmap() your preprocessor may get a SIGSEGV in the
> middle just because the file was truncated..
Isn't that SIGBUS :-)
(Not that the architectures are at all consistent on this..)
> In general, I think read() tends to be the right (and simpler) interface
> to use if you don't explicitly want to take advantage of the things mmap
> offers (on-demand mappings, no-write-back pageouts, VM coherency etc).
I agree, although I think this particular optimisation requires some
quite unusual preparation by user space - I still think GCC would need
to call open/fstat/mmap/read/munmap/close.
You've rightly pointed out that memcpy() is faster for a page, rather
than VM tweaking. But this isn't true of large reads, is it?
Then the TLB invalidation cost could, in principle, be amortised over
the whole large read.
-- Jamie
On Sat, 10 Aug 2002, Jamie Lokier wrote:
>
> You've rightly pointed out that memcpy() is faster for a page, rather
> than VM tweaking. But this isn't true of large reads, is it?
> Then the TLB invalidation cost could, in principle, be amortised over
> the whole large read.
Yes. We could make the special case be just for large reads, and amortise
the cost of VM handling etc. That's especially true since a single page
table lookup can look up a lot of pages, so you amortise more than just
the TLB invalidation cost.
I have no idea where the cut-off point would be, and it will probably
depend quite a lot on whether the reader will write to the pages it read
from (causing COW faults) or not. If the read()'er will write to them, VM
tricks probably never pay off (since you will just be delaying the copy
and adding more page faults), so the question is what the common behaviour
is.
I _suspect_ that the common behaviour is to read just a few kB at a time
and that it basically doesn't ever really pay to play VM games.
(The "repeated read of a few kB" case is also likely to be the
best-performing behaviour, simply because it's usually _better_ to do many
small reads that re-use the cache than it is to do one large read that
blows your cache and TLB. Of course, that all depends on what your
patterns are after the read - do you want to have the whole file
accessible or not).
Anyway, this really is more food for thought than anything else, since
this is definitely not anything for 2.6.x. The page cache impact of doing
VM games is going to be noticeable too (because of the COW-by-hand
issues), and the VM behaviour in general changes.
For example, what do you do when somebody has a COW-page mapped into its
VM space and you want to start paging stuff out? There are "interesting"
cases that just may mean that doing the COW thing is a really stupid thing
to do, even if it is intriguing to _think_ about it.
Linus
Linus Torvalds wrote:
>
> ...
> - do_page_fault() already does an
>
> if (in_interrupt() || !mm)
> goto no_context;
>
> and the fact is, the "in_interrupt()" should really be an
> "preempt_count()", since it's illegal to take a page fault not just in
> interrupts, but while non-preemptible in general.
>
gargh. preempt_disable (and, hence, kmap_atomic) do not bump
the preempt counter with CONFIG_PREEMPT=n.
Is there a plan to change this?
If not, I don't think it's worth making this change just for
the highmem read/write thing (calculating `current' at each
spin_lock site...) I just open coded it.
This works. I still need to do the other architectures' fault
handlers, do writes and test it for more than seven seconds.
arch/i386/mm/fault.c | 6 +++---
include/linux/preempt.h | 14 ++++++++++++--
2 files changed, 15 insertions(+), 5 deletions(-)
--- 2.5.30/arch/i386/mm/fault.c~atomic-copy_user Sat Aug 10 14:44:03 2002
+++ 2.5.30-akpm/arch/i386/mm/fault.c Sat Aug 10 14:44:52 2002
@@ -189,10 +189,10 @@ asmlinkage void do_page_fault(struct pt_
info.si_code = SEGV_MAPERR;
/*
- * If we're in an interrupt or have no user
- * context, we must not take the fault..
+ * If we're in an interrupt, have no user context or are running in an
+ * atomic region then we must not take the fault..
*/
- if (in_interrupt() || !mm)
+ if (preempt_count() || !mm)
goto no_context;
#ifdef CONFIG_X86_REMOTE_DEBUG
--- 2.5.30/include/linux/preempt.h~atomic-copy_user Sat Aug 10 16:18:50 2002
+++ 2.5.30-akpm/include/linux/preempt.h Sat Aug 10 16:20:16 2002
@@ -5,19 +5,29 @@
#define preempt_count() (current_thread_info()->preempt_count)
+#define inc_preempt_count() \
+do { \
+ preempt_count()++; \
+} while (0)
+
+#define dec_preempt_count() \
+do { \
+ preempt_count()--; \
+} while (0)
+
#ifdef CONFIG_PREEMPT
extern void preempt_schedule(void);
#define preempt_disable() \
do { \
- preempt_count()++; \
+ inc_preempt_count(); \
barrier(); \
} while (0)
#define preempt_enable_no_resched() \
do { \
- preempt_count()--; \
+ dec_preempt_count(); \
barrier(); \
} while (0)
filemap.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 49 insertions(+), 2 deletions(-)
--- 2.5.30/mm/filemap.c~kmap_atomic_reads Sat Aug 10 17:09:47 2002
+++ 2.5.30-akpm/mm/filemap.c Sat Aug 10 17:27:35 2002
@@ -1020,7 +1020,37 @@ no_cached_page:
UPDATE_ATIME(inode);
}
-int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
+/*
+ * Fault a userspace page into pagetables. Return non-zero on EFAULT.
+ * FIXME: this assumes that two userspace pages are always sufficient. That's
+ * not true if PAGE_CACHE_SIZE > PAGE_SIZE.
+ */
+static inline int fault_in_page_writeable(char *uaddr, int size)
+{
+ int ret;
+
+ /*
+ * Writing zeroes into userspace here is OK, because we know that if
+ * the zero gets there, we'll be overwriting it.
+ */
+ ret = __put_user(0, uaddr);
+ if (ret == 0) {
+ char *end = uaddr + size - 1;
+
+ /*
+ * If the page was already mapped, this will get a cache miss
+ * for sure, so try to avoid doing it. This is only useful if
+ * userspace is doing page-aligned IO, which is rare. Lose it?
+ */
+ if (((unsigned long)uaddr & PAGE_MASK) !=
+ ((unsigned long)end & PAGE_MASK))
+ ret = __put_user(0, end);
+ }
+ return ret;
+}
+
+int file_read_actor(read_descriptor_t *desc, struct page *page,
+ unsigned long offset, unsigned long size)
{
char *kaddr;
unsigned long left, count = desc->count;
@@ -1028,14 +1058,31 @@ int file_read_actor(read_descriptor_t *
if (size > count)
size = count;
+ /*
+ * Faults on the destination of a read are common, so do it before
+ * taking the kmap.
+ */
+ if (!fault_in_page_writeable(desc->buf, size)) {
+ kaddr = kmap_atomic(page, KM_USER0);
+ inc_preempt_count(); /* An atomic copy_to_user */
+ left = __copy_to_user(desc->buf, kaddr + offset, size);
+ dec_preempt_count();
+ kunmap_atomic(kaddr, KM_USER0);
+ if (left == 0)
+ goto success;
+ printk("%s: Unexpected page fault\n", __FUNCTION__);
+ }
+
+ /* Do it the slow way */
kaddr = kmap(page);
left = __copy_to_user(desc->buf, kaddr + offset, size);
kunmap(page);
-
+
if (left) {
size -= left;
desc->error = -EFAULT;
}
+success:
desc->count = count - size;
desc->written += size;
desc->buf += size;
.
On Sat, 10 Aug 2002, Andrew Morton wrote:
>
> If not, I don't think it's worth making this change just for
> the highmem read/write thing (calculating `current' at each
> spin_lock site...) I just open coded it.
Well, this way it will now do the preempt count twice (once in
kmap_atomic, once in the open-coded one) if preempt is enabled.
I'd suggest just making k[un]map_atomic() always do the
inc/dec_preempt_count. Other ideas?
Linus
Linus Torvalds wrote:
>
> On Sat, 10 Aug 2002, Andrew Morton wrote:
> >
> > If not, I don't think it's worth making this change just for
> > the highmem read/write thing (calculating `current' at each
> > spin_lock site...) I just open coded it.
>
> Well, this way it will now do the preempt count twice (once in
> kmap_atomic, once in the open-coded one) if preempt is enabled.
>
> I'd suggest just making k[un]map_atomic() always do the
> inc/dec_preempt_count. Other ideas?
>
Well the optimum solution there would be to create and use
`inc_preempt_count_non_preempt()'. I don't see any
way of embedding this in kmap_atomic() or copy_to_user_atomic()
without loss of flexibility or incurring a double-inc somewhere.
Please let my post-virginal brain know if you're not otherwise OK
with the approach ;)
arch/i386/mm/fault.c | 6 +++---
include/linux/preempt.h | 24 ++++++++++++++++++++++--
2 files changed, 25 insertions(+), 5 deletions(-)
--- 2.5.30/arch/i386/mm/fault.c~atomic-copy_user Sat Aug 10 14:44:03 2002
+++ 2.5.30-akpm/arch/i386/mm/fault.c Sat Aug 10 14:44:52 2002
@@ -189,10 +189,10 @@ asmlinkage void do_page_fault(struct pt_
info.si_code = SEGV_MAPERR;
/*
- * If we're in an interrupt or have no user
- * context, we must not take the fault..
+ * If we're in an interrupt, have no user context or are running in an
+ * atomic region then we must not take the fault..
*/
- if (in_interrupt() || !mm)
+ if (preempt_count() || !mm)
goto no_context;
#ifdef CONFIG_X86_REMOTE_DEBUG
--- 2.5.30/include/linux/preempt.h~atomic-copy_user Sat Aug 10 16:18:50 2002
+++ 2.5.30-akpm/include/linux/preempt.h Sat Aug 10 18:23:40 2002
@@ -5,19 +5,29 @@
#define preempt_count() (current_thread_info()->preempt_count)
+#define inc_preempt_count() \
+do { \
+ preempt_count()++; \
+} while (0)
+
+#define dec_preempt_count() \
+do { \
+ preempt_count()--; \
+} while (0)
+
#ifdef CONFIG_PREEMPT
extern void preempt_schedule(void);
#define preempt_disable() \
do { \
- preempt_count()++; \
+ inc_preempt_count(); \
barrier(); \
} while (0)
#define preempt_enable_no_resched() \
do { \
- preempt_count()--; \
+ dec_preempt_count(); \
barrier(); \
} while (0)
@@ -34,6 +44,9 @@ do { \
preempt_schedule(); \
} while (0)
+#define inc_preempt_count_non_preempt() do { } while (0)
+#define dec_preempt_count_non_preempt() do { } while (0)
+
#else
#define preempt_disable() do { } while (0)
@@ -41,6 +54,13 @@ do { \
#define preempt_enable() do { } while (0)
#define preempt_check_resched() do { } while (0)
+/*
+ * Sometimes we want to increment the preempt count, but we know that it's
+ * already incremented if the kernel is compiled for preemptibility.
+ */
+#define inc_preempt_count_non_preempt() inc_preempt_count()
+#define dec_preempt_count_non_preempt() dec_preempt_count()
+
#endif
#endif /* __LINUX_PREEMPT_H */
.
On Sat, Aug 10, 2002 at 03:42:29PM -0700, Linus Torvalds wrote:
> I _suspect_ that the common behaviour is to read just a few kB at a time
> and that is basically doesn't ever really pay to play VM games.
>
> (The "repeated read of a few kB" case is also likely to be the
> best-performing behaviour, simply because it's usually _better_ to do many
> small reads that re-use the cache than it is to do one large read that
> blows your cache and TLB. Of course, that all depends on what your
> patterns are after the read - do you want to have the whole file
> accessible or not).
This is only somewhat related, but I'm wondering if the cache effects
also apply to readahead block sizes. Sequential page-sized read()s from
a file cause readahead to kick in and grow in size. Over time, it ends
up using very large blocks. Would it be beneficial to keep the readahead
size smaller so that it still stays in cache?
Also, this use of large blocks shouldn't really matter, but I'm seeing a
problem where the process ends up sleeping for most of the time,
switching between CPU and I/O rather than simply having the I/O for the
next read() occur in advance of the current read().
The problem appears to be that readahead isn't awakening the process to
present partial results. The blocks get so large that the process
switches between running and being blocked in I/O, which decreases
overall performance (think of a "grep" process that at 100% CPU can just
saturate the disk I/O). Working correctly, readahead would not get in
the way, it would just have blocks ready for "grep" to use, and grep
would use all of the CPU not being used for I/O. Currently, grep sleeps
50% of the time waiting on I/O.
This problem is showing up with NFS over a slow link, causing streaming
audio to be unusable. On the other end of the speed scale, it probably
also affects "grep" and other applications reading from hard disks, etc.
To demonstrate the problem reliably, I've used "strace -r cat" on a
floppy, which is a sufficiently slow medium. :) This is on a 2.4.19
kernel, but 2.5 behaves similarly. Note how the readahead starts small
and gets very large. Also, note how the start of the first larger
readahead occurs shortly after a previous read, and that it blocks early
even though the data should already be there (4.9 seconds). It also
appears to stumble a bit later on. read() times show up as the relative
time for the following write() (which is going to /dev/null):
0.000294 open("a/bigzero", O_RDONLY|O_LARGEFILE) = 3
0.000258 fstat64(3, {st_mode=S_IFREG|0775, st_size=914432, ...}) = 0
0.000275 brk(0x804e000) = 0x804e000
0.000223 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.593615 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000807 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000730 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000878 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000209 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000642 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000304 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000482 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.647682 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000537 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000687 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000469 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000433 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000183 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000186 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.649228 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000541 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000500 read(3, "\0\0\0\0\0"..., 4096) = 4096
4.897722 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000535 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000190 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000505 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000199 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000486 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000485 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000193 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000434 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000858 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000221 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.001148 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000243 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000877 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000247 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000649 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000220 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000497 read(3, "\0\0\0\0\0"..., 4096) = 4096
6.615653 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.002430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000283 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000857 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000217 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000449 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000537 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000198 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000436 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000455 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000530 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000283 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000475 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000434 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.001341 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000470 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.001626 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.001282 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000278 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000481 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000186 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000467 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000581 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000203 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000662 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000199 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000492 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000201 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000484 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000189 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000433 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000440 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000448 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000188 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.001342 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000328 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000839 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000449 read(3, "\0\0\0\0\0"..., 4096) = 4096
1.031732 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000531 read(3, "\0\0\0\0\0"..., 4096) = 4096
6.154301 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000544 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000198 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000740 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000250 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000723 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000186 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000444 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000448 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000435 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.001227 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000196 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000454 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000597 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000207 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000497 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000196 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000199 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000452 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000727 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000221 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000486 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000187 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000650 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.001078 read(3, "\0\0\0\0\0"..., 4096) = 4096
7.004463 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000538 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000440 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000448 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000506 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.001446 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000283 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000469 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000494 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000175 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000487 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000193 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000484 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000684 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000220 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000500 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000201 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000484 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000189 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000543 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000190 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
7.407175 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000530 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000435 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000447 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000446 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000183 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000954 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000410 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000478 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000239 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000435 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000627 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000205 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.001126 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000468 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000537 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000221 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000489 read(3, "\0\0\0\0\0"..., 4096) = 4096
3.391947 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000529 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000573 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000193 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000486 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000436 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000531 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000588 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000444 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000447 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000425 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.001018 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000811 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000536 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000258 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000531 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000207 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000487 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000525 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000231 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000439 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000175 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000425 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000485 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000547 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000448 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000479 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000633 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000241 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000434 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
1.144692 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000533 read(3, "\0\0\0\0\0"..., 4096) = 4096
0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
0.000434 read(3, "\0\0\0\0\0"..., 4096) = 1024
0.000318 write(1, "\0\0\0\0\0"..., 1024) = 1024
0.000276 read(3, "", 4096) = 0
0.000184 close(3) = 0
0.000259 _exit(0) = ?
We probably want huge readahead to occur in the case where programs are
competing for I/O from the same device, but the latency here from slow
devices is horrible.
Simon-
[ Stormix Technologies Inc. ][ NetNation Communications Inc. ]
[ [email protected] ][ [email protected] ]
[ Opinions expressed are not necessarily those of my employers. ]
Simon Kirby wrote:
>
> On Sat, Aug 10, 2002 at 03:42:29PM -0700, Linus Torvalds wrote:
>
> > I _suspect_ that the common behaviour is to read just a few kB at a time
> > and that is basically doesn't ever really pay to play VM games.
> >
> > (The "repeated read of a few kB" case is also likely to be the
> > best-performing behaviour, simply because it's usually _better_ to do many
> > small reads that re-use the cache than it is to do one large read that
> > blows your cache and TLB. Of course, that all depends on what your
> > patterns are after the read - do you want to have the whole file
> > accessible or not).
>
> This is only somewhat related, but I'm wondering if the cache effects
> also apply to readahead block sizes. Sequential page-sized read()s from
> a file causes readahead to kick in and grow in size. Over time, it ends
> up using very large blocks. Would it be beneficial to keep the readahead
> size smaller so that it still stays in cache?
>
> Also, this use of large blocks shouldn't really matter, but I'm seeing a
> problem where the process ends up sleeping for most of the time,
> switching between CPU and I/O rather than simply having the I/O for the
> next read() occur in advance of the current read().
>
> The problem appears to be that readahead isn't awakening the process to
> present partial results. The blocks get so large that the process
> switches between running and being blocked in I/O, which decreases
> overall performance (think of a "grep" process that at 100% CPU can just
> saturate the disk I/O). Working correctly, readahead would not get in
> the way, it would just have blocks ready for "grep" to use, and grep
> would use all of the CPU not being used for I/O. Currently, grep sleeps
> 50% of the time waiting on I/O.
This is interesting.
The 2.5 readahead sort-of does the wrong thing for you. Note how fs/mpage.c:mpage_end_io_read() walks the BIO's pages backwards when
unlocking the pages. And also note that the BIOs are 64kbytes, and
the readahead window is up to 128k, etc.
See, a boring old commodity disk drive will read 10,000 pages per
second. The BIO code there is designed to *not* result in 10,000
context-switches per second in the common case. If the reader is
capable of processing the data faster than the disk then hold
them off and present them with large chunks of data.
And that's usually the right thing to do, because most bulk readers
read fast - if your grep is really spending 50% of its time not
asleep then you either have a very slow grep or a very fast IO
system. It's applications such as gzip, which perform a significant
amount of work crunching on the data which are interesting to study,
and which benefit from readahead.
But that's all disks. You're not talking about disks.
> This problem is showing up with NFS over a slow link, causing streaming
> audio to be unusable. On the other end of the speed scale, it probably
> also affects "grep" and other applications reading from hard disks, etc.
Well, the question is "is the link saturated"? If so then it's not
solvable. If it is not then that's a bug.
> To demonstrate the problem reliably, I've used "strace -r cat" on a
> floppy, which is a sufficiently slow medium. :) This is on a 2.4.19
> kernel, but 2.5 behaves similarly. Note how the readahead starts small
> and gets very large. Also, note how the start of the first larger
> readahead occurs shortly after a previous read, and that it blocks early
> even though the data should already be there (4.9 seconds). It also
> appears to stumble a bit later on. read() times show up as the relative
> time for the following write() (which is going /dev/null):
OK, it's doing 128k of readahead there, which is a bit gross for a floppy.
You can tune that down with `blockdev --setra N /dev/floppy'. The
defaults are not good, and I do intend to go through the various block
drivers and teach them to set their initial readahead size to something
appropriate.
But in this example, where the test is `cat', there is nothing to be gained,
I expect. The disk is achieving its peak bandwidth.
However if the application was encrypting the data, or playing it
through loudspeakers then this may not be appropriate behaviour.
The design goal for readahead is that if an application is capable
of processing 10 megabytes/second and the disk sustains 11 megabytes/sec
then the application should never sleep. (I was about to test this,
but `mke2fs /dev/fd0' oopses in 2.5.30. ho hum)
Tuning the readahead per-fd is easy to do in 2.5. It would be in
units of pages, even though for many requirements, milliseconds
is a more appropriate unit for readahead. The basic unit of wakeup
granularity is 64kbytes - the max size of a BIO. Reducing that
to 4k for floppies would fix it up for you. We need some more BIO
infrastructure for that, and that will happen. Then we can go and
wind back the max bio size for floppies.
With some additional radix tree work we can implement the posix_fadvise
system call nicely, and its POSIX_FADV_WILLNEED could be beneficial.
The infrastructure is in place for network filesystems to be able to
tune their own readahead and expose that to user space, although none
of that has been done.
I don't think fiddling with readahead either in the application, the
system setup or the kernel is a satisfactory way of fixing all this.
It needs asynchronous IO. Then the time-sensitive application
can explicitly manage its own readahead to its own requirements.
(Could do this with POSIX_FADV_WILLNEED as well).
So hmm. Good point, thanks. I'll go play some MP3's off floppies.
(Holy crap. 2.5.31! I'm outta here)
On Sunday 11 August 2002 00:42, Linus Torvalds wrote:
> For example, what do you do when somebody has a COW-page mapped into it's
> VM space and you want to start paging stuff out?
Clearly it requires a CoW break and swapping out that page won't free any
memory directly, but it will in turn allow the cache page to be dropped. I
suppose your point is that these ideas touch the system in a lot of places,
and right now the code is a little too irregular to withstand lathering on a
new layer of cruft. That's true, but <plug>the reverse mapping work
enables some fundamental VM simplifications that make a lot of things more
local, and so a better base for these new, sophisticated features is on its
way.</plug>
> There are "interesting"
> cases that just may mean that doing the COW thing is a really stupid thing
> to do, even if it is intriguing to _think_ about it.
It is good sport, but the real benefits are compelling and will only get more
so. For high end scientific uses (read supercomputing clusters) it's a cinch
developers will prefer high speed file operations that turn in nearly the
same raw performance on large transfers as O_DIRECT while not bypassing the
file cache.
--
Daniel
On Sat, Aug 10, 2002 at 11:07:44PM -0700, Andrew Morton wrote:
> This is interesting.
>
> The 2.5 readahead sort-of does the wrong thing for you. Note how
> fs/mpage.c:mpage_end_io_read() walks the BIO's pages backwards when
> unlocking the pages. And also note that the BIOs are 64kbytes, and
> the readahead window is up to 128k, etc.
>
> See, a boring old commodity disk drive will read 10,000 pages per
> second. The BIO code there is designed to *not* result in 10,000
> context-switches per second in the common case. If the reader is
> capable of processing the data faster than the disk then hold
> them off and present them with large chunks of data.
Hmm. I understand, but I now that I think about it a bit more, I think
I failed to notice the real problem:
The size of the readahead wouldn't matter if it actually prefetched the
data in advance. It's not doing that right now.
What's happening with my MP3 streaming is:
1. read(4k) gets data after a delay. xmms starts playing.
2. read(4k) gets some more data, right away, because readahead worked.
xmms continues.
...
3. read(4k) blocks for a long time while readahead starts up again and
reads a huge block of data. read() then returns the 4k. meanwhile,
xmms has underrun. xmms starts again.
4. goto 2.
It's really easy to see this behavior with the xmms-crossfade plugin and
a large buffer with "buffer debugging" display on. With tcpdump in
another window, I can see that the readahead doesn't start prefetching
until it's right near the end of the data it fetched last, rather than
doing it in advance. This is not obvious except in the case where
read() speed is limited by something like audio playback rates or heavy
processing times.
> But that's all disks. You're not talking about disks.
Well, my example with grep was assuming a CPU the speed of what I have
right now, not something modern. :) "bzip2 -9" would likely apply these
days.
> > This problem is showing up with NFS over a slow link, causing streaming
> > audio to be unusable. On the other end of the speed scale, it probably
> > also affects "grep" and other applications reading from hard disks, etc.
>
> Well, the question is "is the link saturated"? If so then it's not
> solvable. If is is not then that's a bug.
The link is not saturated, but it is used in huge bursts mixed with
periods of silence (where readahead is finished but has not yet started
the next block).
> OK, it's doing 128k of readahead there, which is a bit gross for a floppy.
> You can tune that down with `blockdev --setra N /dev/floppy'. The
Ooh, is there something like this for NFS?
> but `mke2fs /dev/fd0' oopses in 2.5.30. ho hum)
Yes, floppy in 2.5 has been broken for a while...
> So hmm. Good point, thanks. I'll go play some MP3's off floppies.
:)
Simon-
[ Simon Kirby ][ Network Operations ]
[ [email protected] ][ NetNation Communications ]
[ Opinions expressed are not necessarily those of my employer. ]
Simon Kirby wrote:
>
> With tcpdump in another window, I can see that the readahead doesn't
> start prefetching until it's right near the end of the data it
> fetched last, rather than doing it in advance.
That's a big fat bug. And it wouldn't be astonishing if my
shiny new readahead does the same thing - I haven't analysed/tested
this scenario. Shall though.
Knowing zero about NFS, this:
if (!PageError(page) && NFS_SERVER(inode)->rsize >= PAGE_CACHE_SIZE) {
error = nfs_readpage_async(file, inode, page);
goto out;
}
error = nfs_readpage_sync(file, inode, page);
would seem to indicate that it's important to have 4k or 8k rsize and
wsize.
> ...
>
> > OK, it's doing 128k of readahead there, which is a bit gross for a floppy.
> > You can tune that down with `blockdev --setra N /dev/floppy'. The
>
> Ooh, is there something like this for NFS?
In 2.4, /proc/sys/vm/[min|max]_readahead should affect NFS, I think.
In 2.5, no knobs yet. NFS is using the default_backing_dev_info's
readahead setting, which isn't tunable. It needs to create its
own backing_dev_info (probably per mount?), make each inode's
inode.i_data.backing_dev_info point at that backing_dev_info
structure and export it to userspace in some manner. Guess I
should have told Trond that ;)
> > but `mke2fs /dev/fd0' oopses in 2.5.30. ho hum)
>
> Yes, floppy in 2.5 has been broken for a while...
>
Well it's oopsing in the code which tries to work out the
device geometry:
generic_unplug_device (data=0x0) at /usr/src/25/include/asm/spinlock.h:117
117 {
(gdb) bt
#0 generic_unplug_device (data=0x0) at /usr/src/25/include/asm/spinlock.h:117
#1 0xc020b57c in __floppy_read_block_0 (bdev=0xf62c4e00) at floppy.c:3896
#2 0xc020b5f6 in floppy_read_block_0 (dev={value = 512}) at floppy.c:3915
#3 0xc020b745 in floppy_revalidate (dev={value = 512}) at floppy.c:3954
#4 0xc01448b7 in check_disk_change (bdev=0xf62c4e00) at block_dev.c:522
#5 0xc020b377 in floppy_open (inode=0xf54e5ec0, filp=0xf4baa1a0) at floppy.c:3808
#6 0xc0144bc6 in do_open (bdev=0xf62c4e00, inode=0xf54e5ec0, file=0xf4baa1a0) at block_dev.c:623
#7 0xc0144f63 in blkdev_open (inode=0xf54e5ec0, filp=0xf4baa1a0) at block_dev.c:740
#8 0xc013d83e in dentry_open (dentry=0xf62dc5e0, mnt=0xc3ff5ee0, flags=32768) at open.c:655
#9 0xc013d770 in filp_open (filename=0xf6362000 "/dev/fd0", flags=32768, mode=0) at open.c:624
#10 0xc013db4f in sys_open (filename=0xbffffb9c "/dev/fd0", flags=32768, mode=0) at open.c:800
#11 0xc0107123 in syscall_call () at stats.c:204
So if you use something with known geometry, like /dev/fd0h1440, it works!
Andrew Morton wrote:
>
> ...
> So if you use something with known geometry, like /dev/fd0h1440, it works!
No it doesn't. You can run mke2fs, but the result is a wreck.
Simon Kirby wrote:
>
> ...
> What's happening with my MP3 streaming is:
>
> 1. read(4k) gets data after a delay. xmms starts playing.
> 2. read(4k) gets some more data, right away, because readahead worked.
> xmms continues.
> ...
> 3. read(4k) blocks for a long time while readahead starts up again and
> reads a huge block of data. read() then returns the 4k. meanwhile,
> xmms has underrun. xmms starts again.
> 4. goto 2.
>
> It's really easy to see this behavior with the xmms-crossfade plugin and
> a large buffer with "buffer debugging" display on.
I happen to have a little test app for this stuff:
http://www.zip.com.au/~akpm/linux/stream.tar.gz
You can use it to slowly read or write a file.
./stream -i /dev/fd0h1440 23 1000
will read 1000k from floppy at 23k per second. It's a bit
useless at those rates on 2.4 because of the coarse timer
resolution. But in 1000Hz 2.5 it works a treat.
./stream -i /dev/fd0h1440 20 1000 0.00s user 0.01s system 0% cpu 51.896 total
./stream -i /dev/fd0h1440 21 1000 0.00s user 0.02s system 0% cpu 49.825 total
./stream -i /dev/fd0h1440 22 1000 0.00s user 0.02s system 0% cpu 47.843 total
./stream -i /dev/fd0h1440 23 1000 0.00s user 0.01s system 0% cpu 45.853 total
./stream -i /dev/fd0h1440 24 1000 0.01s user 0.02s system 0% cpu 44.077 total
./stream -i /dev/fd0h1440 25 1000 0.00s user 0.02s system 0% cpu 42.307 total
./stream -i /dev/fd0h1440 26 1000 0.00s user 0.01s system 0% cpu 41.305 total
./stream -i /dev/fd0h1440 27 1000 0.00s user 0.02s system 0% cpu 40.493 total
./stream -i /dev/fd0h1440 28 1000 0.01s user 0.02s system 0% cpu 39.122 total
./stream -i /dev/fd0h1440 29 1000 0.00s user 0.01s system 0% cpu 39.118 total
What we see here is perfect readahead behaviour. The kernel is keeping the
read streaming ahead of the application's read cursor all the way out to the
point where the device is saturated. (The numbers are all off by three
seconds because of the initial spinup delay).
If you strace it, the reads are smooth on 2.4 and 2.5.
So it may be an NFS peculiarity. That's a bit hard for me to test over
100bT.
On Sat, 10 Aug 2002, Simon Kirby wrote:
>
> This is only somewhat related, but I'm wondering if the cache effects
> also apply to readahead block sizes. Sequential page-sized read()s from
> a file causes readahead to kick in and grow in size. Over time, it ends
> up using very large blocks. Would it be beneficial to keep the readahead
> size smaller so that it still stays in cache?
Any sane IO subsystem will do the actual IO using DMA, and not pollute the
cache for read[aheads] until the point where it is actually _used_.
> Also, this use of large blocks shouldn't really matter, but I'm seeing a
> problem where the process ends up sleeping for most of the time,
> switching between CPU and I/O rather than simply having the I/O for the
> next read() occur in advance of the current read().
>
> The problem appears to be that readahead isn't awakening the process to
> present partial results.
You're not the only one complaining about this.
I _think_ that the problem is not the read-ahead code, but some of the
block layer stuff. It appears that the read-ahead code is so successful
that we generate one large request for all of it, and we won't be waking
things up as they come in, but only after the whole request is done.
This is due to Andrew's bio work. It decreases CPU load, but it sure as
hell does seem to decrease parallelism too, which is bad.
Basically, it _used_ to be that each page got woken up one at a time as
they became ready after IO. With the new scheme, they all get woken up
together in "mpage_end_io_read()" (or write, but since people usually
don't wait for writes..).
At least that is how I read the code. Andrew?
On the other hand, for most high-end controllers, you aren't even likely
to get notified in the middle anyway, since the controller will just do
the whole dang IO request in one go, and only notify us when it is totally
done.
> This problem is showing up with NFS over a slow link, causing streaming
> audio to be unusable. On the other end of the speed scale, it probably
> also affects "grep" and other applications reading from hard disks, etc.
Hmm.. NFS should be a totally different kettle of fish. Although the
read-ahead code is shared, I think the NFS client should be returning
successes one page at a time.
Jens, Trond and Andrew Cc'd for comments and your "strace" showing a
6/7-second latency is appended.
Linus
---
> To demonstrate the problem reliably, I've used "strace -r cat" on a
> floppy, which is a sufficiently slow medium. :) This is on a 2.4.19
> kernel, but 2.5 behaves similarly. Note how the readahead starts small
> and gets very large. Also, note how the start of the first larger
> readahead occurs shortly after a previous read, and that it blocks early
> even though the data should already be there (4.9 seconds). It also
> appears to stumble a bit later on. read() times show up as the relative
> time for the following write() (which is going to /dev/null):
>
> 0.000294 open("a/bigzero", O_RDONLY|O_LARGEFILE) = 3
> 0.000258 fstat64(3, {st_mode=S_IFREG|0775, st_size=914432, ...}) = 0
> 0.000275 brk(0x804e000) = 0x804e000
> 0.000223 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.593615 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000807 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000730 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000878 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000209 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000642 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000304 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000482 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.647682 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000537 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000687 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000469 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000433 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000183 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000186 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.649228 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000541 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000500 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 4.897722 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000535 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000190 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000505 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000199 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000486 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000485 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000193 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000434 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000858 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000221 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.001148 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000243 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000877 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000247 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000649 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000220 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000497 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 6.615653 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.002430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000283 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000857 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000217 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000449 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000537 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000198 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000436 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000455 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000530 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000283 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000475 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000434 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.001341 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000470 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.001626 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.001282 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000278 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000481 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000186 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000467 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000581 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000203 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000662 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000199 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000492 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000201 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000484 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000189 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000433 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000440 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000448 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000188 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.001342 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000328 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000839 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000449 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 1.031732 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000531 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 6.154301 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000544 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000198 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000740 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000250 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000723 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000186 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000444 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000448 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000435 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.001227 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000196 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000454 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000597 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000207 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000497 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000196 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000199 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000452 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000727 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000221 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000486 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000187 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000650 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.001078 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 7.004463 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000538 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000440 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000448 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000506 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.001446 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000283 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000469 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000494 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000175 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000487 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000193 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000484 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000684 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000220 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000500 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000201 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000484 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000189 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000543 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000190 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 7.407175 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000530 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000435 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000447 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000446 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000183 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000954 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000410 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000478 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000239 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000435 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000429 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000627 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000205 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.001126 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000468 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000537 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000221 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000489 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 3.391947 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000529 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000573 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000193 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000486 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000191 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000436 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000531 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000588 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000444 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000447 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000425 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.001018 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000811 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000536 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000258 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000531 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000207 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000487 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000525 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000231 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000439 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000430 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000175 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000425 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000426 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000431 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000485 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000547 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000194 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000448 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000479 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000177 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000176 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000179 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000428 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000178 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000181 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000633 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000241 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000434 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000182 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000432 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000180 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000427 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 1.144692 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000533 read(3, "\0\0\0\0\0"..., 4096) = 4096
> 0.000185 write(1, "\0\0\0\0\0"..., 4096) = 4096
> 0.000434 read(3, "\0\0\0\0\0"..., 4096) = 1024
> 0.000318 write(1, "\0\0\0\0\0"..., 1024) = 1024
> 0.000276 read(3, "", 4096) = 0
> 0.000184 close(3) = 0
> 0.000259 _exit(0) = ?
>
> We probably want huge readahead to occur in the case where programs are
> competing for I/O from the same device, but the latency here from slow
> devices is horrible.
>
> Simon-
>
> [ Stormix Technologies Inc. ][ NetNation Communications Inc. ]
> [ [email protected] ][ [email protected] ]
> [ Opinions expressed are not necessarily those of my employers. ]
>
>
>
On Sun, 11 Aug 2002, Daniel Phillips wrote:
> On Sunday 11 August 2002 00:42, Linus Torvalds wrote:
> > For example, what do you do when somebody has a COW-page mapped into its
> > VM space and you want to start paging stuff out?
>
> Clearly it requires a CoW break and swapping out that page won't free any
> memory directly, but it will in turn allow the cache page to be dropped.
Well, that's the point. Is it really "clearly"?
One alternative is to just instead remove it from the page cache, and add
it to the swap cache directly (and unmapping it). In fact, I _think_ that
is the right thing to do (yes, it only works if the page count is 2 (one
for page cache, one for the VM mapping), but that's very different from
breaking the COW and generating two separate pages.
The "move directly to swap cache" is nice in that it doesn't add any new
pages. But it's nasty in that it steals pages from the file cache, so that
it basically turns a potentially sharable cache into a private cache that
nobody else will see.
See? You actually _do_ have choices on what to do.
Linus
On Sunday 11 August 2002 21:00, Linus Torvalds wrote:
> On Sun, 11 Aug 2002, Daniel Phillips wrote:
>
> > On Sunday 11 August 2002 00:42, Linus Torvalds wrote:
> > > For example, what do you do when somebody has a COW-page mapped into its
> > > VM space and you want to start paging stuff out?
> >
> > Clearly it requires a CoW break and swapping out that page won't free any
> > memory directly, but it will in turn allow the cache page to be dropped.
>
> Well, that's the point. Is it really "clearly"?
>
> One alternative is to just instead remove it from the page cache, and add
> it to the swap cache directly (and unmapping it). In fact, I _think_ that
> is the right thing to do (yes, it only works if the page count is 2 (one
> for page cache, one for the VM mapping), but that's very different from
> breaking the COW and generating two separate pages.
Far clearer ;-)
With reverse mapping it works for any page count.
> The "move directly to swap cache" is nice in that it doesn't add any new
> pages. But it's nasty in that it steals pages from the file cache, so that
> it basically turns a potentially sharable cache into a private cache that
> nobody else will see.
But you got it right the first time: we're evicting the page because it's
inactive and we want the memory for something else. We don't need to give
that page more second chances, it already had its share of chances before
it got this far in the eviction process. If the file page gets reloaded
before the swap-out completes it just means we chose the victim poorly
in the first place, or we're unlucky. The latter is supposed to be the
exception, not the rule.
> See? You actually _do_ have choices on what to do.
Yes, in this case, the correct thing and the dumb thing.
--
Daniel
Jeff Garzik writes:
> Linus Torvalds wrote:
> The overhead of the extra stat and mmap/munmap syscalls seemed to be the
> thing that slowed things down. sendfile was pretty fast, but still an
> extra syscall, with an annoyingly large error handling case [only
> certain files can be sendfile'd]
That error handling case sure does discourage sendfile use.
> I sure would like an O_STREAMING flag, though... let a user app hint to
> the system that the pages it is reading or writing are perhaps less
> likely to be reused, or accessed randomly.... A copy-file syscall would
> be nice, too, but that's just laziness talking....
You have a laptop computer with a USB-connected Ethernet.
You mount a NetApp or similar box via the SMB/CIFS protocol.
You see a multi-gigabyte file. You make a copy... ouch!!!
For each gigabyte, you hog the network for an hour.
Now let's say this file is for a MacOS app. You have to
preserve the creator, file type, resource fork, etc.
Linus Torvalds wrote:
>
> ...
> Basically, it _used_ to be that each page got woken up one at a time as
> they became ready after IO. With the new scheme, they all get woken up
> together in "mpage_end_io_read()" (or write, but since people usually
> don't wait for writes..).
>
> At least that is how I read the code. Andrew?
Yes. The basic unit of IO in there is a 64k BIO. So once readahead
is cruising, pages come unlocked in 16-page batches. In 2.4 they'll
come unlocked one at a time against a device such as a floppy drive.
But with default settings the readahead code lays one to two of these
BIOs out ahead of the read point, so the application never stumbles across
a locked page unless it's outpacing the device.
At least that's the theory, and the testing I did yesterday
was successful.
So I'd appreciate it if Simon could investigate a little further
with the test app I posted. Something is up, and it may not
be just an NFS thing. But note that nfs_readpage will go
synchronous if rsize is less than PAGE_CACHE_SIZE, so it has
to be set up right.
On Sun, 11 Aug 2002, Andrew Morton wrote:
>
> At least that's the theory, and the testing I did yesterday
> was successful.
Did you try Simon's test-case which seemed to be just a "cat" on a floppy
"To demonstrate the problem reliably, I've used "strace -r cat" on a
floppy, which is a sufficiently slow medium. :) This is on a 2.4.19
kernel, but 2.5 behaves similarly.")
although that may be different from the NFS issue, it is kind of
interesting: the perfect behaviour would be a steady stream of data, not
too many hiccups.
Linus
Linus Torvalds wrote:
>
> On Sun, 11 Aug 2002, Andrew Morton wrote:
> >
> > At least that's the theory, and the testing I did yesterday
> > was successful.
>
> Did you try Simon's test-case which seemed to be just a "cat" on a floppy
>
> "To demonstrate the problem reliably, I've used "strace -r cat" on a
> floppy, which is a sufficiently slow medium. :) This is on a 2.4.19
> kernel, but 2.5 behaves similarly.")
>
> although that may be different from the NFS issue, it is kind of
> interesting: the perfect behaviour would be a steady stream of data, not
> too many hiccups.
I did, but I cut you from the Cc...
> I happen to have a little test app for this stuff:
> http://www.zip.com.au/~akpm/linux/stream.tar.gz
>
> You can use it to slowly read or write a file.
>
> ./stream -i /dev/fd0h1440 23 1000
>
> will read 1000k from floppy at 23k per second. It's a bit
> useless at those rates on 2.4 because of the coarse timer
> resolution. But in 1000Hz 2.5 it works a treat.
>
> ./stream -i /dev/fd0h1440 20 1000 0.00s user 0.01s system 0% cpu 51.896 total
> ./stream -i /dev/fd0h1440 21 1000 0.00s user 0.02s system 0% cpu 49.825 total
> ./stream -i /dev/fd0h1440 22 1000 0.00s user 0.02s system 0% cpu 47.843 total
> ./stream -i /dev/fd0h1440 23 1000 0.00s user 0.01s system 0% cpu 45.853 total
> ./stream -i /dev/fd0h1440 24 1000 0.01s user 0.02s system 0% cpu 44.077 total
> ./stream -i /dev/fd0h1440 25 1000 0.00s user 0.02s system 0% cpu 42.307 total
> ./stream -i /dev/fd0h1440 26 1000 0.00s user 0.01s system 0% cpu 41.305 total
> ./stream -i /dev/fd0h1440 27 1000 0.00s user 0.02s system 0% cpu 40.493 total
> ./stream -i /dev/fd0h1440 28 1000 0.01s user 0.02s system 0% cpu 39.122 total
> ./stream -i /dev/fd0h1440 29 1000 0.00s user 0.01s system 0% cpu 39.118 total
>
> What we see here is perfect readahead behaviour. The kernel is keeping the
> read streaming ahead of the application's read cursor all the way out to the
> point where the device is saturated. (The numbers are all off by three
> seconds because of the initial spinup delay).
>
> If you strace it, the reads are smooth on 2.4 and 2.5.
>
> So it may be an NFS peculiarity. That's a bit hard for me to test over
> 100bT.
The strace of that app is smooth, all the way out to the peak disk
bandwidth.
So something is different either in the test or in Simon's setup. It
needs further investigation.
On Sun, Aug 11, 2002 at 08:28:12PM -0700, Andrew Morton wrote:
> So I'd appreciate it if Simon could investigate a little further
> with the test app I posted. Something is up, and it may not
> be just an NFS thing. But note that nfs_readpage will go
> synchronous if rsize is less than PAGE_CACHE_SIZE, so it has
> to be set up right.
You're right -- my NFS page size is set to 2048. I can't remember if I
did this because I was trying to work around huge read-ahead or because I
was trying to work around the bursts of high latency from my Terayon
cable modem (which idles at a slow line speed and "falls forward" to
higher speeds once it detects traffic, but with a delay, causing awful
latency at the expense of "better noise immunity"). Anyway, I will test
this tomorrow. I recall that 1024 byte-sized blocks were too small
because the latency of the cable modem would cause it to not have high
enough throughput, so I settled with 2048.
I haven't been able to test your application over NFS yet, but I did get
a chance to test it with a floppy. I was able to (on 2.4.19) reproduce a
case where even with just 5 KB/second reads, the read() would block every
so often (long strace attached).
I don't really trust my floppy device to read every sector successfully
on the first try, but at one point during this strace, I saw a point
where read() blocked and the floppy LED lit immediately, as if it had
done no preparation at all (it was not as if it was close and the motor
didn't spin up in time).
Some strace snippets:
[sroot@oof:/]# umount /a ; mount -t ext2 -o noatime,nodiratime /dev/fd0 /a
&& strace -o /tmp/strace.txt -r a/stream -i a/bigfile 5 1024
0.209706 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000242 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209861 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000318 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209665 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.538707 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.201309 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000298 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209675 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000226 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209711 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000290 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209757 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000308 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209863 read(3, "\0\0\0\0\0"..., 1024) = 1024
2.680966 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209039 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000359 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209648 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000308 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209711 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.209645 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000307 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209734 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000301 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209672 read(3, "\0\0\0\0\0"..., 1024) = 1024
2.964750 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.205464 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000316 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209628 read(3, "\0\0\0\0\0"..., 1024) = 1024
0.000302 nanosleep({0, 200000000}, {3221223736, 1073818412}) = 0
0.209792 read(3, "\0\0\0\0\0"..., 1024) = 1024
So, something does appear to be wrong. If I can actually mount a
filesystem on a floppy in 2.5, I'll see if the same thing happens.
Simon-
[ Simon Kirby ][ Network Operations ]
[ [email protected] ][ NetNation Communications ]
[ Opinions expressed are not necessarily those of my employer. ]
Simon Kirby wrote:
>
> On Sun, Aug 11, 2002 at 08:28:12PM -0700, Andrew Morton wrote:
>
> > So I'd appreciate it if Simon could investigate a little further
> > with the test app I posted. Something is up, and it may not
> > be just an NFS thing. But note that nfs_readpage will go
> > synchronous if rsize is less than PAGE_CACHE_SIZE, so it has
> > to be set up right.
>
> You're right -- my NFS page size is set to 2048. I can't remember if I
> did this because I was trying to work around huge read-ahead or because I
> was trying to work around the bursts of high latency from my Terayon
> cable modem (which idles at a slow line speed and "falls forward" to
> higher speeds once it detects traffic, but with a delay, causing awful
> latency at the expense of "better noise immunity"). Anyway, I will test
> this tomorrow. I recall that 1024 byte-sized blocks were too small
> because the latency of the cable modem would cause it to not have high
> enough throughput, so I settled with 2048.
OK, thanks.
> I haven't been able to test your application over NFS yet, but I did get
> a chance to test it with a floppy. I was able to (on 2.4.19) reproduce a
> case where even with just 5 KB/second reads, the read() would block every
> so often (long strace attached).
Well with a 64k readahead chunk the kernel will only talk to the
floppy drive once per 13 seconds. Surely it's spinning down?
Try setting the readahead to 16 kbytes (three seconds) with
blockdev --setra 32 /dev/floppy
>
> So, something does appear to be wrong. If I can actually mount a
> filesystem on a floppy in 2.5, I'll see if the same thing happens.
Nope, floppy is bust. But you can read directly from /dev/fd0h1440 OK.
On Fri, 9 Aug 2002 18:33:09 -0700 (PDT)
Linus Torvalds <[email protected]> wrote:
> repeat:
> kmap_atomic(..); // this increments preempt count
> nr = copy_from_user(..);
Please please please use a different name for "I know I'm not preemptible but
I can handle it" or a flag or something.
That leaves us with the possibility of a BUG() in the "normal" copy_to/from_user
for all those "I'm holding a spinlock while copying to userspace wheeee!" bugs.
Very common mistake for new kernel authors.
With the preempt count we have an easy way of detecting this at runtime: I'd
like to keep that.
Rusty.
--
there are those who do and those who hang on and you don't see too
many doers quoting their contemporaries. -- Larry McVoy
On Monday 12 August 2002 09:45, Rusty Russell wrote:
> On Fri, 9 Aug 2002 18:33:09 -0700 (PDT)
> Linus Torvalds <[email protected]> wrote:
> > repeat:
> > kmap_atomic(..); // this increments preempt count
> > nr = copy_from_user(..);
>
> Please please please use a different name for "I know I'm not preemptible but
> I can handle it" or a flag or something.
>
> That leaves us with the possibility of a BUG() in the "normal" copy_to/from_user
> for all those "I'm holding a spinlock while copying to userspace wheeee!" bugs.
> Very common mistake for new kernel authors.
That's the whole point of this: it's not a bug anymore. (It's a feature.)
But agreed, a different name than preempt count would be nice, because it's
evolving away from its original function. Is this a 'monitor'? (I don't
think so.) Perhaps 'atomic_count' is more accurate.
--
Daniel
Albert D. Cahalan wrote:
> Jeff Garzik writes:
>>I sure would like an O_STREAMING flag, though... let a user app hint to
>>the system that the pages it is reading or writing are perhaps less
>>likely to be reused, or accessed randomly.... A copy-file syscall would
>>be nice, too, but that's just laziness talking....
>
>
> You have a laptop computer with a USB-connected Ethernet.
> You mount a NetApp or similar box via the SMB/CIFS protocol.
> You see a multi-gigabyte file. You make a copy... ouch!!!
> For each gigabyte, you hog the network for an hour.
> Now let's say this file is for a MacOS app. You have to
> preserve the creator, file type, resource fork, etc.
/bin/cp has these problems regardless of whether or not it uses a
copy-file syscall.
Jeff
[email protected] said:
> > > A copy-file syscall would be nice, too, but that's just laziness
> > > talking....
> > You have a laptop computer with a USB-connected Ethernet.
> > You mount a NetApp or similar box via the SMB/CIFS protocol.
> > You see a multi-gigabyte file. You make a copy... ouch!!!
> > For each gigabyte, you hog the network for an hour.
> /bin/cp has these problems regardless of whether or not it uses a
> copy-file syscall.
Nope. There was a reason he specified SMB/CIFS.
--
dwmw2
On Mon, 12 Aug 2002, Rusty Russell wrote:
> On Fri, 9 Aug 2002 18:33:09 -0700 (PDT)
> Linus Torvalds <[email protected]> wrote:
>
> > repeat:
> > kmap_atomic(..); // this increments preempt count
> > nr = copy_from_user(..);
>
> Please please please use a different name for "I know I'm not preemptible but
> I can handle it" or a flag or something.
>
> That leaves us with the possibility of a BUG() in the "normal" copy_to/from_user
> for all those "I'm holding a spinlock while copying to userspace wheeee!" bugs.
> Very common mistake for new kernel authors.
Agreed.
Maybe the right thing to do is to just have a
atomic_copy_from_user()
which can then be used to explicitly not check if we have a kernel
debugging option.
Linus
>>>>> " " == Andrew Morton <[email protected]> writes:
> Simon Kirby wrote:
>>
>> On Sun, Aug 11, 2002 at 08:28:12PM -0700, Andrew Morton wrote:
>>
>> > So I'd appreciate it if Simon could investigate a little
>> > further with the test app I posted. Something is up, and it
>> > may not be just an NFS thing. But note that nfs_readpage
>> > will go synchronous if rsize is less than PAGE_CACHE_SIZE, so
>> > it has to be set up right.
>>
>> You're right -- my NFS page size is set to 2048. I can't
>> remember if I did this because I was trying to work around huge
>> read-ahead or because I was trying to work around the bursts of
>> high latency from my Terayon cable modem (which idles at a slow
>> line speed and "falls forward" to higher speeds once it detects
>> traffic, but with a delay, causing awful latency at the expense
>> of "better noise immunity"). Anyway, I will test this
>> tomorrow. I recall that 1024 byte-sized blocks were too small
>> because the latency of the cable modem would cause it to not
>> have high enough throughput, so I settled with 2048.
> OK, thanks.
Sorry if somebody already covered this (I'm still a bit jetlagged so I
may have missed part of the argument) but if the read is synchronous,
why should we care about doing readahead at all?
Wasn't the 2.4.x code designed so that you first scheduled the read
for the page you are interested in, and only if the page was not
immediately made available would you then schedule some readahead?
Cheers,
Trond
On Mon, 12 Aug 2002, Daniel Phillips wrote:
>
> That's the whole point of this: it's not a bug anymore. (It's a feature.)
Well, it's a feature only if _intentional_, so I think Rusty's argument
was that we should call it something else than "copy_to/from_user()" if
we're ready to accept the fact that it fails for random reasons..
Linus
Trond Myklebust wrote:
>
> >>>>> " " == Andrew Morton <[email protected]> writes:
>
> > Simon Kirby wrote:
> >>
> >> On Sun, Aug 11, 2002 at 08:28:12PM -0700, Andrew Morton wrote:
> >>
> >> > So I'd appreciate it if Simon could invetigate a little
> >> > further with the test app I posted. Something is up, and it
> >> > may not be just an NFS thing. But note that nfs_readpage
> >> > will go synchronous if rsize is less than PAGE_CACHE_SIZE, so
> >> > it has to be set up right.
> >>
> >> You're right -- my NFS page size is set to 2048. I can't
> >> remember if I did this because I was trying to work around huge
> >> read-ahead or because I was trying to work around the bursts of
> >> high latency from my Terayon cable modem (which idles at a slow
> >> line speed and "falls forward" to higher speeds once it detects
> >> traffic, but with a delay, causing awful latency at the expense
> >> of "better noise immunity"). Anyway, I will test this
> >> tomorrow. I recall that 1024 byte-sized blocks were too small
> >> because the latency of the cable modem would cause it to not
> >> have high enough throughput, so I settled with 2048.
>
> > OK, thanks.
>
> Sorry if somebody already covered this (I'm still a bit jetlagged so I
> may have missed part of the argument) but if the read is synchronous,
> why should we care about doing readahead at all?
Well, all reads are synchronous, in a way....
In this case, where the application's data-processing bandwidth is
vastly higher than the media bandwidth, readahead isn't doing anything
useful, apart from allowing the submission of nice big chunks to the IO
layers. Batching.
If the application is processing data more slowly then readahead
will allow the IO to be overlapped with that processing. But with
rsize < PAGE_CACHE_SIZE, all NFS reads are synchronous and everything
has gone bad. It may be sensible for NFS to disable readahead
in this case.
> Wasn't the 2.4.x code designed so that you first scheduled the read
> for the page you are interested in, and only if the page was not
> immediately made available would you then schedule some readahead?
2.4 will schedule readahead whether or not the requested page is
uptodate. Same in 2.5.
2.4 readahead has an explicit "don't do more readahead if the
current page is still under IO", whereas 2.5 has "don't readahead
pages in a previously-submitted window". They'll have the same
effect.
On Monday 12 August 2002 22:29, Linus Torvalds wrote:
> On Mon, 12 Aug 2002, Daniel Phillips wrote:
> >
> > That's the whole point of this: it's not a bug anymore. (It's a feature.)
>
> Well, it's a feature only if _intentional_, so I think Rusty's argument
> was that we should call it something else than "copy_to/from_user()" if
> we're ready to accept the fact that it fails for random reasons..
Right, I meant to follow up and correct that - the caller has the
responsibility of detecting the short transfer and taking corrective
action, but on the other hand, maybe the caller always had that
responsibility.
But for the cases where the caller 'knows' it holds no locks, it's
better to oops if that's untrue as Rusty said, plus the inc/dec is
saved in that case.
--
Daniel
On Sat, Aug 10, 2002 at 08:01:17PM +0100, Christoph Hellwig wrote:
> Solaris 9 (and Solaris 8 with a certain patch) support Linux-style
> sendfile(). Linux 2.5 on the other hand doesn't support sendfile to
> files anymore..
Why did it get broken? It was useful for copying a file and showing the
progress at the same time without touching one's own VM.
Regards
Ingo Oeser
--
Science is what we can tell a computer. Art is everything else. --- D.E.Knuth