Introduce a new fault_in_iov_iter helper for manually faulting in an iterator.
Other than fault_in_pages_writeable(), this function is non-destructive.
We'll use fault_in_iov_iter in gfs2 once we've determined that the iterator
passed to .read_iter or .write_iter isn't in memory.
Signed-off-by: Andreas Gruenbacher <[email protected]>
---
include/linux/mm.h | 3 ++
include/linux/uio.h | 1 +
lib/iov_iter.c | 42 ++++++++++++++++++++++++++++
mm/gup.c | 68 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 114 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ca22e6e694a..14b1353995e2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1840,6 +1840,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
+unsigned long fault_in_user_pages(unsigned long start, unsigned long len,
+ bool write);
+
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
struct task_struct *task, bool bypass_rlim);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 82c3c3e819e0..152b3605e86c 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -120,6 +120,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes);
+size_t fault_in_iov_iter(const struct iov_iter *i);
size_t iov_iter_single_seg_count(const struct iov_iter *i);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 20dc3d800573..7221665f7ac4 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -460,6 +460,48 @@ int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);
+/**
+ * fault_in_iov_iter - fault in iov iterator for reading / writing
+ * @i: iterator
+ *
+ * Faults in the iterator using get_user_pages, i.e., without triggering
+ * hardware page faults.
+ *
+ * This is primarily useful when we know that some or all of the pages in @i
+ * aren't in memory. For iterators that are likely to be in memory,
+ * fault_in_pages_readable() may be more appropriate.
+ *
+ * Other than fault_in_pages_writeable(), this function is non-destructive even
+ * when faulting in pages for writing.
+ *
+ * Returns the number of bytes faulted in, or the size of @i if @i doesn't need
+ * faulting in.
+ */
+size_t fault_in_iov_iter(const struct iov_iter *i)
+{
+ size_t count = i->count;
+ const struct iovec *p;
+ size_t ret = 0, skip;
+
+ if (iter_is_iovec(i)) {
+ for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
+ unsigned long len = min(count, p->iov_len - skip);
+ unsigned long start, l;
+
+ if (unlikely(!len))
+ continue;
+ start = (unsigned long)p->iov_base + skip;
+ l = fault_in_user_pages(start, len, iov_iter_rw(i) != WRITE);
+ ret += l;
+ if (unlikely(l != len))
+ break;
+ count -= l;
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL(fault_in_iov_iter);
+
void iov_iter_init(struct iov_iter *i, unsigned int direction,
const struct iovec *iov, unsigned long nr_segs,
size_t count)
diff --git a/mm/gup.c b/mm/gup.c
index 42b8b1fa6521..033d66586c62 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1669,6 +1669,74 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
}
#endif /* !CONFIG_MMU */
+/**
+ * fault_in_user_pages - fault in an address range for reading / writing
+ * @start: start of address range
+ * @len: length of address range
+ * @write: fault in for writing
+ *
+ * Note that we don't pin or otherwise hold the pages referenced that we fault
+ * in. There's no guarantee that they'll stay in memory for any duration of
+ * time.
+ *
+ * Returns the number of bytes faulted in from @start.
+ */
+unsigned long fault_in_user_pages(unsigned long start, unsigned long len,
+ bool write)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ unsigned long end, nstart, nend;
+ int locked = 0;
+ int gup_flags;
+
+ /*
+ * FIXME: Make sure this function doesn't succeed for pages that cannot
+ * be accessed; otherwise we could end up in a loop trying to fault in
+ * and then access the pages. (It's okay if a page gets evicted and we
+ * need more than one retry.)
+ */
+
+ /*
+ * FIXME: Are these the right FOLL_* flags?
+ */
+
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+ if (write)
+ gup_flags |= FOLL_WRITE;
+
+ end = PAGE_ALIGN(start + len);
+ for (nstart = start & PAGE_MASK; nstart < end; nstart = nend) {
+ unsigned long nr_pages;
+ long ret;
+
+ if (!locked) {
+ locked = 1;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ nr_pages = (nend - nstart) / PAGE_SIZE;
+ ret = __get_user_pages_locked(mm, nstart, nr_pages,
+ NULL, NULL, &locked, gup_flags);
+ if (ret <= 0)
+ break;
+ nend = nstart + ret * PAGE_SIZE;
+ }
+ if (locked)
+ mmap_read_unlock(mm);
+ if (nstart > start)
+ return min(nstart - start, len);
+ return 0;
+}
+
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
--
2.26.3
On Fri, Jul 23, 2021 at 1:58 PM Andreas Gruenbacher <[email protected]> wrote:
>
> Introduce a new fault_in_iov_iter helper for manually faulting in an iterator.
> Other than fault_in_pages_writeable(), this function is non-destructive.
Again, as I pointed out in the previous version, "Other than" is not
sensible language.
You mean "Unlike".
Same issue in the comment:
> + * Other than fault_in_pages_writeable(), this function is non-destructive even
> + * when faulting in pages for writing.
It really should be
"Unlike fault_in_pages_writeable(), this function .."
to parse correctly.
I understand what you mean, but only because I know what
fault_in_pages_writeable() does and what the issue was.
And in a year or two, I might have forgotten, and wonder what you meant.
Linus
On Fri, Jul 23, 2021 at 10:58:34PM +0200, Andreas Gruenbacher wrote:
> Introduce a new fault_in_iov_iter helper for manually faulting in an iterator.
> Other than fault_in_pages_writeable(), this function is non-destructive.
>
> We'll use fault_in_iov_iter in gfs2 once we've determined that the iterator
> passed to .read_iter or .write_iter isn't in memory.
Hmm... I suspect that this is going to be much heavier for read access
than the existing variant. Do we ever want it for anything other than
writes?
Am Sa., 24. Juli 2021 um 01:41 Uhr schrieb Linus Torvalds
<[email protected]>:
> On Fri, Jul 23, 2021 at 1:58 PM Andreas Gruenbacher <[email protected]> wrote:
> >
> > Introduce a new fault_in_iov_iter helper for manually faulting in an iterator.
> > Other than fault_in_pages_writeable(), this function is non-destructive.
>
> Again, as I pointed out in the previous version, "Other than" is not
> sensible language.
Thanks for pointing this out a second time. I have no idea how I
screwed up fixing that.
Andreas
Am Sa., 24. Juli 2021 um 03:53 Uhr schrieb Al Viro <[email protected]>:
> On Fri, Jul 23, 2021 at 10:58:34PM +0200, Andreas Gruenbacher wrote:
> > Introduce a new fault_in_iov_iter helper for manually faulting in an iterator.
> > Other than fault_in_pages_writeable(), this function is non-destructive.
> >
> > We'll use fault_in_iov_iter in gfs2 once we've determined that the iterator
> > passed to .read_iter or .write_iter isn't in memory.
>
> Hmm... I suspect that this is going to be much heavier for read access
> than the existing variant. Do we ever want it for anything other than
> writes?
I don't know if it actually is slower when pages need to be faulted
in, but I'm fine turning it into a write-only function.
Thanks,
Andreas
On Fri 23-07-21 22:58:34, Andreas Gruenbacher wrote:
> Introduce a new fault_in_iov_iter helper for manually faulting in an iterator.
> Other than fault_in_pages_writeable(), this function is non-destructive.
>
> We'll use fault_in_iov_iter in gfs2 once we've determined that the iterator
> passed to .read_iter or .write_iter isn't in memory.
>
> Signed-off-by: Andreas Gruenbacher <[email protected]>
...
> +unsigned long fault_in_user_pages(unsigned long start, unsigned long len,
> + bool write)
> +{
> + struct mm_struct *mm = current->mm;
> + struct vm_area_struct *vma = NULL;
> + unsigned long end, nstart, nend;
> + int locked = 0;
> + int gup_flags;
> +
> + /*
> + * FIXME: Make sure this function doesn't succeed for pages that cannot
> + * be accessed; otherwise we could end up in a loop trying to fault in
> + * and then access the pages. (It's okay if a page gets evicted and we
> + * need more than one retry.)
> + */
> +
> + /*
> + * FIXME: Are these the right FOLL_* flags?
> + */
How about the FIXMEs here? I guess we should answer these questions before
merging and remove the comments. Regarding the first FIXME I tend to agree
that if we cannot fault in the first page, we should return the error
rather than returning 0 as you do now. OTOH the caller can check for 0 and
understand there's something wrong going on as well. But the error would be
probably a bit clearer.
> +
> + gup_flags = FOLL_TOUCH | FOLL_POPULATE;
I don't think FOLL_POPULATE makes sense here. It makes sense only with
FOLL_MLOCK and determines whether mlock(2) should fault in missing pages or
not.
Honza
> + if (write)
> + gup_flags |= FOLL_WRITE;
> +
> + end = PAGE_ALIGN(start + len);
> + for (nstart = start & PAGE_MASK; nstart < end; nstart = nend) {
> + unsigned long nr_pages;
> + long ret;
> +
> + if (!locked) {
> + locked = 1;
> + mmap_read_lock(mm);
> + vma = find_vma(mm, nstart);
> + } else if (nstart >= vma->vm_end)
> + vma = vma->vm_next;
> + if (!vma || vma->vm_start >= end)
> + break;
> + nend = min(end, vma->vm_end);
> + if (vma->vm_flags & (VM_IO | VM_PFNMAP))
> + continue;
> + if (nstart < vma->vm_start)
> + nstart = vma->vm_start;
> + nr_pages = (nend - nstart) / PAGE_SIZE;
> + ret = __get_user_pages_locked(mm, nstart, nr_pages,
> + NULL, NULL, &locked, gup_flags);
> + if (ret <= 0)
> + break;
> + nend = nstart + ret * PAGE_SIZE;
> + }
> + if (locked)
> + mmap_read_unlock(mm);
> + if (nstart > start)
> + return min(nstart - start, len);
> + return 0;
> +}
> +
> /**
> * get_dump_page() - pin user page in memory while writing it to core dump
> * @addr: user address
> --
> 2.26.3
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, Jul 26, 2021 at 9:33 AM Jan Kara <[email protected]> wrote:
>
> On Fri 23-07-21 22:58:34, Andreas Gruenbacher wrote:
> > + gup_flags = FOLL_TOUCH | FOLL_POPULATE;
>
> I don't think FOLL_POPULATE makes sense here. It makes sense only with
> FOLL_MLOCK and determines whether mlock(2) should fault in missing pages or
> not.
Yeah, it won't hurt, but FOLL_POPULATE doesn't actually do anything
unless FOLL_MLOCK is set. It is, as you say, a magic flag just for
mlock.
The only ones that should matter are FOLL_WRITE (for obvious reasons)
and FOLL_TOUCH (to set the accessed and dirty bits, rather than just
th protection bits)
Linus