While reviewing Baoquan's recent changes to permit vread() access to
vm_map_ram regions of vmalloc allocations, Willy pointed out [1] that it
would be nice to refactor vread() as a whole, since its only user is
read_kcore() and the existing form of vread() necessitates the use of a
bounce buffer.
This patch series does exactly that, as well as adjusting how we read the
kernel text section to avoid the use of a bounce buffer in this case as
well.
This has been tested against the test case which motivated Baoquan's
changes in the first place [2] which continues to function correctly, as do
the vmalloc self tests.
[1] https://lore.kernel.org/all/Y8WfDSRkc%[email protected]/
[2] https://lore.kernel.org/all/[email protected]/T/#u
v4:
- Fixup mistake in email client which orphaned patch emails from the
cover letter.
v3:
- Revert introduction of mutex/rwsem in vmalloc
- Introduce copy_page_to_iter_atomic() iovec function
- Update vread_iter() and descendant functions to use only this
- Fault in user pages before calling vread_iter()
- Use const char* in vread_iter() and descendant functions
- Updated commit messages based on feedback
- Extend vread functions to always check how many bytes we could copy. If
at any stage we are unable to copy/zero, abort and return the number of
bytes we did copy.
https://lore.kernel.org/all/[email protected]/
v2:
- Fix ordering of vread_iter() parameters
- Fix nommu vread() -> vread_iter()
https://lore.kernel.org/all/[email protected]/
v1:
https://lore.kernel.org/all/[email protected]/
Lorenzo Stoakes (4):
fs/proc/kcore: avoid bounce buffer for ktext data
fs/proc/kcore: convert read_kcore() to read_kcore_iter()
iov_iter: add copy_page_to_iter_atomic()
mm: vmalloc: convert vread() to vread_iter()
fs/proc/kcore.c | 89 ++++++---------
include/linux/uio.h | 2 +
include/linux/vmalloc.h | 3 +-
lib/iov_iter.c | 28 +++++
mm/nommu.c | 10 +-
mm/vmalloc.c | 234 +++++++++++++++++++++++++---------------
6 files changed, 218 insertions(+), 148 deletions(-)
--
2.39.2
Commit df04abfd181a ("fs/proc/kcore.c: Add bounce buffer for ktext data")
introduced the use of a bounce buffer to retrieve kernel text data for
/proc/kcore in order to avoid failures arising from hardened user copies
enabled by CONFIG_HARDENED_USERCOPY in check_kernel_text_object().
We can avoid doing this if instead of copy_to_user() we use _copy_to_user()
which bypasses the hardening check. This is more efficient than using a
bounce buffer and simplifies the code.
We do so as part of an overall effort to eliminate bounce buffer usage in the
function with an eye to converting it to an iterator read.
Signed-off-by: Lorenzo Stoakes <[email protected]>
Reviewed-by: David Hildenbrand <[email protected]>
---
fs/proc/kcore.c | 17 +++++------------
1 file changed, 5 insertions(+), 12 deletions(-)
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 71157ee35c1a..556f310d6aa4 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -541,19 +541,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
case KCORE_VMEMMAP:
case KCORE_TEXT:
/*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
+ * We use _copy_to_user() to bypass usermode hardening
+ * which would otherwise prevent this operation.
*/
- if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
- }
+ if (_copy_to_user(buffer, (char *)start, tsz)) {
+ ret = -EFAULT;
+ goto out;
}
break;
default:
--
2.39.2
Now we have eliminated spinlocks from the vread() case, convert
read_kcore() to read_kcore_iter().
For the time being we still use a bounce buffer for vread(), however in the
next patch we will convert this to interact directly with the iterator and
eliminate the bounce buffer altogether.
Signed-off-by: Lorenzo Stoakes <[email protected]>
---
fs/proc/kcore.c | 58 ++++++++++++++++++++++++-------------------------
1 file changed, 29 insertions(+), 29 deletions(-)
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 556f310d6aa4..25e0eeb8d498 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -24,7 +24,7 @@
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/io.h>
#include <linux/list.h>
#include <linux/ioport.h>
@@ -308,9 +308,12 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
}
static ssize_t
-read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct file *file = iocb->ki_filp;
char *buf = file->private_data;
+ loff_t *ppos = &iocb->ki_pos;
+
size_t phdrs_offset, notes_offset, data_offset;
size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
@@ -318,6 +321,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
size_t tsz;
int nphdr;
unsigned long start;
+ size_t buflen = iov_iter_count(iter);
size_t orig_buflen = buflen;
int ret = 0;
@@ -333,7 +337,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
notes_offset = phdrs_offset + phdrs_len;
/* ELF file header. */
- if (buflen && *fpos < sizeof(struct elfhdr)) {
+ if (buflen && *ppos < sizeof(struct elfhdr)) {
struct elfhdr ehdr = {
.e_ident = {
[EI_MAG0] = ELFMAG0,
@@ -355,19 +359,18 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
.e_phnum = nphdr,
};
- tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
- if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+ tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *ppos);
+ if (copy_to_iter((char *)&ehdr + *ppos, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
- buffer += tsz;
buflen -= tsz;
- *fpos += tsz;
+ *ppos += tsz;
}
/* ELF program headers. */
- if (buflen && *fpos < phdrs_offset + phdrs_len) {
+ if (buflen && *ppos < phdrs_offset + phdrs_len) {
struct elf_phdr *phdrs, *phdr;
phdrs = kzalloc(phdrs_len, GFP_KERNEL);
@@ -397,22 +400,21 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
phdr++;
}
- tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
- if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
- tsz)) {
+ tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *ppos);
+ if (copy_to_iter((char *)phdrs + *ppos - phdrs_offset, tsz,
+ iter) != tsz) {
kfree(phdrs);
ret = -EFAULT;
goto out;
}
kfree(phdrs);
- buffer += tsz;
buflen -= tsz;
- *fpos += tsz;
+ *ppos += tsz;
}
/* ELF note segment. */
- if (buflen && *fpos < notes_offset + notes_len) {
+ if (buflen && *ppos < notes_offset + notes_len) {
struct elf_prstatus prstatus = {};
struct elf_prpsinfo prpsinfo = {
.pr_sname = 'R',
@@ -447,24 +449,23 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
vmcoreinfo_data,
min(vmcoreinfo_size, notes_len - i));
- tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
- if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+ tsz = min_t(size_t, buflen, notes_offset + notes_len - *ppos);
+ if (copy_to_iter(notes + *ppos - notes_offset, tsz, iter) != tsz) {
kfree(notes);
ret = -EFAULT;
goto out;
}
kfree(notes);
- buffer += tsz;
buflen -= tsz;
- *fpos += tsz;
+ *ppos += tsz;
}
/*
* Check to see if our file offset matches with any of
* the addresses in the elf_phdr on our list.
*/
- start = kc_offset_to_vaddr(*fpos - data_offset);
+ start = kc_offset_to_vaddr(*ppos - data_offset);
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen;
@@ -497,7 +498,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
if (!m) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -508,14 +509,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
case KCORE_VMALLOC:
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, buf, tsz)) {
+ if (copy_to_iter(buf, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
break;
case KCORE_USER:
/* User page is handled prior to normal kernel page: */
- if (copy_to_user(buffer, (char *)start, tsz)) {
+ if (copy_to_iter((char *)start, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -531,7 +532,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
*/
if (!page || PageOffline(page) ||
is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -541,25 +542,24 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
case KCORE_VMEMMAP:
case KCORE_TEXT:
/*
- * We use _copy_to_user() to bypass usermode hardening
+ * We use _copy_to_iter() to bypass usermode hardening
* which would otherwise prevent this operation.
*/
- if (_copy_to_user(buffer, (char *)start, tsz)) {
+ if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
break;
default:
pr_warn_once("Unhandled KCORE type: %d\n", m->type);
- if (clear_user(buffer, tsz)) {
+ if (iov_iter_zero(tsz, iter) != tsz) {
ret = -EFAULT;
goto out;
}
}
skip:
buflen -= tsz;
- *fpos += tsz;
- buffer += tsz;
+ *ppos += tsz;
start += tsz;
tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
}
@@ -603,7 +603,7 @@ static int release_kcore(struct inode *inode, struct file *file)
}
static const struct proc_ops kcore_proc_ops = {
- .proc_read = read_kcore,
+ .proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
.proc_release = release_kcore,
.proc_lseek = default_llseek,
--
2.39.2
Provide an atomic context equivalent for copy_page_to_iter(). This eschews
the might_fault() check and copies memory in the same way that
copy_page_from_iter_atomic() does.
This function assumes a non-compound page; however, this mimics the
existing behaviour of copy_page_from_iter_atomic(). I am keeping the
behaviour consistent between the two, deferring any such change to an
explicit folio-fication effort.
This is being added in order that an iterable form of vread() can be
implemented with known prefaulted pages to avoid the need for mutex
locking.
Signed-off-by: Lorenzo Stoakes <[email protected]>
---
include/linux/uio.h | 2 ++
lib/iov_iter.c | 28 ++++++++++++++++++++++++++++
2 files changed, 30 insertions(+)
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 27e3fd942960..fab07103090f 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -154,6 +154,8 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
size_t bytes, struct iov_iter *i);
+size_t copy_page_to_iter_atomic(struct page *page, unsigned offset,
+ size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 274014e4eafe..48ca1c5dfc04 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -821,6 +821,34 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);
+size_t copy_page_to_iter_atomic(struct page *page, unsigned offset, size_t bytes,
+ struct iov_iter *i)
+{
+ char *kaddr = kmap_local_page(page);
+ char *p = kaddr + offset;
+ size_t copied = 0;
+
+ if (!page_copy_sane(page, offset, bytes) ||
+ WARN_ON_ONCE(i->data_source))
+ goto out;
+
+ if (unlikely(iov_iter_is_pipe(i))) {
+ copied = copy_page_to_iter_pipe(page, offset, bytes, i);
+ goto out;
+ }
+
+ iterate_and_advance(i, bytes, base, len, off,
+ copyout(base, p + off, len),
+ memcpy(base, p + off, len)
+ )
+ copied = bytes;
+
+out:
+ kunmap_local(kaddr);
+ return copied;
+}
+EXPORT_SYMBOL(copy_page_to_iter_atomic);
+
static void pipe_advance(struct iov_iter *i, size_t size)
{
struct pipe_inode_info *pipe = i->pipe;
--
2.39.2
Having previously laid the foundation for converting vread() to an iterator
function, pull the trigger and do so.
This patch attempts to provide minimal refactoring and to reflect the
existing logic as best we can, for example we continue to zero portions of
memory not read, as before.
Overall, there should be no functional difference other than a performance
improvement in /proc/kcore access to vmalloc regions.
Now we have eliminated the need for a bounce buffer in read_kcore_iter(),
we dispense with it. We need to ensure userland pages are faulted in before
proceeding, as we take spin locks.
Additionally, we must account for the fact that at any point a copy may
fail; if this happens, we exit, indicating fewer bytes retrieved than
expected.
Signed-off-by: Lorenzo Stoakes <[email protected]>
---
fs/proc/kcore.c | 26 ++---
include/linux/vmalloc.h | 3 +-
mm/nommu.c | 10 +-
mm/vmalloc.c | 234 +++++++++++++++++++++++++---------------
4 files changed, 160 insertions(+), 113 deletions(-)
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 25e0eeb8d498..221e16f75ba5 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -307,13 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
*i = ALIGN(*i + descsz, 4);
}
-static ssize_t
-read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *file = iocb->ki_filp;
- char *buf = file->private_data;
loff_t *ppos = &iocb->ki_pos;
-
size_t phdrs_offset, notes_offset, data_offset;
size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
@@ -507,9 +503,12 @@ read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
switch (m->type) {
case KCORE_VMALLOC:
- vread(buf, (char *)start, tsz);
- /* we have to zero-fill user buffer even if no read */
- if (copy_to_iter(buf, tsz, iter) != tsz) {
+ /*
+ * Make sure user pages are faulted in as we acquire
+ * spinlocks in vread_iter().
+ */
+ if (fault_in_iov_iter_writeable(iter, tsz) ||
+ vread_iter(iter, (char *)start, tsz) != tsz) {
ret = -EFAULT;
goto out;
}
@@ -582,10 +581,6 @@ static int open_kcore(struct inode *inode, struct file *filp)
if (ret)
return ret;
- filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!filp->private_data)
- return -ENOMEM;
-
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
@@ -596,16 +591,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
return 0;
}
-static int release_kcore(struct inode *inode, struct file *file)
-{
- kfree(file->private_data);
- return 0;
-}
-
static const struct proc_ops kcore_proc_ops = {
.proc_read_iter = read_kcore_iter,
.proc_open = open_kcore,
- .proc_release = release_kcore,
.proc_lseek = default_llseek,
};
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 69250efa03d1..461aa5637f65 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -9,6 +9,7 @@
#include <asm/page.h> /* pgprot_t */
#include <linux/rbtree.h>
#include <linux/overflow.h>
+#include <linux/uio.h>
#include <asm/vmalloc.h>
@@ -251,7 +252,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
#endif
/* for /proc/kcore */
-extern long vread(char *buf, char *addr, unsigned long count);
+extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
/*
* Internals. Don't use..
diff --git a/mm/nommu.c b/mm/nommu.c
index 57ba243c6a37..e0fcd948096e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -36,6 +36,7 @@
#include <linux/printk.h>
#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -198,14 +199,13 @@ unsigned long vmalloc_to_pfn(const void *addr)
}
EXPORT_SYMBOL(vmalloc_to_pfn);
-long vread(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, char *addr, size_t count)
{
/* Don't allow overflow */
- if ((unsigned long) buf + count < count)
- count = -(unsigned long) buf;
+ if ((unsigned long) addr + count < count)
+ count = -(unsigned long) addr;
- memcpy(buf, addr, count);
- return count;
+ return copy_to_iter(addr, count, iter);
}
/*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 978194dc2bb8..ebfa1e9fe6f9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -37,7 +37,6 @@
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
-#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
@@ -3442,62 +3441,95 @@ void *vmalloc_32_user(unsigned long size)
EXPORT_SYMBOL(vmalloc_32_user);
/*
- * small helper routine , copy contents to buf from addr.
- * If the page is not present, fill zero.
+ * Atomically zero bytes in the iterator.
+ *
+ * Returns the number of zeroed bytes.
*/
+size_t zero_iter(struct iov_iter *iter, size_t count)
+{
+ size_t remains = count;
+
+ while (remains > 0) {
+ size_t num, copied;
+
+ num = remains < PAGE_SIZE ? remains : PAGE_SIZE;
+ copied = copy_page_to_iter_atomic(ZERO_PAGE(0), 0, num, iter);
+ remains -= copied;
+
+ if (copied < num)
+ break;
+ }
+
+ return count - remains;
+}
-static int aligned_vread(char *buf, char *addr, unsigned long count)
+/*
+ * small helper routine, copy contents to iter from addr.
+ * If the page is not present, fill zero.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t aligned_vread_iter(struct iov_iter *iter,
+ const char *addr, size_t count)
{
- struct page *p;
- int copied = 0;
+ size_t remains = count;
+ struct page *page;
- while (count) {
+ while (remains > 0) {
unsigned long offset, length;
+ size_t copied = 0;
offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
- if (length > count)
- length = count;
- p = vmalloc_to_page(addr);
+ if (length > remains)
+ length = remains;
+ page = vmalloc_to_page(addr);
/*
- * To do safe access to this _mapped_ area, we need
- * lock. But adding lock here means that we need to add
- * overhead of vmalloc()/vfree() calls for this _debug_
- * interface, rarely used. Instead of that, we'll use
- * kmap() and get small overhead in this access function.
+ * To do safe access to this _mapped_ area, we need lock. But
+ * adding lock here means that we need to add overhead of
+ * vmalloc()/vfree() calls for this _debug_ interface, rarely
+ * used. Instead of that, we'll use an local mapping via
+ * copy_page_to_iter_atomic() and accept a small overhead in
+ * this access function.
*/
- if (p) {
- /* We can expect USER0 is not used -- see vread() */
- void *map = kmap_atomic(p);
- memcpy(buf, map + offset, length);
- kunmap_atomic(map);
- } else
- memset(buf, 0, length);
+ if (page)
+ copied = copy_page_to_iter_atomic(page, offset, length,
+ iter);
+
+ /* Zero anything we were unable to copy. */
+ copied += zero_iter(iter, length - copied);
+
+ addr += copied;
+ remains -= copied;
- addr += length;
- buf += length;
- copied += length;
- count -= length;
+ if (copied != length)
+ break;
}
- return copied;
+
+ return count - remains;
}
-static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
+/*
+ * Read from a vm_map_ram region of memory.
+ *
+ * Returns the number of copied bytes.
+ */
+static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
+ size_t count, unsigned long flags)
{
char *start;
struct vmap_block *vb;
unsigned long offset;
- unsigned int rs, re, n;
+ unsigned int rs, re;
+ size_t remains, n;
/*
* If it's area created by vm_map_ram() interface directly, but
* not further subdividing and delegating management to vmap_block,
* handle it here.
*/
- if (!(flags & VMAP_BLOCK)) {
- aligned_vread(buf, addr, count);
- return;
- }
+ if (!(flags & VMAP_BLOCK))
+ return aligned_vread_iter(iter, addr, count);
/*
* Area is split into regions and tracked with vmap_block, read out
@@ -3505,50 +3537,65 @@ static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags
*/
vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
if (!vb)
- goto finished;
+ goto finished_zero;
spin_lock(&vb->lock);
if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
spin_unlock(&vb->lock);
- goto finished;
+ goto finished_zero;
}
+
+ remains = count;
for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
- if (!count)
- break;
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
+
start = vmap_block_vaddr(vb->va->va_start, rs);
- while (addr < start) {
- if (count == 0)
- goto unlock;
- *buf = '\0';
- buf++;
- addr++;
- count--;
+
+ if (addr < start) {
+ size_t to_zero = min_t(size_t, start - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
+
+ addr += zeroed;
+ remains -= zeroed;
+
+ if (remains == 0 || zeroed != to_zero)
+ goto finished;
}
+
/*it could start reading from the middle of used region*/
offset = offset_in_page(addr);
n = ((re - rs + 1) << PAGE_SHIFT) - offset;
- if (n > count)
- n = count;
- aligned_vread(buf, start+offset, n);
+ if (n > remains)
+ n = remains;
+
+ copied = aligned_vread_iter(iter, start + offset, n);
- buf += n;
- addr += n;
- count -= n;
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
}
-unlock:
+
spin_unlock(&vb->lock);
-finished:
+finished_zero:
/* zero-fill the left dirty or free regions */
- if (count)
- memset(buf, 0, count);
+ return count - remains + zero_iter(iter, remains);
+finished:
+ /* We couldn't copy/zero everything */
+ spin_unlock(&vb->lock);
+ return count - remains;
}
/**
- * vread() - read vmalloc area in a safe way.
- * @buf: buffer for reading data
- * @addr: vm address.
- * @count: number of bytes to be read.
+ * vread_iter() - read vmalloc area in a safe way to an iterator.
+ * @iter: the iterator to which data should be written.
+ * @addr: vm address.
+ * @count: number of bytes to be read.
*
* This function checks that addr is a valid vmalloc'ed area, and
* copy data from that area to a given buffer. If the given memory range
@@ -3568,13 +3615,12 @@ static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags
* (same number as @count) or %0 if [addr...addr+count) doesn't
* include any intersection with valid vmalloc area
*/
-long vread(char *buf, char *addr, unsigned long count)
+long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
struct vmap_area *va;
struct vm_struct *vm;
- char *vaddr, *buf_start = buf;
- unsigned long buflen = count;
- unsigned long n, size, flags;
+ char *vaddr;
+ size_t n, size, flags, remains;
addr = kasan_reset_tag(addr);
@@ -3582,18 +3628,22 @@ long vread(char *buf, char *addr, unsigned long count)
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
+ remains = count;
+
spin_lock(&vmap_area_lock);
va = find_vmap_area_exceed_addr((unsigned long)addr);
if (!va)
- goto finished;
+ goto finished_zero;
/* no intersects with alive vmap_area */
- if ((unsigned long)addr + count <= va->va_start)
- goto finished;
+ if ((unsigned long)addr + remains <= va->va_start)
+ goto finished_zero;
list_for_each_entry_from(va, &vmap_area_list, list) {
- if (!count)
- break;
+ size_t copied;
+
+ if (remains == 0)
+ goto finished;
vm = va->vm;
flags = va->flags & VMAP_FLAGS_MASK;
@@ -3608,6 +3658,7 @@ long vread(char *buf, char *addr, unsigned long count)
if (vm && (vm->flags & VM_UNINITIALIZED))
continue;
+
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
smp_rmb();
@@ -3616,38 +3667,45 @@ long vread(char *buf, char *addr, unsigned long count)
if (addr >= vaddr + size)
continue;
- while (addr < vaddr) {
- if (count == 0)
+
+ if (addr < vaddr) {
+ size_t to_zero = min_t(size_t, vaddr - addr, remains);
+ size_t zeroed = zero_iter(iter, to_zero);
+
+ addr += zeroed;
+ remains -= zeroed;
+
+ if (remains == 0 || zeroed != to_zero)
goto finished;
- *buf = '\0';
- buf++;
- addr++;
- count--;
}
+
n = vaddr + size - addr;
- if (n > count)
- n = count;
+ if (n > remains)
+ n = remains;
if (flags & VMAP_RAM)
- vmap_ram_vread(buf, addr, n, flags);
+ copied = vmap_ram_vread_iter(iter, addr, n, flags);
else if (!(vm->flags & VM_IOREMAP))
- aligned_vread(buf, addr, n);
+ copied = aligned_vread_iter(iter, addr, n);
else /* IOREMAP area is treated as memory hole */
- memset(buf, 0, n);
- buf += n;
- addr += n;
- count -= n;
+ copied = zero_iter(iter, n);
+
+ addr += copied;
+ remains -= copied;
+
+ if (copied != n)
+ goto finished;
}
-finished:
- spin_unlock(&vmap_area_lock);
- if (buf == buf_start)
- return 0;
+finished_zero:
+ spin_unlock(&vmap_area_lock);
/* zero-fill memory holes */
- if (buf != buf_start + buflen)
- memset(buf, 0, buflen - (buf - buf_start));
+ return count - remains + zero_iter(iter, remains);
+finished:
+ /* Nothing remains, or We couldn't copy/zero everything. */
+ spin_unlock(&vmap_area_lock);
- return buflen;
+ return count - remains;
}
/**
--
2.39.2
Hi Lorenzo,
On 03/21/23 at 08:54pm, Lorenzo Stoakes wrote:
> Now we have eliminated spinlocks from the vread() case, convert
> read_kcore() to read_kcore_iter().
Sorry for late comment.
Here I could miss something important, I don't get where we have
eliminated spinlocks from the vread() case. Do I misunderstand this
sentence?
>
> For the time being we still use a bounce buffer for vread(), however in the
> next patch we will convert this to interact directly with the iterator and
> eliminate the bounce buffer altogether.
>
> Signed-off-by: Lorenzo Stoakes <[email protected]>
> ---
> fs/proc/kcore.c | 58 ++++++++++++++++++++++++-------------------------
> 1 file changed, 29 insertions(+), 29 deletions(-)
>
> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> index 556f310d6aa4..25e0eeb8d498 100644
> --- a/fs/proc/kcore.c
> +++ b/fs/proc/kcore.c
> @@ -24,7 +24,7 @@
> #include <linux/memblock.h>
> #include <linux/init.h>
> #include <linux/slab.h>
> -#include <linux/uaccess.h>
> +#include <linux/uio.h>
> #include <asm/io.h>
> #include <linux/list.h>
> #include <linux/ioport.h>
> @@ -308,9 +308,12 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
> }
>
> static ssize_t
> -read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> +read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
> {
> + struct file *file = iocb->ki_filp;
> char *buf = file->private_data;
> + loff_t *ppos = &iocb->ki_pos;
> +
> size_t phdrs_offset, notes_offset, data_offset;
> size_t page_offline_frozen = 1;
> size_t phdrs_len, notes_len;
> @@ -318,6 +321,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> size_t tsz;
> int nphdr;
> unsigned long start;
> + size_t buflen = iov_iter_count(iter);
> size_t orig_buflen = buflen;
> int ret = 0;
>
> @@ -333,7 +337,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> notes_offset = phdrs_offset + phdrs_len;
>
> /* ELF file header. */
> - if (buflen && *fpos < sizeof(struct elfhdr)) {
> + if (buflen && *ppos < sizeof(struct elfhdr)) {
> struct elfhdr ehdr = {
> .e_ident = {
> [EI_MAG0] = ELFMAG0,
> @@ -355,19 +359,18 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> .e_phnum = nphdr,
> };
>
> - tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
> - if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
> + tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *ppos);
> + if (copy_to_iter((char *)&ehdr + *ppos, tsz, iter) != tsz) {
> ret = -EFAULT;
> goto out;
> }
>
> - buffer += tsz;
> buflen -= tsz;
> - *fpos += tsz;
> + *ppos += tsz;
> }
>
> /* ELF program headers. */
> - if (buflen && *fpos < phdrs_offset + phdrs_len) {
> + if (buflen && *ppos < phdrs_offset + phdrs_len) {
> struct elf_phdr *phdrs, *phdr;
>
> phdrs = kzalloc(phdrs_len, GFP_KERNEL);
> @@ -397,22 +400,21 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> phdr++;
> }
>
> - tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
> - if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
> - tsz)) {
> + tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *ppos);
> + if (copy_to_iter((char *)phdrs + *ppos - phdrs_offset, tsz,
> + iter) != tsz) {
> kfree(phdrs);
> ret = -EFAULT;
> goto out;
> }
> kfree(phdrs);
>
> - buffer += tsz;
> buflen -= tsz;
> - *fpos += tsz;
> + *ppos += tsz;
> }
>
> /* ELF note segment. */
> - if (buflen && *fpos < notes_offset + notes_len) {
> + if (buflen && *ppos < notes_offset + notes_len) {
> struct elf_prstatus prstatus = {};
> struct elf_prpsinfo prpsinfo = {
> .pr_sname = 'R',
> @@ -447,24 +449,23 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> vmcoreinfo_data,
> min(vmcoreinfo_size, notes_len - i));
>
> - tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
> - if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
> + tsz = min_t(size_t, buflen, notes_offset + notes_len - *ppos);
> + if (copy_to_iter(notes + *ppos - notes_offset, tsz, iter) != tsz) {
> kfree(notes);
> ret = -EFAULT;
> goto out;
> }
> kfree(notes);
>
> - buffer += tsz;
> buflen -= tsz;
> - *fpos += tsz;
> + *ppos += tsz;
> }
>
> /*
> * Check to see if our file offset matches with any of
> * the addresses in the elf_phdr on our list.
> */
> - start = kc_offset_to_vaddr(*fpos - data_offset);
> + start = kc_offset_to_vaddr(*ppos - data_offset);
> if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
> tsz = buflen;
>
> @@ -497,7 +498,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> }
>
> if (!m) {
> - if (clear_user(buffer, tsz)) {
> + if (iov_iter_zero(tsz, iter) != tsz) {
> ret = -EFAULT;
> goto out;
> }
> @@ -508,14 +509,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> case KCORE_VMALLOC:
> vread(buf, (char *)start, tsz);
> /* we have to zero-fill user buffer even if no read */
> - if (copy_to_user(buffer, buf, tsz)) {
> + if (copy_to_iter(buf, tsz, iter) != tsz) {
> ret = -EFAULT;
> goto out;
> }
> break;
> case KCORE_USER:
> /* User page is handled prior to normal kernel page: */
> - if (copy_to_user(buffer, (char *)start, tsz)) {
> + if (copy_to_iter((char *)start, tsz, iter) != tsz) {
> ret = -EFAULT;
> goto out;
> }
> @@ -531,7 +532,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> */
> if (!page || PageOffline(page) ||
> is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
> - if (clear_user(buffer, tsz)) {
> + if (iov_iter_zero(tsz, iter) != tsz) {
> ret = -EFAULT;
> goto out;
> }
> @@ -541,25 +542,24 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> case KCORE_VMEMMAP:
> case KCORE_TEXT:
> /*
> - * We use _copy_to_user() to bypass usermode hardening
> + * We use _copy_to_iter() to bypass usermode hardening
> * which would otherwise prevent this operation.
> */
> - if (_copy_to_user(buffer, (char *)start, tsz)) {
> + if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
> ret = -EFAULT;
> goto out;
> }
> break;
> default:
> pr_warn_once("Unhandled KCORE type: %d\n", m->type);
> - if (clear_user(buffer, tsz)) {
> + if (iov_iter_zero(tsz, iter) != tsz) {
> ret = -EFAULT;
> goto out;
> }
> }
> skip:
> buflen -= tsz;
> - *fpos += tsz;
> - buffer += tsz;
> + *ppos += tsz;
> start += tsz;
> tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
> }
> @@ -603,7 +603,7 @@ static int release_kcore(struct inode *inode, struct file *file)
> }
>
> static const struct proc_ops kcore_proc_ops = {
> - .proc_read = read_kcore,
> + .proc_read_iter = read_kcore_iter,
> .proc_open = open_kcore,
> .proc_release = release_kcore,
> .proc_lseek = default_llseek,
> --
> 2.39.2
>
On Wed, Mar 22, 2023 at 09:15:48AM +0800, Baoquan He wrote:
> Hi Lorenzo,
>
> On 03/21/23 at 08:54pm, Lorenzo Stoakes wrote:
> > Now we have eliminated spinlocks from the vread() case, convert
> > read_kcore() to read_kcore_iter().
>
> Sorry for late comment.
>
> Here I could miss something important, I don't get where we have
> eliminated spinlocks from the vread() case. Do I misunderstand this
> sentence?
>
Apologies, I didn't update the commit message after the latest revision! We
can just drop this sentence altogether.
Andrew - could you change the commit message to simply read:-
For the time being we still use a bounce buffer for vread(), however in the
next patch we will convert this to interact directly with the iterator and
eliminate the bounce buffer altogether.
Thanks!
> >
> > For the time being we still use a bounce buffer for vread(), however in the
> > next patch we will convert this to interact directly with the iterator and
> > eliminate the bounce buffer altogether.
> >
> > Signed-off-by: Lorenzo Stoakes <[email protected]>
> > ---
> > fs/proc/kcore.c | 58 ++++++++++++++++++++++++-------------------------
> > 1 file changed, 29 insertions(+), 29 deletions(-)
> >
> > diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> > index 556f310d6aa4..25e0eeb8d498 100644
> > --- a/fs/proc/kcore.c
> > +++ b/fs/proc/kcore.c
> > @@ -24,7 +24,7 @@
> > #include <linux/memblock.h>
> > #include <linux/init.h>
> > #include <linux/slab.h>
> > -#include <linux/uaccess.h>
> > +#include <linux/uio.h>
> > #include <asm/io.h>
> > #include <linux/list.h>
> > #include <linux/ioport.h>
> > @@ -308,9 +308,12 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
> > }
> >
> > static ssize_t
> > -read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > +read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
> > {
> > + struct file *file = iocb->ki_filp;
> > char *buf = file->private_data;
> > + loff_t *ppos = &iocb->ki_pos;
> > +
> > size_t phdrs_offset, notes_offset, data_offset;
> > size_t page_offline_frozen = 1;
> > size_t phdrs_len, notes_len;
> > @@ -318,6 +321,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > size_t tsz;
> > int nphdr;
> > unsigned long start;
> > + size_t buflen = iov_iter_count(iter);
> > size_t orig_buflen = buflen;
> > int ret = 0;
> >
> > @@ -333,7 +337,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > notes_offset = phdrs_offset + phdrs_len;
> >
> > /* ELF file header. */
> > - if (buflen && *fpos < sizeof(struct elfhdr)) {
> > + if (buflen && *ppos < sizeof(struct elfhdr)) {
> > struct elfhdr ehdr = {
> > .e_ident = {
> > [EI_MAG0] = ELFMAG0,
> > @@ -355,19 +359,18 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > .e_phnum = nphdr,
> > };
> >
> > - tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
> > - if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
> > + tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *ppos);
> > + if (copy_to_iter((char *)&ehdr + *ppos, tsz, iter) != tsz) {
> > ret = -EFAULT;
> > goto out;
> > }
> >
> > - buffer += tsz;
> > buflen -= tsz;
> > - *fpos += tsz;
> > + *ppos += tsz;
> > }
> >
> > /* ELF program headers. */
> > - if (buflen && *fpos < phdrs_offset + phdrs_len) {
> > + if (buflen && *ppos < phdrs_offset + phdrs_len) {
> > struct elf_phdr *phdrs, *phdr;
> >
> > phdrs = kzalloc(phdrs_len, GFP_KERNEL);
> > @@ -397,22 +400,21 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > phdr++;
> > }
> >
> > - tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
> > - if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
> > - tsz)) {
> > + tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *ppos);
> > + if (copy_to_iter((char *)phdrs + *ppos - phdrs_offset, tsz,
> > + iter) != tsz) {
> > kfree(phdrs);
> > ret = -EFAULT;
> > goto out;
> > }
> > kfree(phdrs);
> >
> > - buffer += tsz;
> > buflen -= tsz;
> > - *fpos += tsz;
> > + *ppos += tsz;
> > }
> >
> > /* ELF note segment. */
> > - if (buflen && *fpos < notes_offset + notes_len) {
> > + if (buflen && *ppos < notes_offset + notes_len) {
> > struct elf_prstatus prstatus = {};
> > struct elf_prpsinfo prpsinfo = {
> > .pr_sname = 'R',
> > @@ -447,24 +449,23 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > vmcoreinfo_data,
> > min(vmcoreinfo_size, notes_len - i));
> >
> > - tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
> > - if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
> > + tsz = min_t(size_t, buflen, notes_offset + notes_len - *ppos);
> > + if (copy_to_iter(notes + *ppos - notes_offset, tsz, iter) != tsz) {
> > kfree(notes);
> > ret = -EFAULT;
> > goto out;
> > }
> > kfree(notes);
> >
> > - buffer += tsz;
> > buflen -= tsz;
> > - *fpos += tsz;
> > + *ppos += tsz;
> > }
> >
> > /*
> > * Check to see if our file offset matches with any of
> > * the addresses in the elf_phdr on our list.
> > */
> > - start = kc_offset_to_vaddr(*fpos - data_offset);
> > + start = kc_offset_to_vaddr(*ppos - data_offset);
> > if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
> > tsz = buflen;
> >
> > @@ -497,7 +498,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > }
> >
> > if (!m) {
> > - if (clear_user(buffer, tsz)) {
> > + if (iov_iter_zero(tsz, iter) != tsz) {
> > ret = -EFAULT;
> > goto out;
> > }
> > @@ -508,14 +509,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > case KCORE_VMALLOC:
> > vread(buf, (char *)start, tsz);
> > /* we have to zero-fill user buffer even if no read */
> > - if (copy_to_user(buffer, buf, tsz)) {
> > + if (copy_to_iter(buf, tsz, iter) != tsz) {
> > ret = -EFAULT;
> > goto out;
> > }
> > break;
> > case KCORE_USER:
> > /* User page is handled prior to normal kernel page: */
> > - if (copy_to_user(buffer, (char *)start, tsz)) {
> > + if (copy_to_iter((char *)start, tsz, iter) != tsz) {
> > ret = -EFAULT;
> > goto out;
> > }
> > @@ -531,7 +532,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > */
> > if (!page || PageOffline(page) ||
> > is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
> > - if (clear_user(buffer, tsz)) {
> > + if (iov_iter_zero(tsz, iter) != tsz) {
> > ret = -EFAULT;
> > goto out;
> > }
> > @@ -541,25 +542,24 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> > case KCORE_VMEMMAP:
> > case KCORE_TEXT:
> > /*
> > - * We use _copy_to_user() to bypass usermode hardening
> > + * We use _copy_to_iter() to bypass usermode hardening
> > * which would otherwise prevent this operation.
> > */
> > - if (_copy_to_user(buffer, (char *)start, tsz)) {
> > + if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
> > ret = -EFAULT;
> > goto out;
> > }
> > break;
> > default:
> > pr_warn_once("Unhandled KCORE type: %d\n", m->type);
> > - if (clear_user(buffer, tsz)) {
> > + if (iov_iter_zero(tsz, iter) != tsz) {
> > ret = -EFAULT;
> > goto out;
> > }
> > }
> > skip:
> > buflen -= tsz;
> > - *fpos += tsz;
> > - buffer += tsz;
> > + *ppos += tsz;
> > start += tsz;
> > tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
> > }
> > @@ -603,7 +603,7 @@ static int release_kcore(struct inode *inode, struct file *file)
> > }
> >
> > static const struct proc_ops kcore_proc_ops = {
> > - .proc_read = read_kcore,
> > + .proc_read_iter = read_kcore_iter,
> > .proc_open = open_kcore,
> > .proc_release = release_kcore,
> > .proc_lseek = default_llseek,
> > --
> > 2.39.2
> >
>
On 03/21/23 at 08:54pm, Lorenzo Stoakes wrote:
> Provide an atomic context equivalent for copy_page_to_iter(). This eschews
> the might_fault() check copies memory in the same way that
> copy_page_from_iter_atomic() does.
>
> This functions assumes a non-compound page, however this mimics the
> existing behaviour of copy_page_from_iter_atomic(). I am keeping the
> behaviour consistent between the two, deferring any such change to an
> explicit folio-fication effort.
>
> This is being added in order that an iteratable form of vread() can be
> implemented with known prefaulted pages to avoid the need for mutex
> locking.
>
> Signed-off-by: Lorenzo Stoakes <[email protected]>
> ---
> include/linux/uio.h | 2 ++
> lib/iov_iter.c | 28 ++++++++++++++++++++++++++++
> 2 files changed, 30 insertions(+)
>
> diff --git a/include/linux/uio.h b/include/linux/uio.h
> index 27e3fd942960..fab07103090f 100644
> --- a/include/linux/uio.h
> +++ b/include/linux/uio.h
> @@ -154,6 +154,8 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
>
> size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
> size_t bytes, struct iov_iter *i);
> +size_t copy_page_to_iter_atomic(struct page *page, unsigned offset,
> + size_t bytes, struct iov_iter *i);
> void iov_iter_advance(struct iov_iter *i, size_t bytes);
> void iov_iter_revert(struct iov_iter *i, size_t bytes);
> size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index 274014e4eafe..48ca1c5dfc04 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -821,6 +821,34 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt
> }
> EXPORT_SYMBOL(copy_page_from_iter_atomic);
>
> +size_t copy_page_to_iter_atomic(struct page *page, unsigned offset, size_t bytes,
> + struct iov_iter *i)
> +{
> + char *kaddr = kmap_local_page(page);
I am a little confused about the name of this new function. In its
counterpart, copy_page_from_iter_atomic(), kmap_atomic()/kunmap_atomic()
are used. With them, if CONFIG_HIGHMEM=n, it's like below:
static inline void *kmap_atomic(struct page *page)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
migrate_disable();
else
preempt_disable();
pagefault_disable();
return page_address(page);
}
But kmap_local_page() only calls page_address(), even though the code block
between kmap_local_page() and kunmap_local() is also supposed to be atomic;
it's a little messy in my mind.
static inline void *kmap_local_page(struct page *page)
{
return page_address(page);
}
> + char *p = kaddr + offset;
> + size_t copied = 0;
> +
> + if (!page_copy_sane(page, offset, bytes) ||
> + WARN_ON_ONCE(i->data_source))
> + goto out;
> +
> + if (unlikely(iov_iter_is_pipe(i))) {
> + copied = copy_page_to_iter_pipe(page, offset, bytes, i);
> + goto out;
> + }
> +
> + iterate_and_advance(i, bytes, base, len, off,
> + copyout(base, p + off, len),
> + memcpy(base, p + off, len)
> + )
> + copied = bytes;
> +
> +out:
> + kunmap_local(kaddr);
> + return copied;
> +}
> +EXPORT_SYMBOL(copy_page_to_iter_atomic);
> +
> static void pipe_advance(struct iov_iter *i, size_t size)
> {
> struct pipe_inode_info *pipe = i->pipe;
> --
> 2.39.2
>
On Wed, Mar 22, 2023 at 06:17:25PM +0800, Baoquan He wrote:
> On 03/21/23 at 08:54pm, Lorenzo Stoakes wrote:
> > Provide an atomic context equivalent for copy_page_to_iter(). This eschews
> > the might_fault() check copies memory in the same way that
> > copy_page_from_iter_atomic() does.
> >
> > This functions assumes a non-compound page, however this mimics the
> > existing behaviour of copy_page_from_iter_atomic(). I am keeping the
> > behaviour consistent between the two, deferring any such change to an
> > explicit folio-fication effort.
> >
> > This is being added in order that an iteratable form of vread() can be
> > implemented with known prefaulted pages to avoid the need for mutex
> > locking.
> >
> > Signed-off-by: Lorenzo Stoakes <[email protected]>
> > ---
> > include/linux/uio.h | 2 ++
> > lib/iov_iter.c | 28 ++++++++++++++++++++++++++++
> > 2 files changed, 30 insertions(+)
> >
> > diff --git a/include/linux/uio.h b/include/linux/uio.h
> > index 27e3fd942960..fab07103090f 100644
> > --- a/include/linux/uio.h
> > +++ b/include/linux/uio.h
> > @@ -154,6 +154,8 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
> >
> > size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
> > size_t bytes, struct iov_iter *i);
> > +size_t copy_page_to_iter_atomic(struct page *page, unsigned offset,
> > + size_t bytes, struct iov_iter *i);
> > void iov_iter_advance(struct iov_iter *i, size_t bytes);
> > void iov_iter_revert(struct iov_iter *i, size_t bytes);
> > size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
> > diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> > index 274014e4eafe..48ca1c5dfc04 100644
> > --- a/lib/iov_iter.c
> > +++ b/lib/iov_iter.c
> > @@ -821,6 +821,34 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt
> > }
> > EXPORT_SYMBOL(copy_page_from_iter_atomic);
> >
> > +size_t copy_page_to_iter_atomic(struct page *page, unsigned offset, size_t bytes,
> > + struct iov_iter *i)
> > +{
> > + char *kaddr = kmap_local_page(page);
>
> I am a little confused about the name of this new function. In its
> conterpart, copy_page_from_iter_atomic(), kmap_atomic()/kunmpa_atomic()
> are used. With them, if CONFIG_HIGHMEM=n, it's like below:
The reason for this is that:-
1. kmap_atomic() explicitly states that it is now deprecated and must no longer
be used, and kmap_local_page() should be used instead:-
* kmap_atomic - Atomically map a page for temporary usage - Deprecated!
* Do not use in new code. Use kmap_local_page() instead.
2. kmap_local_page() explicitly states that it can be used in any context:-
* Can be invoked from any context, including interrupts.
I wanted to follow this advice as strictly as I could, hence the change. However,
we do need preemption/pagefaults explicitly disabled in this context (we are
happy to fail if the faulted-in pages are unmapped in the meantime), and I didn't
check the internals to make sure.
So I think for safety it is better to use k[un]map_atomic() here, I'll respin
and put that back in, good catch!
>
> static inline void *kmap_atomic(struct page *page)
> {
> if (IS_ENABLED(CONFIG_PREEMPT_RT))
> migrate_disable();
> else
> preempt_disable();
> pagefault_disable();
> return page_address(page);
> }
>
> But kmap_local_page() is only having page_address(), the code block
> between kmap_local_page() and kunmap_local() is also atomic, it's a
> little messy in my mind.
>
> static inline void *kmap_local_page(struct page *page)
> {
> return page_address(page);
> }
>
> > + char *p = kaddr + offset;
> > + size_t copied = 0;
> > +
> > + if (!page_copy_sane(page, offset, bytes) ||
> > + WARN_ON_ONCE(i->data_source))
> > + goto out;
> > +
> > + if (unlikely(iov_iter_is_pipe(i))) {
> > + copied = copy_page_to_iter_pipe(page, offset, bytes, i);
> > + goto out;
> > + }
> > +
> > + iterate_and_advance(i, bytes, base, len, off,
> > + copyout(base, p + off, len),
> > + memcpy(base, p + off, len)
> > + )
> > + copied = bytes;
> > +
> > +out:
> > + kunmap_local(kaddr);
> > + return copied;
> > +}
> > +EXPORT_SYMBOL(copy_page_to_iter_atomic);
> > +
> > static void pipe_advance(struct iov_iter *i, size_t size)
> > {
> > struct pipe_inode_info *pipe = i->pipe;
> > --
> > 2.39.2
> >
>
On Wed, Mar 22, 2023 at 10:32:47AM +0000, Lorenzo Stoakes wrote:
> > I am a little confused about the name of this new function. In its
> > conterpart, copy_page_from_iter_atomic(), kmap_atomic()/kunmpa_atomic()
> > are used. With them, if CONFIG_HIGHMEM=n, it's like below:
>
> The reason for this is that:-
>
> 1. kmap_atomic() explicitly states that it is now deprecated and must no longer
> be used, and kmap_local_page() should be used instead:-
>
> * kmap_atomic - Atomically map a page for temporary usage - Deprecated!
>
> * Do not use in new code. Use kmap_local_page() instead.
>
> 2. kmap_local_page() explicitly states that it can be used in any context:-
>
> * Can be invoked from any context, including interrupts.
>
> I wanted follow this advice as strictly as I could, hence the change. However,
> we do need preemption/pagefaults explicitly disabled in this context (we are
> happy to fail if the faulted in pages are unmapped in meantime), and I didn't
> check the internals to make sure.
>
> So I think for safety it is better to use k[un]map_atomic() here, I'll respin
> and put that back in, good catch!
>
Actually, given we have preemption disabled due to the held spinlock, I think
it'd be better to add a copy_page_to_iter_nofault() that uses
copy_to_user_nofault(), which will disable page faults and thus have exactly
equivalent behaviour, more explicitly and without the use of a deprecated
function.
Thanks for raising this!!
> >
> > static inline void *kmap_atomic(struct page *page)
> > {
> > if (IS_ENABLED(CONFIG_PREEMPT_RT))
> > migrate_disable();
> > else
> > preempt_disable();
> > pagefault_disable();
> > return page_address(page);
> > }
> >
> > But kmap_local_page() is only having page_address(), the code block
> > between kmap_local_page() and kunmap_local() is also atomic, it's a
> > little messy in my mind.
> >
> > static inline void *kmap_local_page(struct page *page)
> > {
> > return page_address(page);
> > }
> >
> > > + char *p = kaddr + offset;
> > > + size_t copied = 0;
> > > +
> > > + if (!page_copy_sane(page, offset, bytes) ||
> > > + WARN_ON_ONCE(i->data_source))
> > > + goto out;
> > > +
> > > + if (unlikely(iov_iter_is_pipe(i))) {
> > > + copied = copy_page_to_iter_pipe(page, offset, bytes, i);
> > > + goto out;
> > > + }
> > > +
> > > + iterate_and_advance(i, bytes, base, len, off,
> > > + copyout(base, p + off, len),
> > > + memcpy(base, p + off, len)
> > > + )
> > > + copied = bytes;
> > > +
> > > +out:
> > > + kunmap_local(kaddr);
> > > + return copied;
> > > +}
> > > +EXPORT_SYMBOL(copy_page_to_iter_atomic);
> > > +
> > > static void pipe_advance(struct iov_iter *i, size_t size)
> > > {
> > > struct pipe_inode_info *pipe = i->pipe;
> > > --
> > > 2.39.2
> > >
> >
On 21.03.23 21:54, Lorenzo Stoakes wrote:
> Now we have eliminated spinlocks from the vread() case, convert
> read_kcore() to read_kcore_iter().
>
> For the time being we still use a bounce buffer for vread(), however in the
> next patch we will convert this to interact directly with the iterator and
> eliminate the bounce buffer altogether.
>
> Signed-off-by: Lorenzo Stoakes <[email protected]>
> ---
> fs/proc/kcore.c | 58 ++++++++++++++++++++++++-------------------------
> 1 file changed, 29 insertions(+), 29 deletions(-)
>
> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> index 556f310d6aa4..25e0eeb8d498 100644
> --- a/fs/proc/kcore.c
> +++ b/fs/proc/kcore.c
> @@ -24,7 +24,7 @@
> #include <linux/memblock.h>
> #include <linux/init.h>
> #include <linux/slab.h>
> -#include <linux/uaccess.h>
> +#include <linux/uio.h>
> #include <asm/io.h>
> #include <linux/list.h>
> #include <linux/ioport.h>
> @@ -308,9 +308,12 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
> }
>
> static ssize_t
> -read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> +read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
> {
> + struct file *file = iocb->ki_filp;
> char *buf = file->private_data;
> + loff_t *ppos = &iocb->ki_pos;
Not renaming fpos -> ppos in this patch would result in less noise in
this patch. Just like you didn't rename buflen.
In general, LGTM
Reviewed-by: David Hildenbrand <[email protected]>
--
Thanks,
David / dhildenb
On Wed, Mar 22, 2023 at 12:18:08PM +0100, David Hildenbrand wrote:
> On 21.03.23 21:54, Lorenzo Stoakes wrote:
> > Having previously laid the foundation for converting vread() to an iterator
> > function, pull the trigger and do so.
> >
> > This patch attempts to provide minimal refactoring and to reflect the
> > existing logic as best we can, for example we continue to zero portions of
> > memory not read, as before.
> >
> > Overall, there should be no functional difference other than a performance
> > improvement in /proc/kcore access to vmalloc regions.
> >
> > Now we have eliminated the need for a bounce buffer in read_kcore_iter(),
> > we dispense with it. We need to ensure userland pages are faulted in before
> > proceeding, as we take spin locks.
> >
> > Additionally, we must account for the fact that at any point a copy may
> > fail if this happens, we exit indicating fewer bytes retrieved than
> > expected.
> >
> > Signed-off-by: Lorenzo Stoakes <[email protected]>
> > ---
> > fs/proc/kcore.c | 26 ++---
> > include/linux/vmalloc.h | 3 +-
> > mm/nommu.c | 10 +-
> > mm/vmalloc.c | 234 +++++++++++++++++++++++++---------------
> > 4 files changed, 160 insertions(+), 113 deletions(-)
> >
> > diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> > index 25e0eeb8d498..221e16f75ba5 100644
> > --- a/fs/proc/kcore.c
> > +++ b/fs/proc/kcore.c
> > @@ -307,13 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
> > *i = ALIGN(*i + descsz, 4);
> > }
> > -static ssize_t
> > -read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
> > +static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
> > {
> > - struct file *file = iocb->ki_filp;
> > - char *buf = file->private_data;
> > loff_t *ppos = &iocb->ki_pos;
> > -
> > size_t phdrs_offset, notes_offset, data_offset;
> > size_t page_offline_frozen = 1;
> > size_t phdrs_len, notes_len;
> > @@ -507,9 +503,12 @@ read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
> > switch (m->type) {
> > case KCORE_VMALLOC:
> > - vread(buf, (char *)start, tsz);
> > - /* we have to zero-fill user buffer even if no read */
> > - if (copy_to_iter(buf, tsz, iter) != tsz) {
> > + /*
> > + * Make sure user pages are faulted in as we acquire
> > + * spinlocks in vread_iter().
> > + */
> > + if (fault_in_iov_iter_writeable(iter, tsz) ||
> > + vread_iter(iter, (char *)start, tsz) != tsz) {
> > ret = -EFAULT;
> > goto out;
> > }
>
> What if we race with swapout after faulting the pages in? Or some other
> mechanism to write-protect the user space pages?
>
> Also, "This is primarily useful when we already know that some or all of the
> pages in @i aren't in memory". This order of events might slow down things
> quite a bit if I am not wrong.
>
>
> Wouldn't you want to have something like:
>
> while (vread_iter(iter, (char *)start, tsz) != tsz) {
> if (fault_in_iov_iter_writeable(iter, tsz)) {
> ret = -EFAULT;
> goto out;
> }
> }
>
> Or am I missing something?
>
Indeed, I was thinking of this as:-
- prefault
- try (possibly fail if race) copy operation
However, the while-loop form does make more sense, and makes it explicit that
the copy is an attempt that might fail and require a fault-in.
I think the upcoming change to explicitly make the iter function
copy_folio_to_iter_nofault() makes it clear from end-to-end what is being done -
we mustn't fault (spinlocks held) and if a fault would occur we error out, if we
error out try faulting in, if this fails then abort.
I will fixup in respin.
> --
> Thanks,
>
> David / dhildenb
>
On 21.03.23 21:54, Lorenzo Stoakes wrote:
> Having previously laid the foundation for converting vread() to an iterator
> function, pull the trigger and do so.
>
> This patch attempts to provide minimal refactoring and to reflect the
> existing logic as best we can, for example we continue to zero portions of
> memory not read, as before.
>
> Overall, there should be no functional difference other than a performance
> improvement in /proc/kcore access to vmalloc regions.
>
> Now we have eliminated the need for a bounce buffer in read_kcore_iter(),
> we dispense with it. We need to ensure userland pages are faulted in before
> proceeding, as we take spin locks.
>
> Additionally, we must account for the fact that at any point a copy may
> fail if this happens, we exit indicating fewer bytes retrieved than
> expected.
>
> Signed-off-by: Lorenzo Stoakes <[email protected]>
> ---
> fs/proc/kcore.c | 26 ++---
> include/linux/vmalloc.h | 3 +-
> mm/nommu.c | 10 +-
> mm/vmalloc.c | 234 +++++++++++++++++++++++++---------------
> 4 files changed, 160 insertions(+), 113 deletions(-)
>
> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> index 25e0eeb8d498..221e16f75ba5 100644
> --- a/fs/proc/kcore.c
> +++ b/fs/proc/kcore.c
> @@ -307,13 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
> *i = ALIGN(*i + descsz, 4);
> }
>
> -static ssize_t
> -read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
> +static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
> {
> - struct file *file = iocb->ki_filp;
> - char *buf = file->private_data;
> loff_t *ppos = &iocb->ki_pos;
> -
> size_t phdrs_offset, notes_offset, data_offset;
> size_t page_offline_frozen = 1;
> size_t phdrs_len, notes_len;
> @@ -507,9 +503,12 @@ read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
>
> switch (m->type) {
> case KCORE_VMALLOC:
> - vread(buf, (char *)start, tsz);
> - /* we have to zero-fill user buffer even if no read */
> - if (copy_to_iter(buf, tsz, iter) != tsz) {
> + /*
> + * Make sure user pages are faulted in as we acquire
> + * spinlocks in vread_iter().
> + */
> + if (fault_in_iov_iter_writeable(iter, tsz) ||
> + vread_iter(iter, (char *)start, tsz) != tsz) {
> ret = -EFAULT;
> goto out;
> }
What if we race with swapout after faulting the pages in? Or some other
mechanism to write-protect the user space pages?
Also, "This is primarily useful when we already know that some or all of
the pages in @i aren't in memory". This order of events might slow down
things quite a bit if I am not wrong.
Wouldn't you want to have something like:
while (vread_iter(iter, (char *)start, tsz) != tsz) {
if (fault_in_iov_iter_writeable(iter, tsz)) {
ret = -EFAULT;
goto out;
}
}
Or am I missing something?
--
Thanks,
David / dhildenb
On 03/21/23 at 08:54pm, Lorenzo Stoakes wrote:
......
> Additionally, we must account for the fact that at any point a copy may
> fail if this happens, we exit indicating fewer bytes retrieved than
> expected.
......
> -static int aligned_vread(char *buf, char *addr, unsigned long count)
> +/*
> + * small helper routine, copy contents to iter from addr.
> + * If the page is not present, fill zero.
> + *
> + * Returns the number of copied bytes.
> + */
> +static size_t aligned_vread_iter(struct iov_iter *iter,
> + const char *addr, size_t count)
> {
> - struct page *p;
> - int copied = 0;
> + size_t remains = count;
> + struct page *page;
>
> - while (count) {
> + while (remains > 0) {
> unsigned long offset, length;
> + size_t copied = 0;
>
> offset = offset_in_page(addr);
> length = PAGE_SIZE - offset;
> - if (length > count)
> - length = count;
> - p = vmalloc_to_page(addr);
> + if (length > remains)
> + length = remains;
> + page = vmalloc_to_page(addr);
> /*
> - * To do safe access to this _mapped_ area, we need
> - * lock. But adding lock here means that we need to add
> - * overhead of vmalloc()/vfree() calls for this _debug_
> - * interface, rarely used. Instead of that, we'll use
> - * kmap() and get small overhead in this access function.
> + * To do safe access to this _mapped_ area, we need lock. But
> + * adding lock here means that we need to add overhead of
> + * vmalloc()/vfree() calls for this _debug_ interface, rarely
> + * used. Instead of that, we'll use an local mapping via
> + * copy_page_to_iter_atomic() and accept a small overhead in
> + * this access function.
> */
> - if (p) {
> - /* We can expect USER0 is not used -- see vread() */
> - void *map = kmap_atomic(p);
> - memcpy(buf, map + offset, length);
> - kunmap_atomic(map);
> - } else
> - memset(buf, 0, length);
> + if (page)
> + copied = copy_page_to_iter_atomic(page, offset, length,
> + iter);
> +
If we decided to quit at any failing copy point, indicating fewer bytes
retrieved than expected, I am wondering why we don't quit here if
copy_page_to_iter_atomic() failed?
The original zero-filling is for an unmapped vmalloc area, not for an area
whose read failed. Not sure if I got your point.
> + /* Zero anything we were unable to copy. */
> + copied += zero_iter(iter, length - copied);
> +
> + addr += copied;
> + remains -= copied;
>
> - addr += length;
> - buf += length;
> - copied += length;
> - count -= length;
> + if (copied != length)
> + break;
> }
> - return copied;
> +
> + return count - remains;
> }
>
> -static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
> +/*
> + * Read from a vm_map_ram region of memory.
> + *
> + * Returns the number of copied bytes.
> + */
> +static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
> + size_t count, unsigned long flags)
> {
> char *start;
> struct vmap_block *vb;
> unsigned long offset;
> - unsigned int rs, re, n;
> + unsigned int rs, re;
> + size_t remains, n;
>
> /*
> * If it's area created by vm_map_ram() interface directly, but
> * not further subdividing and delegating management to vmap_block,
> * handle it here.
> */
> - if (!(flags & VMAP_BLOCK)) {
> - aligned_vread(buf, addr, count);
> - return;
> - }
> + if (!(flags & VMAP_BLOCK))
> + return aligned_vread_iter(iter, addr, count);
>
> /*
> * Area is split into regions and tracked with vmap_block, read out
> @@ -3505,50 +3537,65 @@ static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags
> */
> vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> if (!vb)
> - goto finished;
> + goto finished_zero;
>
> spin_lock(&vb->lock);
> if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> spin_unlock(&vb->lock);
> - goto finished;
> + goto finished_zero;
> }
> +
> + remains = count;
> for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> - if (!count)
> - break;
> + size_t copied;
> +
> + if (remains == 0)
> + goto finished;
> +
> start = vmap_block_vaddr(vb->va->va_start, rs);
> - while (addr < start) {
> - if (count == 0)
> - goto unlock;
> - *buf = '\0';
> - buf++;
> - addr++;
> - count--;
> +
> + if (addr < start) {
> + size_t to_zero = min_t(size_t, start - addr, remains);
> + size_t zeroed = zero_iter(iter, to_zero);
> +
> + addr += zeroed;
> + remains -= zeroed;
> +
> + if (remains == 0 || zeroed != to_zero)
> + goto finished;
> }
> +
> /*it could start reading from the middle of used region*/
> offset = offset_in_page(addr);
> n = ((re - rs + 1) << PAGE_SHIFT) - offset;
> - if (n > count)
> - n = count;
> - aligned_vread(buf, start+offset, n);
> + if (n > remains)
> + n = remains;
> +
> + copied = aligned_vread_iter(iter, start + offset, n);
>
> - buf += n;
> - addr += n;
> - count -= n;
> + addr += copied;
> + remains -= copied;
> +
> + if (copied != n)
> + goto finished;
> }
> -unlock:
> +
> spin_unlock(&vb->lock);
>
> -finished:
> +finished_zero:
> /* zero-fill the left dirty or free regions */
> - if (count)
> - memset(buf, 0, count);
> + return count - remains + zero_iter(iter, remains);
When it jumps to finished_zero, the local variable 'remains' is still 0, so we
do nothing here but return count. That doesn't look right. You may need to
move the 'remains = count' assignment up, next to where 'count' is set.
> +finished:
> + /* We couldn't copy/zero everything */
> + spin_unlock(&vb->lock);
> + return count - remains;
> }
>
> /**
> - * vread() - read vmalloc area in a safe way.
> - * @buf: buffer for reading data
> - * @addr: vm address.
> - * @count: number of bytes to be read.
> + * vread_iter() - read vmalloc area in a safe way to an iterator.
> + * @iter: the iterator to which data should be written.
> + * @addr: vm address.
> + * @count: number of bytes to be read.
> *
> * This function checks that addr is a valid vmalloc'ed area, and
> * copy data from that area to a given buffer. If the given memory range
> @@ -3568,13 +3615,12 @@ static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags
> * (same number as @count) or %0 if [addr...addr+count) doesn't
> * include any intersection with valid vmalloc area
> */
> -long vread(char *buf, char *addr, unsigned long count)
> +long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> {
> struct vmap_area *va;
> struct vm_struct *vm;
> - char *vaddr, *buf_start = buf;
> - unsigned long buflen = count;
> - unsigned long n, size, flags;
> + char *vaddr;
> + size_t n, size, flags, remains;
>
> addr = kasan_reset_tag(addr);
>
> @@ -3582,18 +3628,22 @@ long vread(char *buf, char *addr, unsigned long count)
> if ((unsigned long) addr + count < count)
> count = -(unsigned long) addr;
>
> + remains = count;
> +
> spin_lock(&vmap_area_lock);
> va = find_vmap_area_exceed_addr((unsigned long)addr);
> if (!va)
> - goto finished;
> + goto finished_zero;
>
> /* no intersects with alive vmap_area */
> - if ((unsigned long)addr + count <= va->va_start)
> - goto finished;
> + if ((unsigned long)addr + remains <= va->va_start)
> + goto finished_zero;
>
> list_for_each_entry_from(va, &vmap_area_list, list) {
> - if (!count)
> - break;
> + size_t copied;
> +
> + if (remains == 0)
> + goto finished;
>
> vm = va->vm;
> flags = va->flags & VMAP_FLAGS_MASK;
> @@ -3608,6 +3658,7 @@ long vread(char *buf, char *addr, unsigned long count)
>
> if (vm && (vm->flags & VM_UNINITIALIZED))
> continue;
> +
> /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
> smp_rmb();
>
> @@ -3616,38 +3667,45 @@ long vread(char *buf, char *addr, unsigned long count)
>
> if (addr >= vaddr + size)
> continue;
> - while (addr < vaddr) {
> - if (count == 0)
> +
> + if (addr < vaddr) {
> + size_t to_zero = min_t(size_t, vaddr - addr, remains);
> + size_t zeroed = zero_iter(iter, to_zero);
> +
> + addr += zeroed;
> + remains -= zeroed;
> +
> + if (remains == 0 || zeroed != to_zero)
> goto finished;
> - *buf = '\0';
> - buf++;
> - addr++;
> - count--;
> }
> +
> n = vaddr + size - addr;
> - if (n > count)
> - n = count;
> + if (n > remains)
> + n = remains;
>
> if (flags & VMAP_RAM)
> - vmap_ram_vread(buf, addr, n, flags);
> + copied = vmap_ram_vread_iter(iter, addr, n, flags);
> else if (!(vm->flags & VM_IOREMAP))
> - aligned_vread(buf, addr, n);
> + copied = aligned_vread_iter(iter, addr, n);
> else /* IOREMAP area is treated as memory hole */
> - memset(buf, 0, n);
> - buf += n;
> - addr += n;
> - count -= n;
> + copied = zero_iter(iter, n);
> +
> + addr += copied;
> + remains -= copied;
> +
> + if (copied != n)
> + goto finished;
> }
> -finished:
> - spin_unlock(&vmap_area_lock);
>
> - if (buf == buf_start)
> - return 0;
> +finished_zero:
> + spin_unlock(&vmap_area_lock);
> /* zero-fill memory holes */
> - if (buf != buf_start + buflen)
> - memset(buf, 0, buflen - (buf - buf_start));
> + return count - remains + zero_iter(iter, remains);
> +finished:
> + /* Nothing remains, or We couldn't copy/zero everything. */
~~~ typo, s/We/we/
> + spin_unlock(&vmap_area_lock);
>
> - return buflen;
> + return count - remains;
> }
>
> /**
> --
> 2.39.2
>
On 03/22/23 at 10:32am, Lorenzo Stoakes wrote:
> On Wed, Mar 22, 2023 at 06:17:25PM +0800, Baoquan He wrote:
> > On 03/21/23 at 08:54pm, Lorenzo Stoakes wrote:
> > > Provide an atomic context equivalent for copy_page_to_iter(). This eschews
> > > the might_fault() check copies memory in the same way that
> > > copy_page_from_iter_atomic() does.
> > >
> > > This function assumes a non-compound page; however, this mimics the
> > > existing behaviour of copy_page_from_iter_atomic(). I am keeping the
> > > behaviour consistent between the two, deferring any such change to an
> > > explicit folio-fication effort.
> > >
> > > This is being added in order that an iteratable form of vread() can be
> > > implemented with known prefaulted pages to avoid the need for mutex
> > > locking.
> > >
> > > Signed-off-by: Lorenzo Stoakes <[email protected]>
> > > ---
> > > include/linux/uio.h | 2 ++
> > > lib/iov_iter.c | 28 ++++++++++++++++++++++++++++
> > > 2 files changed, 30 insertions(+)
> > >
> > > diff --git a/include/linux/uio.h b/include/linux/uio.h
> > > index 27e3fd942960..fab07103090f 100644
> > > --- a/include/linux/uio.h
> > > +++ b/include/linux/uio.h
> > > @@ -154,6 +154,8 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
> > >
> > > size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
> > > size_t bytes, struct iov_iter *i);
> > > +size_t copy_page_to_iter_atomic(struct page *page, unsigned offset,
> > > + size_t bytes, struct iov_iter *i);
> > > void iov_iter_advance(struct iov_iter *i, size_t bytes);
> > > void iov_iter_revert(struct iov_iter *i, size_t bytes);
> > > size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
> > > diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> > > index 274014e4eafe..48ca1c5dfc04 100644
> > > --- a/lib/iov_iter.c
> > > +++ b/lib/iov_iter.c
> > > @@ -821,6 +821,34 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt
> > > }
> > > EXPORT_SYMBOL(copy_page_from_iter_atomic);
> > >
> > > +size_t copy_page_to_iter_atomic(struct page *page, unsigned offset, size_t bytes,
> > > + struct iov_iter *i)
> > > +{
> > > + char *kaddr = kmap_local_page(page);
> >
> > I am a little confused about the name of this new function. In its
> > counterpart, copy_page_from_iter_atomic(), kmap_atomic()/kunmap_atomic()
> > are used. With them, if CONFIG_HIGHMEM=n, it's like below:
>
> The reason for this is that:-
>
> 1. kmap_atomic() explicitly states that it is now deprecated and must no longer
> be used, and kmap_local_page() should be used instead:-
>
> * kmap_atomic - Atomically map a page for temporary usage - Deprecated!
>
> * Do not use in new code. Use kmap_local_page() instead.
>
> 2. kmap_local_page() explicitly states that it can be used in any context:-
>
> * Can be invoked from any context, including interrupts.
Yeah, I saw that stated in the documentation too. My understanding is that
the page mapping itself is guaranteed and can be used in any context when
kmap_local_page() is taken. However, here kmap_local_page() is being used
to make the code block atomic, which it may not achieve.
>
> I wanted to follow this advice as strictly as I could, hence the change. However,
> we do need preemption/pagefaults explicitly disabled in this context (we are
> happy to fail if the faulted in pages are unmapped in meantime), and I didn't
> check the internals to make sure.
>
> So I think for safety it is better to use k[un]map_atomic() here, I'll respin
> and put that back in, good catch!
>
> >
> > static inline void *kmap_atomic(struct page *page)
> > {
> > if (IS_ENABLED(CONFIG_PREEMPT_RT))
> > migrate_disable();
> > else
> > preempt_disable();
> > pagefault_disable();
> > return page_address(page);
> > }
> >
> > But kmap_local_page() is only having page_address(), the code block
> > between kmap_local_page() and kunmap_local() is also atomic, it's a
> > little messy in my mind.
> >
> > static inline void *kmap_local_page(struct page *page)
> > {
> > return page_address(page);
> > }
> >
> > > + char *p = kaddr + offset;
> > > + size_t copied = 0;
> > > +
> > > + if (!page_copy_sane(page, offset, bytes) ||
> > > + WARN_ON_ONCE(i->data_source))
> > > + goto out;
> > > +
> > > + if (unlikely(iov_iter_is_pipe(i))) {
> > > + copied = copy_page_to_iter_pipe(page, offset, bytes, i);
> > > + goto out;
> > > + }
> > > +
> > > + iterate_and_advance(i, bytes, base, len, off,
> > > + copyout(base, p + off, len),
> > > + memcpy(base, p + off, len)
> > > + )
> > > + copied = bytes;
> > > +
> > > +out:
> > > + kunmap_local(kaddr);
> > > + return copied;
> > > +}
> > > +EXPORT_SYMBOL(copy_page_to_iter_atomic);
> > > +
> > > static void pipe_advance(struct iov_iter *i, size_t size)
> > > {
> > > struct pipe_inode_info *pipe = i->pipe;
> > > --
> > > 2.39.2
> > >
> >
>
On 03/22/23 at 11:06am, Lorenzo Stoakes wrote:
> On Wed, Mar 22, 2023 at 10:32:47AM +0000, Lorenzo Stoakes wrote:
> > > I am a little confused about the name of this new function. In its
> > > counterpart, copy_page_from_iter_atomic(), kmap_atomic()/kunmap_atomic()
> > > are used. With them, if CONFIG_HIGHMEM=n, it's like below:
> >
> > The reason for this is that:-
> >
> > 1. kmap_atomic() explicitly states that it is now deprecated and must no longer
> > be used, and kmap_local_page() should be used instead:-
> >
> > * kmap_atomic - Atomically map a page for temporary usage - Deprecated!
> >
> > * Do not use in new code. Use kmap_local_page() instead.
> >
> > 2. kmap_local_page() explicitly states that it can be used in any context:-
> >
> > * Can be invoked from any context, including interrupts.
> >
> > I wanted to follow this advice as strictly as I could, hence the change. However,
> > we do need preemption/pagefaults explicitly disabled in this context (we are
> > happy to fail if the faulted in pages are unmapped in meantime), and I didn't
> > check the internals to make sure.
> >
> > So I think for safety it is better to use k[un]map_atomic() here, I'll respin
> > and put that back in, good catch!
> >
>
> Actually, given we have preemption disabled due to the held spinlock, I think
> it'd be better to add a copy_page_to_iter_nofault() that uses
> copy_to_user_nofault() which will disable pagefaults thus have exactly the
> equivalent behaviour, more explicitly and without the use of a deprecated
> function.
Sounds like a great idea — that lets us avoid using kmap_atomic().
>
> > >
> > > static inline void *kmap_atomic(struct page *page)
> > > {
> > > if (IS_ENABLED(CONFIG_PREEMPT_RT))
> > > migrate_disable();
> > > else
> > > preempt_disable();
> > > pagefault_disable();
> > > return page_address(page);
> > > }
> > >
> > > But kmap_local_page() is only having page_address(), the code block
> > > between kmap_local_page() and kunmap_local() is also atomic, it's a
> > > little messy in my mind.
> > >
> > > static inline void *kmap_local_page(struct page *page)
> > > {
> > > return page_address(page);
> > > }
> > >
> > > > + char *p = kaddr + offset;
> > > > + size_t copied = 0;
> > > > +
> > > > + if (!page_copy_sane(page, offset, bytes) ||
> > > > + WARN_ON_ONCE(i->data_source))
> > > > + goto out;
> > > > +
> > > > + if (unlikely(iov_iter_is_pipe(i))) {
> > > > + copied = copy_page_to_iter_pipe(page, offset, bytes, i);
> > > > + goto out;
> > > > + }
> > > > +
> > > > + iterate_and_advance(i, bytes, base, len, off,
> > > > + copyout(base, p + off, len),
> > > > + memcpy(base, p + off, len)
> > > > + )
> > > > + copied = bytes;
> > > > +
> > > > +out:
> > > > + kunmap_local(kaddr);
> > > > + return copied;
> > > > +}
> > > > +EXPORT_SYMBOL(copy_page_to_iter_atomic);
> > > > +
> > > > static void pipe_advance(struct iov_iter *i, size_t size)
> > > > {
> > > > struct pipe_inode_info *pipe = i->pipe;
> > > > --
> > > > 2.39.2
> > > >
> > >
>
On Wed, Mar 22, 2023 at 08:55:23PM +0800, Baoquan He wrote:
> On 03/21/23 at 08:54pm, Lorenzo Stoakes wrote:
> ......
> > Additionally, we must account for the fact that at any point a copy may
> > fail if this happens, we exit indicating fewer bytes retrieved than
> > expected.
> ......
> > -static int aligned_vread(char *buf, char *addr, unsigned long count)
> > +/*
> > + * small helper routine, copy contents to iter from addr.
> > + * If the page is not present, fill zero.
> > + *
> > + * Returns the number of copied bytes.
> > + */
> > +static size_t aligned_vread_iter(struct iov_iter *iter,
> > + const char *addr, size_t count)
> > {
> > - struct page *p;
> > - int copied = 0;
> > + size_t remains = count;
> > + struct page *page;
> >
> > - while (count) {
> > + while (remains > 0) {
> > unsigned long offset, length;
> > + size_t copied = 0;
> >
> > offset = offset_in_page(addr);
> > length = PAGE_SIZE - offset;
> > - if (length > count)
> > - length = count;
> > - p = vmalloc_to_page(addr);
> > + if (length > remains)
> > + length = remains;
> > + page = vmalloc_to_page(addr);
> > /*
> > - * To do safe access to this _mapped_ area, we need
> > - * lock. But adding lock here means that we need to add
> > - * overhead of vmalloc()/vfree() calls for this _debug_
> > - * interface, rarely used. Instead of that, we'll use
> > - * kmap() and get small overhead in this access function.
> > + * To do safe access to this _mapped_ area, we need lock. But
> > + * adding lock here means that we need to add overhead of
> > + * vmalloc()/vfree() calls for this _debug_ interface, rarely
> > + * used. Instead of that, we'll use an local mapping via
> > + * copy_page_to_iter_atomic() and accept a small overhead in
> > + * this access function.
> > */
> > - if (p) {
> > - /* We can expect USER0 is not used -- see vread() */
> > - void *map = kmap_atomic(p);
> > - memcpy(buf, map + offset, length);
> > - kunmap_atomic(map);
> > - } else
> > - memset(buf, 0, length);
> > + if (page)
> > + copied = copy_page_to_iter_atomic(page, offset, length,
> > + iter);
> > +
>
> If we decided to quit at any failing copy point, indicating fewer bytes
> retrieved than expected, wondering why we don't quit here if
> copy_page_to_iter_atomic() failed?
>
> The original zero filling is for unmapped vmalloc area, but not for
> reading area if failed. Not sure if I got your point.
>
Great spot, I will move this zeroing to an else branch.
> > + /* Zero anything we were unable to copy. */
> > + copied += zero_iter(iter, length - copied);
> > +
> > + addr += copied;
> > + remains -= copied;
> >
> > - addr += length;
> > - buf += length;
> > - copied += length;
> > - count -= length;
> > + if (copied != length)
> > + break;
> > }
> > - return copied;
> > +
> > + return count - remains;
> > }
> >
> > -static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
> > +/*
> > + * Read from a vm_map_ram region of memory.
> > + *
> > + * Returns the number of copied bytes.
> > + */
> > +static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
> > + size_t count, unsigned long flags)
> > {
> > char *start;
> > struct vmap_block *vb;
> > unsigned long offset;
> > - unsigned int rs, re, n;
> > + unsigned int rs, re;
> > + size_t remains, n;
> >
> > /*
> > * If it's area created by vm_map_ram() interface directly, but
> > * not further subdividing and delegating management to vmap_block,
> > * handle it here.
> > */
> > - if (!(flags & VMAP_BLOCK)) {
> > - aligned_vread(buf, addr, count);
> > - return;
> > - }
> > + if (!(flags & VMAP_BLOCK))
> > + return aligned_vread_iter(iter, addr, count);
> >
> > /*
> > * Area is split into regions and tracked with vmap_block, read out
> > @@ -3505,50 +3537,65 @@ static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags
> > */
> > vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > if (!vb)
> > - goto finished;
> > + goto finished_zero;
> >
> > spin_lock(&vb->lock);
> > if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> > spin_unlock(&vb->lock);
> > - goto finished;
> > + goto finished_zero;
> > }
> > +
> > + remains = count;
> > for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> > - if (!count)
> > - break;
> > + size_t copied;
> > +
> > + if (remains == 0)
> > + goto finished;
> > +
> > start = vmap_block_vaddr(vb->va->va_start, rs);
> > - while (addr < start) {
> > - if (count == 0)
> > - goto unlock;
> > - *buf = '\0';
> > - buf++;
> > - addr++;
> > - count--;
> > +
> > + if (addr < start) {
> > + size_t to_zero = min_t(size_t, start - addr, remains);
> > + size_t zeroed = zero_iter(iter, to_zero);
> > +
> > + addr += zeroed;
> > + remains -= zeroed;
> > +
> > + if (remains == 0 || zeroed != to_zero)
> > + goto finished;
> > }
> > +
> > /*it could start reading from the middle of used region*/
> > offset = offset_in_page(addr);
> > n = ((re - rs + 1) << PAGE_SHIFT) - offset;
> > - if (n > count)
> > - n = count;
> > - aligned_vread(buf, start+offset, n);
> > + if (n > remains)
> > + n = remains;
> > +
> > + copied = aligned_vread_iter(iter, start + offset, n);
> >
> > - buf += n;
> > - addr += n;
> > - count -= n;
> > + addr += copied;
> > + remains -= copied;
> > +
> > + if (copied != n)
> > + goto finished;
> > }
> > -unlock:
> > +
> > spin_unlock(&vb->lock);
> >
> > -finished:
> > +finished_zero:
> > /* zero-fill the left dirty or free regions */
> > - if (count)
> > - memset(buf, 0, count);
> > + return count - remains + zero_iter(iter, remains);
>
> When it jumps to finished_zero, the local variable 'remains' is still 0,
> so we do nothing here but just return count. That doesn't look right. You
> may need to move the 'remains' assignment up, next to the 'count' line.
>
Another great spot, will fix.
> > +finished:
> > + /* We couldn't copy/zero everything */
> > + spin_unlock(&vb->lock);
> > + return count - remains;
> > }
> >
> > /**
> > - * vread() - read vmalloc area in a safe way.
> > - * @buf: buffer for reading data
> > - * @addr: vm address.
> > - * @count: number of bytes to be read.
> > + * vread_iter() - read vmalloc area in a safe way to an iterator.
> > + * @iter: the iterator to which data should be written.
> > + * @addr: vm address.
> > + * @count: number of bytes to be read.
> > *
> > * This function checks that addr is a valid vmalloc'ed area, and
> > * copy data from that area to a given buffer. If the given memory range
> > @@ -3568,13 +3615,12 @@ static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags
> > * (same number as @count) or %0 if [addr...addr+count) doesn't
> > * include any intersection with valid vmalloc area
> > */
> > -long vread(char *buf, char *addr, unsigned long count)
> > +long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
> > {
> > struct vmap_area *va;
> > struct vm_struct *vm;
> > - char *vaddr, *buf_start = buf;
> > - unsigned long buflen = count;
> > - unsigned long n, size, flags;
> > + char *vaddr;
> > + size_t n, size, flags, remains;
> >
> > addr = kasan_reset_tag(addr);
> >
> > @@ -3582,18 +3628,22 @@ long vread(char *buf, char *addr, unsigned long count)
> > if ((unsigned long) addr + count < count)
> > count = -(unsigned long) addr;
> >
> > + remains = count;
> > +
> > spin_lock(&vmap_area_lock);
> > va = find_vmap_area_exceed_addr((unsigned long)addr);
> > if (!va)
> > - goto finished;
> > + goto finished_zero;
> >
> > /* no intersects with alive vmap_area */
> > - if ((unsigned long)addr + count <= va->va_start)
> > - goto finished;
> > + if ((unsigned long)addr + remains <= va->va_start)
> > + goto finished_zero;
> >
> > list_for_each_entry_from(va, &vmap_area_list, list) {
> > - if (!count)
> > - break;
> > + size_t copied;
> > +
> > + if (remains == 0)
> > + goto finished;
> >
> > vm = va->vm;
> > flags = va->flags & VMAP_FLAGS_MASK;
> > @@ -3608,6 +3658,7 @@ long vread(char *buf, char *addr, unsigned long count)
> >
> > if (vm && (vm->flags & VM_UNINITIALIZED))
> > continue;
> > +
> > /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
> > smp_rmb();
> >
> > @@ -3616,38 +3667,45 @@ long vread(char *buf, char *addr, unsigned long count)
> >
> > if (addr >= vaddr + size)
> > continue;
> > - while (addr < vaddr) {
> > - if (count == 0)
> > +
> > + if (addr < vaddr) {
> > + size_t to_zero = min_t(size_t, vaddr - addr, remains);
> > + size_t zeroed = zero_iter(iter, to_zero);
> > +
> > + addr += zeroed;
> > + remains -= zeroed;
> > +
> > + if (remains == 0 || zeroed != to_zero)
> > goto finished;
> > - *buf = '\0';
> > - buf++;
> > - addr++;
> > - count--;
> > }
> > +
> > n = vaddr + size - addr;
> > - if (n > count)
> > - n = count;
> > + if (n > remains)
> > + n = remains;
> >
> > if (flags & VMAP_RAM)
> > - vmap_ram_vread(buf, addr, n, flags);
> > + copied = vmap_ram_vread_iter(iter, addr, n, flags);
> > else if (!(vm->flags & VM_IOREMAP))
> > - aligned_vread(buf, addr, n);
> > + copied = aligned_vread_iter(iter, addr, n);
> > else /* IOREMAP area is treated as memory hole */
> > - memset(buf, 0, n);
> > - buf += n;
> > - addr += n;
> > - count -= n;
> > + copied = zero_iter(iter, n);
> > +
> > + addr += copied;
> > + remains -= copied;
> > +
> > + if (copied != n)
> > + goto finished;
> > }
> > -finished:
> > - spin_unlock(&vmap_area_lock);
> >
> > - if (buf == buf_start)
> > - return 0;
> > +finished_zero:
> > + spin_unlock(&vmap_area_lock);
> > /* zero-fill memory holes */
> > - if (buf != buf_start + buflen)
> > - memset(buf, 0, buflen - (buf - buf_start));
> > + return count - remains + zero_iter(iter, remains);
> > +finished:
> > + /* Nothing remains, or We couldn't copy/zero everything. */
> ~~~ typo, s/We/we/
> > + spin_unlock(&vmap_area_lock);
> >
> > - return buflen;
> > + return count - remains;
> > }
> >
> > /**
> > --
> > 2.39.2
> >
>