Currently, vread() can read out vmalloc areas which are associated with
a vm_struct. However, this doesn't work for areas created by the
vm_map_ram() interface because they have no associated vm_struct, so in
vread() these areas are all skipped.
Here, add a new function vmap_ram_vread() to read out vm_map_ram areas.
An area created with the vm_map_ram() interface directly can be handled
like the other normal vmap areas, with aligned_vread(). Areas which are
further subdivided and managed with vmap_block, however, need to be read
out carefully in page-aligned small regions, with the holes between
regions zero-filled.
Signed-off-by: Baoquan He <[email protected]>
---
mm/vmalloc.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 73 insertions(+), 7 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ab4825050b5c..13875bc41e27 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3544,6 +3544,65 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
return copied;
}
+static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
+{
+ char *start;
+ struct vmap_block *vb;
+ unsigned long offset;
+ unsigned int rs, re, n;
+
+ /*
+ * If it's an area created by the vm_map_ram() interface directly,
+ * without further subdividing and delegating management to
+ * vmap_block, handle it here.
+ */
+ if (!(flags & VMAP_BLOCK)) {
+ aligned_vread(buf, addr, count);
+ return;
+ }
+
+ /*
+ * The area is split into regions and tracked with vmap_block; read
+ * out each region and zero-fill the holes between regions.
+ */
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
+
+ spin_lock(&vb->lock);
+ if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
+ spin_unlock(&vb->lock);
+ memset(buf, 0, count);
+ return;
+ }
+ for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
+ if (!count)
+ break;
+ start = vmap_block_vaddr(vb->va->va_start, rs);
+ while (addr < start) {
+ if (count == 0)
+ break;
+ *buf = '\0';
+ buf++;
+ addr++;
+ count--;
+ }
+ /* it could start reading from the middle of a used region */
+ offset = offset_in_page(addr);
+ n = ((re - rs + 1) << PAGE_SHIFT) - offset;
+ if (n > count)
+ n = count;
+ aligned_vread(buf, start + offset, n);
+
+ buf += n;
+ addr += n;
+ count -= n;
+ }
+ spin_unlock(&vb->lock);
+
+ /* zero-fill the remaining dirty or free regions */
+ if (count)
+ memset(buf, 0, count);
+}
+
/**
* vread() - read vmalloc area in a safe way.
* @buf: buffer for reading data
@@ -3574,7 +3633,7 @@ long vread(char *buf, char *addr, unsigned long count)
struct vm_struct *vm;
char *vaddr, *buf_start = buf;
unsigned long buflen = count;
- unsigned long n;
+ unsigned long n, size, flags;
addr = kasan_reset_tag(addr);
@@ -3595,12 +3654,16 @@ long vread(char *buf, char *addr, unsigned long count)
if (!count)
break;
- if (!va->vm)
+ vm = va->vm;
+ flags = va->flags & VMAP_FLAGS_MASK;
+
+ if (!vm && !flags)
continue;
- vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + get_vm_area_size(vm))
+ vaddr = (char *) va->va_start;
+ size = vm ? get_vm_area_size(vm) : va_size(va);
+
+ if (addr >= vaddr + size)
continue;
while (addr < vaddr) {
if (count == 0)
@@ -3610,10 +3673,13 @@ long vread(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + get_vm_area_size(vm) - addr;
+ n = vaddr + size - addr;
if (n > count)
n = count;
- if (!(vm->flags & VM_IOREMAP))
+
+ if (flags & VMAP_RAM)
+ vmap_ram_vread(buf, addr, n, flags);
+ else if (!(vm->flags & VM_IOREMAP))
aligned_vread(buf, addr, n);
else /* IOREMAP area is treated as memory hole */
memset(buf, 0, n);
--
2.34.1
Hi Baoquan,
url: https://github.com/intel-lab-lkp/linux/commits/Baoquan-He/mm-vmalloc-c-add-used_map-into-vmap_block-to-track-space-of-vmap_block/20230113-112149
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20230113031921.64716-4-bhe%40redhat.com
patch subject: [PATCH v3 3/7] mm/vmalloc.c: allow vread() to read out vm_map_ram areas
config: i386-randconfig-m021
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
| Reported-by: Dan Carpenter <[email protected]>
smatch warnings:
mm/vmalloc.c:3682 vread() error: we previously assumed 'vm' could be null (see line 3664)
vim +/vm +3682 mm/vmalloc.c
^1da177e4c3f415 Linus Torvalds 2005-04-16 3630 long vread(char *buf, char *addr, unsigned long count)
^1da177e4c3f415 Linus Torvalds 2005-04-16 3631 {
e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3632 struct vmap_area *va;
e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3633 struct vm_struct *vm;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3634 char *vaddr, *buf_start = buf;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3635 unsigned long buflen = count;
129dbdf298d7383 Baoquan He 2023-01-13 3636 unsigned long n, size, flags;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3637
4aff1dc4fb3a5a3 Andrey Konovalov 2022-03-24 3638 addr = kasan_reset_tag(addr);
4aff1dc4fb3a5a3 Andrey Konovalov 2022-03-24 3639
^1da177e4c3f415 Linus Torvalds 2005-04-16 3640 /* Don't allow overflow */
^1da177e4c3f415 Linus Torvalds 2005-04-16 3641 if ((unsigned long) addr + count < count)
^1da177e4c3f415 Linus Torvalds 2005-04-16 3642 count = -(unsigned long) addr;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3643
e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3644 spin_lock(&vmap_area_lock);
f181234a5a21fd0 Chen Wandun 2021-09-02 3645 va = find_vmap_area_exceed_addr((unsigned long)addr);
f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29 3646 if (!va)
f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29 3647 goto finished;
f181234a5a21fd0 Chen Wandun 2021-09-02 3648
f181234a5a21fd0 Chen Wandun 2021-09-02 3649 /* no intersects with alive vmap_area */
f181234a5a21fd0 Chen Wandun 2021-09-02 3650 if ((unsigned long)addr + count <= va->va_start)
f181234a5a21fd0 Chen Wandun 2021-09-02 3651 goto finished;
f181234a5a21fd0 Chen Wandun 2021-09-02 3652
f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29 3653 list_for_each_entry_from(va, &vmap_area_list, list) {
e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3654 if (!count)
e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3655 break;
e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3656
129dbdf298d7383 Baoquan He 2023-01-13 3657 vm = va->vm;
129dbdf298d7383 Baoquan He 2023-01-13 3658 flags = va->flags & VMAP_FLAGS_MASK;
129dbdf298d7383 Baoquan He 2023-01-13 3659
129dbdf298d7383 Baoquan He 2023-01-13 3660 if (!vm && !flags)
^^^
vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3661 continue;
e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3662
129dbdf298d7383 Baoquan He 2023-01-13 3663 vaddr = (char *) va->va_start;
129dbdf298d7383 Baoquan He 2023-01-13 @3664 size = vm ? get_vm_area_size(vm) : va_size(va);
^^
129dbdf298d7383 Baoquan He 2023-01-13 3665
129dbdf298d7383 Baoquan He 2023-01-13 3666 if (addr >= vaddr + size)
^1da177e4c3f415 Linus Torvalds 2005-04-16 3667 continue;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3668 while (addr < vaddr) {
^1da177e4c3f415 Linus Torvalds 2005-04-16 3669 if (count == 0)
^1da177e4c3f415 Linus Torvalds 2005-04-16 3670 goto finished;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3671 *buf = '\0';
^1da177e4c3f415 Linus Torvalds 2005-04-16 3672 buf++;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3673 addr++;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3674 count--;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3675 }
129dbdf298d7383 Baoquan He 2023-01-13 3676 n = vaddr + size - addr;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3677 if (n > count)
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3678 n = count;
129dbdf298d7383 Baoquan He 2023-01-13 3679
129dbdf298d7383 Baoquan He 2023-01-13 3680 if (flags & VMAP_RAM)
assume VMAP_RAM is not set
129dbdf298d7383 Baoquan He 2023-01-13 3681 vmap_ram_vread(buf, addr, n, flags);
129dbdf298d7383 Baoquan He 2023-01-13 @3682 else if (!(vm->flags & VM_IOREMAP))
^^^^^^^^^
Unchecked dereference. Should this be "flags" instead of "vm->flags"?
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3683 aligned_vread(buf, addr, n);
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3684 else /* IOREMAP area is treated as memory hole */
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3685 memset(buf, 0, n);
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3686 buf += n;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3687 addr += n;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3688 count -= n;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3689 }
^1da177e4c3f415 Linus Torvalds 2005-04-16 3690 finished:
e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3691 spin_unlock(&vmap_area_lock);
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3692
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3693 if (buf == buf_start)
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3694 return 0;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3695 /* zero-fill memory holes */
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3696 if (buf != buf_start + buflen)
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3697 memset(buf, 0, buflen - (buf - buf_start));
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3698
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3699 return buflen;
^1da177e4c3f415 Linus Torvalds 2005-04-16 3700 }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
Hi Dan,
On 01/14/23 at 10:57am, Dan Carpenter wrote:
> Hi Baoquan,
>
> url: https://github.com/intel-lab-lkp/linux/commits/Baoquan-He/mm-vmalloc-c-add-used_map-into-vmap_block-to-track-space-of-vmap_block/20230113-112149
> base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> patch link: https://lore.kernel.org/r/20230113031921.64716-4-bhe%40redhat.com
> patch subject: [PATCH v3 3/7] mm/vmalloc.c: allow vread() to read out vm_map_ram areas
> config: i386-randconfig-m021
> compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
>
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <[email protected]>
> | Reported-by: Dan Carpenter <[email protected]>
>
> smatch warnings:
> mm/vmalloc.c:3682 vread() error: we previously assumed 'vm' could be null (see line 3664)
Thanks for checking this. I went through the code flow again, and I
personally think the issue and risk pointed out do not exist. Please see
the comments below.
>
> vim +/vm +3682 mm/vmalloc.c
>
> [...]
> f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29 3653 list_for_each_entry_from(va, &vmap_area_list, list) {
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3654 if (!count)
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3655 break;
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3656
> 129dbdf298d7383 Baoquan He 2023-01-13 3657 vm = va->vm;
> 129dbdf298d7383 Baoquan He 2023-01-13 3658 flags = va->flags & VMAP_FLAGS_MASK;
> 129dbdf298d7383 Baoquan He 2023-01-13 3659
> 129dbdf298d7383 Baoquan He 2023-01-13 3660 if (!vm && !flags)
> ^^^
> vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
>
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3661 continue;
Right, after the 'continue;' line, only two cases could happen when it
comes here. (vm != null) or (vm->flags & VMAP_RAM) is true.
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3662
> 129dbdf298d7383 Baoquan He 2023-01-13 3663 vaddr = (char *) va->va_start;
> 129dbdf298d7383 Baoquan He 2023-01-13 @3664 size = vm ? get_vm_area_size(vm) : va_size(va);
> ^^
>
> 129dbdf298d7383 Baoquan He 2023-01-13 3665
> 129dbdf298d7383 Baoquan He 2023-01-13 3666 if (addr >= vaddr + size)
> ^1da177e4c3f415 Linus Torvalds 2005-04-16 3667 continue;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16 3668 while (addr < vaddr) {
> ^1da177e4c3f415 Linus Torvalds 2005-04-16 3669 if (count == 0)
> ^1da177e4c3f415 Linus Torvalds 2005-04-16 3670 goto finished;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16 3671 *buf = '\0';
> ^1da177e4c3f415 Linus Torvalds 2005-04-16 3672 buf++;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16 3673 addr++;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16 3674 count--;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16 3675 }
> 129dbdf298d7383 Baoquan He 2023-01-13 3676 n = vaddr + size - addr;
> d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3677 if (n > count)
> d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21 3678 n = count;
> 129dbdf298d7383 Baoquan He 2023-01-13 3679
> 129dbdf298d7383 Baoquan He 2023-01-13 3680 if (flags & VMAP_RAM)
>
> assume VMAP_RAM is not set
OK, then vm is not null.
>
> 129dbdf298d7383 Baoquan He 2023-01-13 3681 vmap_ram_vread(buf, addr, n, flags);
> 129dbdf298d7383 Baoquan He 2023-01-13 @3682 else if (!(vm->flags & VM_IOREMAP))
> ^^^^^^^^^
> Unchecked dereference. Should this be "flags" instead of "vm->flags"?
Thus, here in the 'else if', vm is not NULL. And in this 'else if', we
intend to check vm->flags. I don't see an issue or risk here. Please
help check whether I missed anything.
Thanks
Baoquan
On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> [...]
> + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> +
> + spin_lock(&vb->lock);
> + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
>
CPU-X could invoke free_vmap_block() while we take the vb->lock and do
some manipulations on a vb that might already have been freed by the RCU core.
Should we also protect it with rcu_read_lock() here?
--
Uladzislau Rezki
On Sun, Jan 15, 2023 at 10:08:55PM +0800, Baoquan He wrote:
> > f181234a5a21fd0 Chen Wandun 2021-09-02 3650 if ((unsigned long)addr + count <= va->va_start)
> > f181234a5a21fd0 Chen Wandun 2021-09-02 3651 goto finished;
> > f181234a5a21fd0 Chen Wandun 2021-09-02 3652
> > f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29 3653 list_for_each_entry_from(va, &vmap_area_list, list) {
> > e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3654 if (!count)
> > e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3655 break;
> > e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3656
> > 129dbdf298d7383 Baoquan He 2023-01-13 3657 vm = va->vm;
> > 129dbdf298d7383 Baoquan He 2023-01-13 3658 flags = va->flags & VMAP_FLAGS_MASK;
> > 129dbdf298d7383 Baoquan He 2023-01-13 3659
> > 129dbdf298d7383 Baoquan He 2023-01-13 3660 if (!vm && !flags)
> > ^^^
> > vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
> >
> > e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3661 continue;
>
> Right, after the 'continue;' line, only two cases could happen when it
> comes here. (vm != null) or (vm->flags & VMAP_RAM) is true.
>
You're saying VMAP_RAM, but strictly speaking the code is checking
VMAP_FLAGS_MASK and not VMAP_RAM.
+#define VMAP_RAM 0x1 /* indicates vm_map_ram area */
+#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type */
+#define VMAP_FLAGS_MASK 0x3
If we assume that vm is NULL, VMAP_BLOCK is set and VMAP_RAM is clear
then it would lead to a NULL dereference. There might be reasons why
that combination is impossible outside the function but we can't tell
from the information we have here.
Which is fine, outside information is a common reason for false
positives with this check. But I was just concerned about the mix of
VMAP_FLAGS_MASK and VMAP_RAM.
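To spell the assumed scenario out (whether that state is actually
reachable is exactly the outside information we can't see here):

	/*
	 * Assume: vm == NULL, flags == VMAP_BLOCK (VMAP_RAM clear).
	 *
	 * if (!vm && !flags)     -> false, flags is non-zero, no 'continue'
	 * ...
	 * if (flags & VMAP_RAM)  -> false, VMAP_RAM is clear
	 * else if (!(vm->flags & VM_IOREMAP))
	 *                        -> dereferences vm, which is NULL
	 */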
regards,
dan carpenter
On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> + spin_lock(&vb->lock);
> + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> + spin_unlock(&vb->lock);
> + memset(buf, 0, count);
> + return;
> + }
> + for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> + if (!count)
> + break;
> + start = vmap_block_vaddr(vb->va->va_start, rs);
> + while (addr < start) {
> + if (count == 0)
> + break;
> + *buf = '\0';
> + buf++;
> + addr++;
> + count--;
> + }
> + /* it could start reading from the middle of a used region */
> + offset = offset_in_page(addr);
> + n = ((re - rs + 1) << PAGE_SHIFT) - offset;
> + if (n > count)
> + n = count;
> + aligned_vread(buf, start + offset, n);
The whole vread() interface is rather suboptimal. The only user is proc,
which is trying to copy to userspace. But the vread() interface copies
to a kernel address, so kcore has to copy to a bounce buffer. That makes
this spinlock work, but the price is that we can't copy to a user address
in the future. Ideally, read_kcore() would be kcore_read_iter() and
we'd pass an iov_iter into vread(). vread() would then need to use a
mutex rather than a spinlock.
I don't think this needs to be done now, but if someone's looking for
a project ...
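A rough sketch of that direction, for reference. The names here
(vread_iter(), vmap_read_mutex) are hypothetical, invented only to
illustrate the shape, not existing interfaces:

	static DEFINE_MUTEX(vmap_read_mutex);	/* hypothetical */

	/* iterator-based variant of vread(): copies straight into the
	 * iov_iter, which may target user memory and therefore fault */
	long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
	{
		long copied = 0;

		/* copy_to_iter() can sleep on a user-space fault, so the
		 * area walk must happen under a mutex, not a spinlock */
		mutex_lock(&vmap_read_mutex);
		/* ... walk the vmap areas as vread() does today, but
		 * replace each bounce-buffer copy with something like:
		 *
		 *	copied += copy_to_iter(kaddr, n, iter);
		 */
		mutex_unlock(&vmap_read_mutex);

		return copied;
	}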
On Mon, Jan 16, 2023 at 07:01:33PM +0000, Matthew Wilcox wrote:
> On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > [...]
>
> The whole vread() interface is rather suboptimal. The only user is proc,
> which is trying to copy to userspace. But the vread() interface copies
> to a kernel address, so kcore has to copy to a bounce buffer. That makes
> this spinlock work, but the price is that we can't copy to a user address
> in the future. Ideally, read_kcore() would be kcore_read_iter() and
> we'd pass an iov_iter into vread(). vread() would then need to use a
> mutex rather than a spinlock.
>
> I don't think this needs to be done now, but if someone's looking for
> a project ...
Interesting! I may take a look at this if I get the time :)
On 01/16/23 at 04:08pm, Dan Carpenter wrote:
> On Sun, Jan 15, 2023 at 10:08:55PM +0800, Baoquan He wrote:
> > > [...]
> > > 129dbdf298d7383 Baoquan He 2023-01-13 3660 if (!vm && !flags)
> > > ^^^
> > > vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
> > >
> > > e81ce85f960c2e2 Joonsoo Kim 2013-04-29 3661 continue;
> >
> > Right, after the 'continue;' line, only two cases could happen when it
> > comes here. (vm != null) or (vm->flags & VMAP_RAM) is true.
> >
>
> You're saying VMAP_RAM, but strictly speaking the code is checking
> VMAP_FLAGS_MASK and not VMAP_RAM.
>
> +#define VMAP_RAM 0x1 /* indicates vm_map_ram area */
> +#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type */
> +#define VMAP_FLAGS_MASK 0x3
>
> If we assume that vm is NULL, VMAP_BLOCK is set and VMAP_RAM is clear
> then it would lead to a NULL dereference. There might be reasons why
> that combination is impossible outside the function but we can't tell
> from the information we have here.
VMAP_BLOCK is never set alone; it is always set together with VMAP_RAM
when needed.
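That is, paraphrasing the flag-setting sites elsewhere in this series (a
sketch, not the exact lines):

	/* vm_map_ram(), area used directly: */
	va->flags = VMAP_RAM;

	/* new_vmap_block(), area subdivided and managed via vmap_block: */
	va->flags = VMAP_RAM | VMAP_BLOCK;

So the only non-zero flag values are VMAP_RAM and VMAP_RAM | VMAP_BLOCK.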
>
> Which is fine, outside information is a common reason for false
> positives with this check. But I was just concerned about the mix of
> VMAP_FLAGS_MASK and VMAP_RAM.
Thanks, I see your point now; I will consider how to improve it.
On 01/16/23 at 12:50pm, Uladzislau Rezki wrote:
> On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > [...]
> > + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > +
> > + spin_lock(&vb->lock);
> > + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> >
> CPU-X could invoke free_vmap_block() while we take the vb->lock and do
> some manipulations on a vb that might already have been freed by the RCU core.
>
> Should we also protect it with rcu_read_lock() here?
I just went over the vb and vbq code again, and it seems we don't need
rcu_read_lock() here. The RCU lock is needed when operating on the
vmap_block_queue->free list. I don't see a race between the vb access
here and the RCU list additions or removals on vmap_block_queue->free.
If I missed some race window between them, please help point it out.
However, when I check free_vmap_block(), I do find a risk. As you said,
CPU-x could invoke free_vmap_block() and execute xa_erase() to remove the vb
from the vmap_blocks tree. Then vread() comes into vmap_ram_vread() and calls
xa_load(), which would return NULL. I should check the returned vb in
vmap_ram_vread().
static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
{
......
if (!(flags & VMAP_BLOCK)) {
aligned_vread(buf, addr, count);
return;
}
/*
* The area is split into regions and tracked with vmap_block; read
* out each region and zero-fill the holes between regions.
*/
vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
if (!vb) {  <-- vb needs to be checked here to avoid accessing a vb already erased from the vmap_blocks tree
memset(buf, 0, count);
return;
}
......
}
On 01/19/23 at 05:52pm, Baoquan He wrote:
> On 01/16/23 at 12:50pm, Uladzislau Rezki wrote:
> > On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > > [...]
> > > + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > > +
> > > + spin_lock(&vb->lock);
> > > + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> > >
> > CPU-X could invoke free_vmap_block() while we take the vb->lock and do
> > some manipulations on a vb that might already have been freed by the RCU core.
> >
> > Should we also protect it with rcu_read_lock() here?
>
> I just went over the vb and vbq code again, and it seems we don't need
> rcu_read_lock() here. The RCU lock is needed when operating on the
> vmap_block_queue->free list. I don't see a race between the vb access
> here and the RCU list additions or removals on vmap_block_queue->free.
> If I missed some race window between them, please help point it out.
>
> However, when I check free_vmap_block(), I do find a risk. As you said,
Forgot to add details about why there's no race between free_vmap_block()
and vmap_ram_vread(): we have taken vmap_area_lock at the beginning of
vread(). So, apart from the missing check on the value returned from
xa_load(), free_vmap_block() is either blocked waiting for vmap_area_lock
before calling unlink_va(), or has already finished calling unlink_va() to
remove the vmap_area from the vmap_area_root tree. In both cases, no race
happens.
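Roughly, the interleaving in question looks like this (a sketch of the
argument above, not verbatim code):

	CPU-0: vread()                    CPU-1: free_vmap_block()
	--------------                    ------------------------
	spin_lock(&vmap_area_lock);
	                                  xa_erase(&vmap_blocks, ...);
	                                  /* unlink_va() must now wait
	                                   * for vmap_area_lock */
	vmap_ram_vread()
	  vb = xa_load(...);
	  /* NULL if the xa_erase() above
	   * already ran, hence the check */
	spin_unlock(&vmap_area_lock);
	                                  /* unlink_va() proceeds */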
> CPU-x could invoke free_vmap_block() and execute xa_erase() to remove the vb
> from the vmap_blocks tree. Then vread() comes into vmap_ram_vread() and calls
> xa_load(), which would return NULL. I should check the returned vb in
> vmap_ram_vread().
> On 01/19/23 at 05:52pm, Baoquan He wrote:
> > On 01/16/23 at 12:50pm, Uladzislau Rezki wrote:
> > > On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > > > [...]
> > > > + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > > > +
> > > > + spin_lock(&vb->lock);
> > > > + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> > > >
> > > CPU-X could invoke free_vmap_block() while we take the vb->lock and do
> > > some manipulations on a vb that might already have been freed by the RCU core.
> > >
> > > Should we also protect it with rcu_read_lock() here?
> >
> > I just went over the vb and vbq code again, and it seems we don't need
> > rcu_read_lock() here. The RCU lock is needed when operating on the
> > vmap_block_queue->free list. I don't see a race between the vb access
> > here and the RCU list additions or removals on vmap_block_queue->free.
> > If I missed some race window between them, please help point it out.
> >
> > However, when I check free_vmap_block(), I do find a risk. As you said,
>
> Forgot to add details about why there's no race between free_vmap_block()
> and vmap_ram_vread(): we have taken vmap_area_lock at the beginning of
> vread(). So, apart from the missing check on the value returned from
> xa_load(), free_vmap_block() is either blocked waiting for vmap_area_lock
> before calling unlink_va(), or has already finished calling unlink_va() to
> remove the vmap_area from the vmap_area_root tree. In both cases, no race
> happens.
>
Agreed. xa_load()'s return value should be checked, because it can happen
that there is no vmap_block associated with an address if xa_erase() was
done earlier.
--
Uladzislau Rezki