2022-06-01 20:19:19

by Cindy Lu

Subject: [RFC PATCH v2] vdpa: Do not count the pages that were already pinned in the vhost-vDPA

We currently count pinned_vm in vhost-vDPA as follows:

lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
goto unlock;
}
This means that if we have two vDPA devices for the same VM, the pages
would be counted twice. To avoid this, add an interval tree that records
the ranges that have already been counted, so a page pinned by more than
one device is only charged to pinned_vm once.
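
A hypothetical example with 4K pages (the addresses are made up for
illustration): device A maps IOVA range [0x0000, 0x3fff] and device B
maps [0x2000, 0x5fff] of the same VM. Today pinned_vm is charged
4 + 4 = 8 pages although only 6 distinct pages are pinned. With the
tree, the overlap [0x2000, 0x3fff] is stored with ref = 2 and charged
only once, so pinned_vm grows by exactly 6 pages.
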
Signed-off-by: Cindy Lu <[email protected]>
---
drivers/vhost/vdpa.c | 542 +++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/vhost.h | 1 +
2 files changed, 539 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 05f5fd2af58f..1b0da0735efd 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -24,6 +24,10 @@
#include <linux/vhost.h>

#include "vhost.h"
+#include <linux/rbtree.h>
+#include <linux/interval_tree.h>
+#include <linux/interval_tree_generic.h>
+#include <linux/hashtable.h>

enum {
VHOST_VDPA_BACKEND_FEATURES =
@@ -506,12 +510,478 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
return r;
}

+struct vdpa_tree_node {
+ struct interval_tree_node tree_node;
+ int ref;
+};
+struct vdpa_link_node {
+ struct vdpa_tree_node *vdpa_node;
+ struct vdpa_link_node *next;
+ u64 node_start;
+ u64 node_last;
+};
+
+int vhost_vdpa_add_range_ctx(struct rb_root_cached *root, u64 start, u64 last,
+ int ref)
+{
+ struct interval_tree_node *new_node;
+ struct vdpa_tree_node *vdpa_node;
+
+ if (last < start)
+ return -EFAULT;
+
+ /*
+ * If the range being mapped is [0, ULONG_MAX], split it into two
+ * entries, otherwise its size would overflow u64.
+ */
+ if (start == 0 && last == ULONG_MAX) {
+ u64 mid = last / 2;
+
+ vhost_vdpa_add_range_ctx(root, start, mid, ref);
+ start = mid + 1;
+ }
+ vdpa_node = kmalloc(sizeof(struct vdpa_tree_node), GFP_ATOMIC);
+ if (!vdpa_node)
+ return -ENOMEM;
+
+ new_node = &vdpa_node->tree_node;
+
+ new_node->start = start;
+ new_node->last = last;
+ vdpa_node->ref = ref;
+
+ interval_tree_insert(new_node, root);
+
+ return 0;
+}
+
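+/*
+ * Walk the sorted list of tree nodes that overlapped [start, last].
+ * Each overlapped node is split at the overlap boundaries and the
+ * pieces are re-inserted: the overlapping piece with ref + 1, the
+ * non-overlapping pieces with the old ref.  Gaps that no existing
+ * node covered are inserted with ref = 1, and only those gaps count
+ * towards the returned size, so only pages that were not counted
+ * before get charged to pinned_vm.
+ */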
+u64 vhost_vdpa_range_ref_add(struct rb_root_cached *root,
+ struct vdpa_link_node *link_head, int node_number,
+ u64 start, u64 last)
+{
+ int i = 0;
+ u64 size = 0;
+ int new_ref;
+ u64 node_start;
+ u64 node_last;
+ u64 range_start;
+ u64 range_last;
+ int range_size;
+ struct vdpa_link_node *link_node;
+ struct vdpa_tree_node *vdpa_node = NULL;
+ struct interval_tree_node *node = NULL;
+
+ if (node_number == 0) {
+ vhost_vdpa_add_range_ctx(root, start, last, 1);
+
+ size = last - start + 1;
+ return size;
+ }
+
+ link_node = link_head;
+ range_start = start;
+ range_last = last;
+ for (i = 0; i < node_number; i++) {
+ vdpa_node = link_node->vdpa_node;
+ link_node = link_node->next;
+ node = &vdpa_node->tree_node;
+ new_ref = vdpa_node->ref;
+ node_start = node->start;
+ node_last = node->last;
+
+ if (range_start == node_start) {
+ if (node_last < range_last) {
+ /* range_start= node->start--- node->last--range_last*/
+ vhost_vdpa_add_range_ctx(root, node_start,
+ node_last,
+ new_ref + 1);
+ /*count the next range */
+ } else if (node_last > range_last) {
+ /* range_start= node->start --- last -- node->last*/
+ vhost_vdpa_add_range_ctx(root, node_start,
+ range_last,
+ new_ref + 1);
+ vhost_vdpa_add_range_ctx(root, range_last + 1,
+ node_last, new_ref);
+ } else {
+ vhost_vdpa_add_range_ctx(root, node_start,
+ node_last,
+ new_ref + 1);
+ }
+ } else if (node_start < range_start) {
+ if (range_last < node_last) {
+ /* node->start--- start--- last--- node->last*/
+ /* this should be the end of the range */
+
+ vhost_vdpa_add_range_ctx(root, node_start,
+ range_start - 1,
+ new_ref);
+ vhost_vdpa_add_range_ctx(root, range_start,
+ range_last,
+ new_ref + 1);
+ vhost_vdpa_add_range_ctx(root, range_last + 1,
+ node_last, new_ref);
+
+ } else if (range_last > node_last) {
+ /* node->start--- start--- node->last-- last*/
+
+ vhost_vdpa_add_range_ctx(root, node_start,
+ range_start - 1,
+ new_ref);
+ vhost_vdpa_add_range_ctx(root, range_start,
+ node_last,
+ new_ref + 1);
+ } else {
+ /* node->start--- start--- node->last= last*/
+ vhost_vdpa_add_range_ctx(root, node_start,
+ range_start - 1,
+ new_ref);
+ vhost_vdpa_add_range_ctx(root, range_start,
+ node_last,
+ new_ref + 1);
+ /* this should be the end of the range */
+ }
+ } else {
+ if (node_last < range_last) {
+ /* range_start --- node->start --- node->last ----last */
+
+ vhost_vdpa_add_range_ctx(root, range_start,
+ node_start - 1, 1);
+ vhost_vdpa_add_range_ctx(root, node_start,
+ node_last,
+ new_ref + 1);
+ size += ((node_start - 1) - range_start) + 1;
+ } else if (node_last > range_last) {
+ /* range_start--- node->start --- last -- node->last */
+ vhost_vdpa_add_range_ctx(root, range_start,
+ node_start - 1, 1);
+ vhost_vdpa_add_range_ctx(root, node_start,
+ range_last,
+ new_ref + 1);
+ vhost_vdpa_add_range_ctx(root, range_last + 1,
+ node_last, new_ref);
+ size += ((node_start - 1) - range_start) + 1;
+
+ /* this should be the end of the range */
+ } else {
+ /* range_start--- node->start --- last = node->last */
+ vhost_vdpa_add_range_ctx(root, range_start,
+ node_start - 1, 1);
+ vhost_vdpa_add_range_ctx(root, node_start,
+ node_last,
+ new_ref + 1);
+ size += ((node_start - 1) - range_start) + 1;
+
+ /* this should be the end of the range */
+ }
+ }
+ /* work in next node*/
+ range_start = node_last + 1;
+ if (range_start > range_last)
+ break;
+ }
+
+ range_size = range_last - range_start;
+
+ /* last round and part of the range is still uncovered */
+
+ if ((range_size >= 0) && (range_start >= node_last) &&
+ (node_number == i + 1)) {
+ vhost_vdpa_add_range_ctx(root, range_start, range_last, 1);
+ size = size + (range_last - range_start) + 1;
+ } else if ((range_size == -1) && (node_number == i + 1)) {
+ return size;
+ } else {
+ printk(KERN_WARNING
+ "%s %d FAIL start %llu last %llu node->start %llu node->last %llu i %d\n",
+ __func__, __LINE__, range_start, range_last, node_start,
+ node_last, i);
+ }
+
+ return size;
+}
+
+u64 vhost_vdpa_range_ref_del(struct rb_root_cached *root,
+ struct vdpa_link_node *link_head, int node_number,
+ u64 start, u64 last)
+{
+ int i = 0;
+ u64 size = 0;
+ int new_ref;
+ u64 node_start;
+ u64 node_last;
+ u64 range_start;
+ u64 range_last;
+ int range_size;
+ struct vdpa_link_node *link_node;
+ struct vdpa_tree_node *vdpa_node = NULL;
+ struct interval_tree_node *node = NULL;
+
+ if (node_number == 0)
+ return 0;
+
+ link_node = link_head;
+ range_start = start;
+ range_last = last;
+
+ for (i = 0; i < node_number; i++) {
+ vdpa_node = link_node->vdpa_node;
+ link_node = link_node->next;
+ node = &vdpa_node->tree_node;
+ new_ref = vdpa_node->ref;
+ node_start = node->start;
+ node_last = node->last;
+
+ if (range_start == node_start) {
+ if (node_last < range_last) {
+ /* range_start =node->start --- node->last ----last*/
+ if (new_ref > 1) {
+ vhost_vdpa_add_range_ctx(root,
+ node_start,
+ node_last,
+ new_ref - 1);
+ /*count the next range */
+ } else {
+ /* if the ref =0, do not need add it back, count size*/
+ size += (node_last - node_start) + 1;
+ }
+
+ } else if (node_last > range_last) {
+ /* range_start= node->start --- last -- node->last*/
+
+ if (new_ref > 1) {
+ vhost_vdpa_add_range_ctx(root,
+ node_start,
+ range_last,
+ new_ref - 1);
+ } else {
+ size += (range_last - node_start) + 1;
+ }
+ vhost_vdpa_add_range_ctx(root, range_last + 1,
+ node_last, new_ref);
+ } else {
+ /* range_start= node->start --- last = node->last*/
+
+ if (new_ref > 1) {
+ vhost_vdpa_add_range_ctx(root,
+ node_start,
+ range_last,
+ new_ref - 1);
+ } else {
+ size += (range_last - node_start) + 1;
+ }
+ /* should be the end */
+ }
+ } else if (node_start < range_start) {
+ if (range_last < node_last) {
+ /* node->start--- start--- last--- node->last*/
+ /* this should be the end of the range */
+ vhost_vdpa_add_range_ctx(root, node_start,
+ range_start - 1,
+ new_ref);
+ if (new_ref > 1) {
+ vhost_vdpa_add_range_ctx(root,
+ range_start,
+ range_last,
+ new_ref - 1);
+ } else {
+ size += (range_last - range_start) + 1;
+ }
+ vhost_vdpa_add_range_ctx(root, range_last + 1,
+ node_last, new_ref);
+
+ } else if (range_last > node_last) {
+ /* node->start--- start--- node->last--- last*/
+
+ vhost_vdpa_add_range_ctx(root, node_start,
+ range_start - 1,
+ new_ref);
+ if (new_ref > 1) {
+ vhost_vdpa_add_range_ctx(root,
+ range_start,
+ node_last,
+ new_ref - 1);
+ } else {
+ size += (node_last - range_start) + 1;
+ }
+ } else {
+ /* node->start--- start--- node->last= last*/
+ vhost_vdpa_add_range_ctx(root, node_start,
+ range_start - 1,
+ new_ref);
+ if (new_ref > 1) {
+ vhost_vdpa_add_range_ctx(root,
+ range_start,
+ range_last,
+ new_ref - 1);
+ } else {
+ size += (range_last - range_start) + 1;
+ }
+ /* should be the end */
+ }
+ } else {
+ /* some range not in the node, error*/
+ printk(KERN_WARNING
+ "%s %d FAIL start %llu last %llu node->start %llu node->last %llu new_ref %d\n",
+ __func__, __LINE__, range_start, range_last,
+ node_start, node_last, new_ref);
+ }
+
+ range_start = node_last + 1;
+ if (range_start > range_last)
+ break;
+ }
+
+ range_size = range_last - range_start;
+
+ /* last round and part of the range is still uncovered */
+
+ if ((range_size > 0) && (node_number == i + 1)) {
+ printk(KERN_WARNING
+ "%s %d FAIL start %llu last %llu node->start %llu node->last %llu range_size %d\n",
+ __func__, __LINE__, range_start, range_last, node_start,
+ node_last, range_size);
+ }
+ return size;
+}
+
+struct vdpa_link_node *vhost_vdpa_merge_list(struct vdpa_link_node *list1,
+ struct vdpa_link_node *list2)
+{
+ struct vdpa_link_node dummy_head;
+ struct vdpa_link_node *ptr = &dummy_head;
+
+ while (list1 && list2) {
+ if (list1->node_start < list2->node_start) {
+ ptr->next = list1;
+ list1 = list1->next;
+ } else {
+ ptr->next = list2;
+ list2 = list2->next;
+ }
+ ptr = ptr->next;
+ }
+ if (list1)
+ ptr->next = list1;
+ else
+ ptr->next = list2;
+
+ return dummy_head.next;
+}
+
+struct vdpa_link_node *vhost_vdpa_get_mid(struct vdpa_link_node *head)
+{
+ struct vdpa_link_node *mid_prev = NULL;
+ struct vdpa_link_node *mid;
+
+ while (head && head->next) {
+ mid_prev = (mid_prev == NULL) ? head : mid_prev->next;
+ head = head->next->next;
+ }
+ mid = mid_prev->next;
+ mid_prev->next = NULL;
+ return mid;
+}
+
+struct vdpa_link_node *vhost_vdpa_sort_list(struct vdpa_link_node *head)
+{
+ struct vdpa_link_node *mid;
+ struct vdpa_link_node *left;
+ struct vdpa_link_node *right;
+
+ if (!head || !head->next)
+ return head;
+
+ mid = vhost_vdpa_get_mid(head);
+ left = vhost_vdpa_sort_list(head);
+ right = vhost_vdpa_sort_list(mid);
+ return vhost_vdpa_merge_list(left, right);
+}
+
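+/*
+ * Remove every tree node overlapping [start, last] into a temporary
+ * list, sort the list by start address and hand it to the add/del
+ * helper above, which re-inserts the split pieces with adjusted refs.
+ * The returned size is the number of bytes that became newly counted
+ * (ops == true) or completely uncounted (ops == false), i.e. the
+ * amount to charge to or release from pinned_vm.
+ */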
+u64 vhost_vdpa_range_ops(struct rb_root_cached *root, u64 start, u64 last,
+ bool ops)
+{
+ struct interval_tree_node *node = NULL;
+ struct vdpa_tree_node *vdpa_node;
+ int node_number = 0;
+ int i = 0;
+ u64 size = 0;
+ struct vdpa_link_node dummy_head = { 0 };
+ struct vdpa_link_node *link_node;
+ struct vdpa_link_node *link_head_tmp;
+ struct vdpa_link_node *pre_link_node;
+
+ pre_link_node = &dummy_head;
+ /* search for ranges overlapping [start, last] and remove them from the tree */
+ for (node = interval_tree_iter_first(root, start, last); node;
+ node = interval_tree_iter_next(node, start, last)) {
+ link_node = kmalloc(sizeof(struct vdpa_link_node), GFP_ATOMIC);
+ if (!link_node) {
+ link_head_tmp = dummy_head.next;
+ goto out;
+ }
+ vdpa_node =
+ container_of(node, struct vdpa_tree_node, tree_node);
+ link_node->vdpa_node = vdpa_node;
+ link_node->node_start = node->start;
+ link_node->node_last = node->last;
+
+ pre_link_node->next = link_node;
+ pre_link_node = link_node;
+ pre_link_node->next = NULL;
+
+ node_number++;
+
+ interval_tree_remove(node, root);
+ }
+ /* sort the collected nodes by their start address */
+ link_head_tmp = vhost_vdpa_sort_list(dummy_head.next);
+
+ /* these linked nodes overlap the range; check the refs and add the pieces back to the tree */
+ if (ops) {
+ size = vhost_vdpa_range_ref_add(root, link_head_tmp,
+ node_number, start, last);
+ } else {
+ size = vhost_vdpa_range_ref_del(root, link_head_tmp,
+ node_number, start, last);
+ }
+out:
+ pre_link_node = link_head_tmp;
+
+ for (i = 0; i < node_number; i++) {
+ vdpa_node = pre_link_node->vdpa_node;
+ link_node = pre_link_node->next;
+ kfree(vdpa_node);
+ kfree(pre_link_node);
+ pre_link_node = link_node;
+ }
+ return size;
+}
+
+u64 vhost_vdpa_search_range_add(struct rb_root_cached *root, u64 start,
+ u64 last)
+{
+ u64 size;
+
+ size = vhost_vdpa_range_ops(root, start, last, true);
+
+ return size;
+}
+
+u64 vhost_vdpa_search_range_del(struct rb_root_cached *root, u64 start,
+ u64 last)
+{
+ u64 size;
+
+ size = vhost_vdpa_range_ops(root, start, last, false);
+
+ return size;
+}
+
static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
{
struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct vhost_iotlb_map *map;
struct page *page;
+ u64 size;
unsigned long pfn, pinned;

while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
@@ -523,7 +993,11 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
set_page_dirty_lock(page);
unpin_user_page(page);
}
- atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
+
+ size = vhost_vdpa_search_range_del(dev->vdpa_mem_tree,
+ map->start,
+ map->start + map->size - 1);
+ atomic64_sub(PFN_DOWN(size), &dev->mm->pinned_vm);
vhost_iotlb_map_free(iotlb, map);
}
}
@@ -591,6 +1065,7 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova,
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
int r = 0;
+ u64 size_count;

r = vhost_iotlb_add_range_ctx(dev->iotlb, iova, iova + size - 1,
pa, perm, opaque);
@@ -610,9 +1085,11 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova,
vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
return r;
}
-
- if (!vdpa->use_va)
- atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
+ if (!vdpa->use_va) {
+ size_count = vhost_vdpa_search_range_add(dev->vdpa_mem_tree,
+ iova, iova + size - 1);
+ atomic64_add(PFN_DOWN(size_count), &dev->mm->pinned_vm);
+ }

return 0;
}
@@ -946,6 +1423,58 @@ static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
}
}

+struct root_for_vdpa_node {
+ struct hlist_node hlist;
+ struct rb_root_cached vdpa_mem_tree;
+ pid_t pid_using;
+};
+static DECLARE_HASHTABLE(root_for_vdpa_node_list, 8);
+int status_for_vdpa_tree = 0;
+
+struct root_for_vdpa_node *vhost_vdpa_get_mem_tree(struct task_struct *task)
+{
+ struct root_for_vdpa_node *root_get_tmp = NULL;
+ pid_t pid_using = task_pid_nr(task);
+
+ /* No hash table yet, initialize one */
+ if (status_for_vdpa_tree == 0) {
+ hash_init(root_for_vdpa_node_list);
+ status_for_vdpa_tree = 1;
+ }
+
+ hash_for_each_possible(root_for_vdpa_node_list, root_get_tmp, hlist,
+ pid_using) {
+ if (root_get_tmp->pid_using == pid_using)
+ return root_get_tmp;
+ }
+
+ root_get_tmp = kmalloc(sizeof(*root_get_tmp), GFP_KERNEL);
+ root_get_tmp->pid_using = pid_using;
+
+ root_get_tmp->vdpa_mem_tree = RB_ROOT_CACHED;
+
+ hash_add(root_for_vdpa_node_list, &root_get_tmp->hlist,
+ root_get_tmp->pid_using);
+
+ return root_get_tmp;
+}
+
+void vhost_vdpa_relase_mem_tree(struct task_struct *task)
+{
+ struct root_for_vdpa_node *root_get_tmp = NULL;
+ pid_t pid_using = task_pid_nr(task);
+
+ /* find the node that belongs to this pid, unlink it and free it */
+ hash_for_each_possible(root_for_vdpa_node_list, root_get_tmp, hlist,
+ pid_using) {
+ if (root_get_tmp->pid_using == pid_using) {
+ hash_del(&root_get_tmp->hlist);
+ kfree(root_get_tmp);
+ return;
+ }
+ }
+}
+
static int vhost_vdpa_open(struct inode *inode, struct file *filep)
{
struct vhost_vdpa *v;
@@ -991,10 +1520,13 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep)
vhost_vdpa_set_iova_range(v);

filep->private_data = v;
+ struct root_for_vdpa_node *tmp = vhost_vdpa_get_mem_tree(current);
+ dev->vdpa_mem_tree = &tmp->vdpa_mem_tree;

return 0;

err_init_iotlb:
+ vhost_vdpa_relase_mem_tree(current);
vhost_dev_cleanup(&v->vdev);
kfree(vqs);
err:
@@ -1016,6 +1548,8 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep)
struct vhost_dev *d = &v->vdev;

mutex_lock(&d->mutex);
+ vhost_vdpa_relase_mem_tree(current);
+
filep->private_data = NULL;
vhost_vdpa_clean_irq(v);
vhost_vdpa_reset(v);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 638bb640d6b4..d1c662eb4f26 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -161,6 +161,7 @@ struct vhost_dev {
int byte_weight;
u64 kcov_handle;
bool use_worker;
+ struct rb_root_cached *vdpa_mem_tree;
int (*msg_handler)(struct vhost_dev *dev,
struct vhost_iotlb_msg *msg);
};
--
2.34.3



2022-06-01 21:24:10

by Jason Wang

Subject: Re: [RFC PATCH v2] vdpa: Do not count the pages that were already pinned in the vhost-vDPA

On Wed, Jun 1, 2022 at 9:20 AM Cindy Lu <[email protected]> wrote:
>
> We currently count pinned_vm in vhost-vDPA as follows:
>
> lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
> ret = -ENOMEM;
> goto unlock;
> }
> This means that if we have two vDPA devices for the same VM, the pages
> would be counted twice. To avoid this, add an interval tree that records
> the ranges that have already been counted, so a page pinned by more than
> one device is only charged to pinned_vm once.

The code is not easy to review; some suggestions:

- It's better to explain the general algorithm you used here
- Add more comments in the code to explain the rationale

And I still see the above check against the RLIMIT in the code; is that
intentional?

> Signed-off-by: Cindy Lu <[email protected]>
> ---
> drivers/vhost/vdpa.c | 542 +++++++++++++++++++++++++++++++++++++++++-
> drivers/vhost/vhost.h | 1 +
> 2 files changed, 539 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> index 05f5fd2af58f..1b0da0735efd 100644
> --- a/drivers/vhost/vdpa.c
> +++ b/drivers/vhost/vdpa.c
> @@ -24,6 +24,10 @@
> #include <linux/vhost.h>
>
> #include "vhost.h"
> +#include <linux/rbtree.h>
> +#include <linux/interval_tree.h>
> +#include <linux/interval_tree_generic.h>
> +#include <linux/hashtable.h>
>
> enum {
> VHOST_VDPA_BACKEND_FEATURES =
> @@ -506,12 +510,478 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
> return r;
> }
>
> +struct vdpa_tree_node {
> + struct interval_tree_node tree_node;

Can we simply reuse the vhost_iotlb tree? Note that a vhost_iotlb_map
can be associated with an opaque token, which could be used as a
reference count.
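
Something like this untested sketch (the helper name is mine; it assumes
a second vhost_iotlb instance, created with vhost_iotlb_alloc(), that is
used purely for accounting, and it elides the splitting of partially
overlapping ranges that this patch implements):

static u64 vhost_vdpa_account_add(struct vhost_iotlb *mem_tree,
				  u64 start, u64 last)
{
	struct vhost_iotlb_map *map;

	map = vhost_iotlb_itree_first(mem_tree, start, last);
	if (!map) {
		/* nothing counted yet: record the range with ref = 1 */
		if (vhost_iotlb_add_range_ctx(mem_tree, start, last, 0,
					      VHOST_ACCESS_RO, (void *)1UL))
			return 0;
		return last - start + 1;
	}

	if (map->start <= start && map->last >= last) {
		/* fully covered already: bump the ref, charge nothing */
		map->opaque = (void *)((unsigned long)map->opaque + 1);
		return 0;
	}

	/* partial overlap: needs the same splitting as in this patch */
	return 0;
}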

> + int ref;

If it's a refcount, let's use unsigned here.

> +};
> +struct vdpa_link_node {
> + struct vdpa_tree_node *vdpa_node;
> + struct vdpa_link_node *next;

Need to explain why we need a linked list here considering we've
already structured it as an interval tree.

Btw, unless it's performance critical, let's try to reuse the kernel's list.h.

> + u64 node_start;
> + u64 node_last;

Let's add a comment to explain each member here.
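
E.g. something like this (an untested sketch; with list.h the
hand-rolled merge sort below could also be replaced by list_sort()
from <linux/list_sort.h>):

struct vdpa_link_node {
	/* linkage in the temporary list of overlapped nodes */
	struct list_head list;
	/* the tree node that was removed from the interval tree */
	struct vdpa_tree_node *vdpa_node;
	/* copies of vdpa_node->tree_node.{start,last}, kept so the
	 * sort does not have to chase the extra pointer
	 */
	u64 node_start;
	u64 node_last;
};

static int vdpa_link_cmp(void *priv, const struct list_head *a,
			 const struct list_head *b)
{
	const struct vdpa_link_node *na, *nb;

	na = list_entry(a, struct vdpa_link_node, list);
	nb = list_entry(b, struct vdpa_link_node, list);

	return na->node_start < nb->node_start ? -1 : 1;
}

Collect the nodes with list_add_tail() and a single
list_sort(NULL, &head, vdpa_link_cmp) then replaces
vhost_vdpa_sort_list()/vhost_vdpa_get_mid()/vhost_vdpa_merge_list().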

> +};
> +
> +int vhost_vdpa_add_range_ctx(struct rb_root_cached *root, u64 start, u64 last,
> + int ref)

We don't want to export this symbol, so let's make the function
static; the same goes for the other functions.

> +{
> + struct interval_tree_node *new_node;
> + struct vdpa_tree_node *vdpa_node;
> +
> + if (last < start)
> + return -EFAULT;
> +
> + /* If the range being mapped is [0, ULONG_MAX], split it into two entries
> + * otherwise its size would overflow u64.
> + */
> + if (start == 0 && last == ULONG_MAX) {
> + u64 mid = last / 2;
> +
> + vhost_vdpa_add_range_ctx(root, start, mid, ref);
> + start = mid + 1;
> + }
> + vdpa_node = kmalloc(sizeof(struct vdpa_tree_node), GFP_ATOMIC);
> +

Let's check if kmalloc succeeds here.
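
I.e. something along these lines:

	vdpa_node = kmalloc(sizeof(*vdpa_node), GFP_ATOMIC);
	if (!vdpa_node)
		return -ENOMEM;

	new_node = &vdpa_node->tree_node;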


> + new_node = &vdpa_node->tree_node;
> + if (!new_node)
> + return -ENOMEM;
> +
> + new_node->start = start;
> + new_node->last = last;
> + vdpa_node->ref = ref;
> +
> + interval_tree_insert(new_node, root);
> +
> + return 0;
> +}
> +
> +u64 vhost_vdpa_range_ref_add(struct rb_root_cached *root,
> + struct vdpa_link_node *link_head, int node_number,
> + u64 start, u64 last)
> +{
> + int i = 0;
> + u64 size = 0;
> + int new_ref;
> + u64 node_start;
> + u64 node_last;
> + u64 range_start;
> + u64 range_last;
> + int range_size;
> + struct vdpa_link_node *link_node;
> + struct vdpa_tree_node *vdpa_node = NULL;
> + struct interval_tree_node *node = NULL;
> +
> + if (node_number == 0) {
> + vhost_vdpa_add_range_ctx(root, start, last, 1);
> +
> + size = last - start + 1;
> + return size;
> + }
> +
> + link_node = link_head;
> + range_start = start;
> + range_last = last;
> + range_size = range_start - range_last;
> + for (i = 0; i < node_number; i++) {
> + vdpa_node = link_node->vdpa_node;
> + link_node = link_node->next;
> + node = &vdpa_node->tree_node;
> + new_ref = vdpa_node->ref;
> + node_start = node->start;
> + node_last = node->last;
> +
> + if (range_start == node_start) {
> + if (node_last < range_last) {
> + /* range_start= node->start--- node->last--range_last*/
> + vhost_vdpa_add_range_ctx(root, node_start,
> + node_last,
> + new_ref + 1);
> + /*count the next range */
> + } else if (node_last > range_last) {
> + /* range_start= node->start --- last -- node->last*/
> + vhost_vdpa_add_range_ctx(root, node_start,
> + range_last,
> + new_ref + 1);
> + vhost_vdpa_add_range_ctx(root, range_last + 1,
> + node_last, new_ref);
> + } else {
> + vhost_vdpa_add_range_ctx(root, node_start,
> + node_last,
> + new_ref + 1);
> + }
> + } else if (node_start < range_start) {
> + if (range_last < node_last) {
> + /* node->start--- start--- last--- node->last*/
> + /* should the end rang*/
> +
> + vhost_vdpa_add_range_ctx(root, node_start,
> + range_start - 1,
> + new_ref);
> + vhost_vdpa_add_range_ctx(root, range_start,
> + range_last,
> + new_ref + 1);
> + vhost_vdpa_add_range_ctx(root, range_last + 1,
> + node_last, new_ref);
> +
> + } else if (range_last > node_last) {
> + /* node->start--- start--- node->last-- last*/
> +
> + vhost_vdpa_add_range_ctx(root, node_start,
> + range_start - 1,
> + new_ref);
> + vhost_vdpa_add_range_ctx(root, range_start,
> + node_last,
> + new_ref + 1);
> + } else {
> + /* node->start--- start--- node->last= last*/
> + vhost_vdpa_add_range_ctx(root, node_start,
> + range_start - 1,
> + new_ref);
> + vhost_vdpa_add_range_ctx(root, range_start,
> + node_last,
> + new_ref + 1);
> + /* should the end rang*/
> + }
> + } else {
> + if (node_last < range_last) {
> + /* range_start --- node->start --- node->last ----last */
> +
> + vhost_vdpa_add_range_ctx(root, range_start,
> + node_start - 1, 1);
> + vhost_vdpa_add_range_ctx(root, node_start,
> + node_last,
> + new_ref + 1);
> + size += ((node_start - 1) - range_start) + 1;
> + } else if (node_last > range_last) {
> + /* range_start--- node->start --- last -- node->last */
> + vhost_vdpa_add_range_ctx(root, range_start,
> + node_start - 1, 1);
> + vhost_vdpa_add_range_ctx(root, node_start,
> + range_last,
> + new_ref + 1);
> + vhost_vdpa_add_range_ctx(root, range_last + 1,
> + node_last, new_ref);
> + size += ((node_start - 1) - range_start) + 1;
> +
> + /* should the end rang*/
> + } else {
> + /* range_start--- node->start --- last = node->last */
> + vhost_vdpa_add_range_ctx(root, range_start,
> + node_start - 1, 1);
> + vhost_vdpa_add_range_ctx(root, node_start,
> + node_last,
> + new_ref + 1);
> + size += ((node_start - 1) - range_start) + 1;
> +
> + /* should the end rang*/
> + }
> + }
> + /* work in next node*/
> + range_start = node_last + 1;
> + if (range_start > range_last)
> + break;
> + }
> +
> + range_size = range_last - range_start;
> +
> + /* last round and still some range*/
> +
> + if ((range_size >= 0) && (range_start >= node_last) &&
> + (node_number == i + 1)) {
> + vhost_vdpa_add_range_ctx(root, range_start, range_last, 1);
> + size = size + (range_last - range_start) + 1;
> + } else if ((range_size == -1) && (node_number == i + 1)) {
> + return size;
> + } else {
> + printk(KERN_WARNING,
> + "%s %d FAIL start %lld last %lld node->start %lld node->last %lld i %d",
> + __func__, __LINE__, range_start, range_last, node_start,
> + node_last, i);
> + }
> +
> + return size;
> +}
> +
> +u64 vhost_vdpa_range_ref_del(struct rb_root_cached *root,
> + struct vdpa_link_node *link_head, int node_number,
> + u64 start, u64 last)
> +{
> + int i = 0;
> + u64 size = 0;
> + int new_ref;
> + u64 node_start;
> + u64 node_last;
> + u64 range_start;
> + u64 range_last;
> + int range_size;
> + struct vdpa_link_node *link_node;
> + struct vdpa_tree_node *vdpa_node = NULL;
> + struct interval_tree_node *node = NULL;
> +
> + if (node_number == 0)
> + return 0;
> +
> + link_node = link_head;
> + range_start = start;
> + range_last = last;
> +
> + for (i = 0; i < node_number; i++) {
> + vdpa_node = link_node->vdpa_node;
> + link_node = link_node->next;
> + node = &vdpa_node->tree_node;
> + new_ref = vdpa_node->ref;
> + node_start = node->start;
> + node_last = node->last;
> +
> + if (range_start == node_start) {
> + if (node_last < range_last) {
> + /* range_start =node->start --- node->last ----last*/

The comment needs some tweaking so that reviewers can understand it easily.
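
For example, something like (my wording):

	/*
	 * The node covers the head of the range:
	 *
	 *   range: [range_start ................ range_last]
	 *   node:  [node_start ..... node_last]
	 *
	 * so drop the ref on [node_start, node_last] (counting it as
	 * released if the ref reaches zero) and continue with
	 * [node_last + 1, range_last] in the next iteration.
	 */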


> + if (new_ref > 1) {
> + vhost_vdpa_add_range_ctx(root,
> + node_start,
> + node_last,
> + new_ref - 1);
> + /*count the next range */
> + } else {
> + /* if the ref =0, do not need add it back, count size*/
> + size += (node_last - node_start) + 1;
> + }
> +
> + } else if (node_last > range_last) {
> + /* range_start= node->start --- last -- node->last*/
> +
> + if (new_ref > 1) {
> + vhost_vdpa_add_range_ctx(root,
> + node_start,
> + range_last,
> + new_ref - 1);
> + } else {
> + size += (range_last - node_start) + 1;
> + }
> + vhost_vdpa_add_range_ctx(root, range_last + 1,
> + node_last, new_ref);
> + } else {
> + /* range_start= node->start --- last = node->last*/
> +
> + if (new_ref > 1) {
> + vhost_vdpa_add_range_ctx(root,
> + node_start,
> + range_last,
> + new_ref - 1);
> + } else {
> + size += (range_last - node_start) + 1;
> + }
> + /* should be the end */
> + }
> + } else if (node_start < range_start) {
> + if (range_last < node_last) {
> + /* node->start--- start--- last--- node->last*/
> + /* should the end rang*/
> + vhost_vdpa_add_range_ctx(root, node_start,
> + range_start - 1,
> + new_ref);
> + if (new_ref > 1) {
> + vhost_vdpa_add_range_ctx(root,
> + range_start,
> + range_last,
> + new_ref - 1);
> + } else {
> + size += (range_last - range_start) + 1;
> + }
> + vhost_vdpa_add_range_ctx(root, range_last + 1,
> + node_last, new_ref);
> +
> + } else if (range_last > node_last) {
> + /* node->start--- start--- node->last--- last*/
> +
> + vhost_vdpa_add_range_ctx(root, node_start,
> + range_start - 1,
> + new_ref);
> + if (new_ref > 1) {
> + vhost_vdpa_add_range_ctx(root,
> + range_start,
> + node_last,
> + new_ref - 1);
> + } else {
> + size += (node_last - range_start) + 1;
> + }
> + } else {
> + /* node->start--- start--- node->last= last*/
> + vhost_vdpa_add_range_ctx(root, node_start,
> + range_start - 1,
> + new_ref);
> + if (new_ref > 1) {
> + vhost_vdpa_add_range_ctx(root,
> + range_start,
> + range_last,
> + new_ref - 1);
> + } else {
> + size += (range_last - range_start) + 1;
> + }
> + /* should be the end */
> + }
> + } else {
> + /* some range not in the node, error*/
> + printk(KERN_WARNING,
> + "%s %d FAIL start %lld last %lld node->start %lld node->last %lld new_ref %d",
> + __func__, __LINE__, range_start, range_last,
> + node_start, node_last, new_ref);
> + }
> +
> + range_start = node_last + 1;
> + if (range_start > range_last)
> + break;
> + }
> +
> + range_size = range_last - range_start;
> +
> + /* last round and still some range*/
> +
> + if ((range_size > 0) && (node_number == i + 1)) {
> + printk(KERN_WARNING,
> + "%s %d FAIL start %lld last %lld node->start %lld node->last %lld range_size %d",
> + __func__, __LINE__, range_start, range_last, node_start,
> + node_last, range_size);
> + }
> + return size;
> +}
> +
> +struct vdpa_link_node *vhost_vdpa_merge_list(struct vdpa_link_node *list1,
> + struct vdpa_link_node *list2)
> +{
> + struct vdpa_link_node dummy_head;
> + struct vdpa_link_node *ptr = &dummy_head;
> +
> + while (list1 && list2) {
> + if (list1->node_start < list2->node_start) {
> + ptr->next = list1;
> + list1 = list1->next;
> + } else {
> + ptr->next = list2;
> + list2 = list2->next;
> + }
> + ptr = ptr->next;
> + }
> + if (list1)
> + ptr->next = list1;
> + else
> + ptr->next = list2;
> +
> + return dummy_head.next;
> +}
> +
> +struct vdpa_link_node *vhost_vdpa_get_mid(struct vdpa_link_node *head)
> +{
> + struct vdpa_link_node *mid_prev = NULL;
> + struct vdpa_link_node *mid;
> +
> + while (head && head->next) {
> + mid_prev = (mid_prev == NULL) ? head : mid_prev->next;
> + head = head->next->next;
> + }
> + mid = mid_prev->next;
> + mid_prev->next = NULL;
> + return mid;
> +}
> +struct vdpa_link_node *vhost_vdpa_sort_list(struct vdpa_link_node *head)
> +{
> + struct vdpa_link_node *mid;
> + struct vdpa_link_node *left;
> + struct vdpa_link_node *right;
> +
> + if (!head || !head->next)
> + return head;
> +
> + mid = vhost_vdpa_get_mid(head);
> + left = vhost_vdpa_sort_list(head);
> + right = vhost_vdpa_sort_list(mid);
> + return vhost_vdpa_merge_list(left, right);
> +}
> +
> +u64 vhost_vdpa_range_ops(struct rb_root_cached *root, u64 start, u64 last,
> + bool ops)
> +{
> + struct interval_tree_node *node = NULL;
> + struct vdpa_tree_node *vdpa_node;
> + int node_number = 0;
> + int i = 0;
> + u64 size = 0;
> + struct vdpa_link_node dummy_head = { 0 };
> + struct vdpa_link_node *link_node;
> + struct vdpa_link_node *link_head_tmp;
> + struct vdpa_link_node *pre_link_node;
> +
> + pre_link_node = &dummy_head;
> + /*search the rang overlaped, and del from the tree*/
> + for (node = interval_tree_iter_first(root, start, last); node;
> + node = interval_tree_iter_next(node, start, last)) {
> + link_node = kmalloc(sizeof(struct vdpa_link_node), GFP_ATOMIC);
> + if (link_node == NULL) {
> + goto out;
> + }
> + vdpa_node =
> + container_of(node, struct vdpa_tree_node, tree_node);
> + link_node->vdpa_node = vdpa_node;
> + link_node->node_start = node->start;
> + link_node->node_last = node->last;
> +
> + pre_link_node->next = link_node;
> + pre_link_node = link_node;
> + pre_link_node->next = NULL;
> +
> + node_number++;
> +
> + interval_tree_remove(node, root);
> + }
> + /* sorting the node */

The code explains itself; the comment should instead explain why we need the sort.
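
E.g. something along the lines of (my wording):

	/*
	 * vhost_vdpa_range_ref_add()/_del() below consume the
	 * overlapped nodes strictly from low to high addresses, so
	 * make sure the list is sorted by node_start first.
	 */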

> + link_head_tmp = vhost_vdpa_sort_list(dummy_head.next);
> +
> + /* these link node are have overlap with range, check the ref and add back to the tree*/
> + if (ops == true) {
> + size = vhost_vdpa_range_ref_add(root, link_head_tmp,
> + node_number, start, last);
> + } else {
> + size = vhost_vdpa_range_ref_del(root, link_head_tmp,
> + node_number, start, last);
> + }
> +out:
> + pre_link_node = link_head_tmp;
> +
> + for (i = 0; i < node_number; i++) {
> + vdpa_node = pre_link_node->vdpa_node;
> + link_node = pre_link_node->next;
> + kfree(vdpa_node);
> + kfree(pre_link_node);
> + pre_link_node = link_node;
> + }
> + return size;
> +}
> +u64 vhost_vdpa_search_range_add(struct rb_root_cached *root, u64 start,
> + u64 last)
> +{
> + u64 size;
> +
> + size = vhost_vdpa_range_ops(root, start, last, true);
> +
> + return size;
> +}
> +
> +u64 vhost_vdpa_search_range_del(struct rb_root_cached *root, u64 start,
> + u64 last)
> +{
> + u64 size;
> +
> + size = vhost_vdpa_range_ops(root, start, last, false);
> +
> + return size;
> +}
> +
> static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
> {
> struct vhost_dev *dev = &v->vdev;
> struct vhost_iotlb *iotlb = dev->iotlb;
> struct vhost_iotlb_map *map;
> struct page *page;
> + u64 size;
> unsigned long pfn, pinned;
>
> while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
> @@ -523,7 +993,11 @@ static void vhost_vdpa_pa_unmap(struct vhost_vdpa *v, u64 start, u64 last)
> set_page_dirty_lock(page);
> unpin_user_page(page);
> }
> - atomic64_sub(PFN_DOWN(map->size), &dev->mm->pinned_vm);
> +
> + size = vhost_vdpa_search_range_del(dev->vdpa_mem_tree,
> + map->start,
> + map->start + map->size - 1);
> + atomic64_sub(PFN_DOWN(size), &dev->mm->pinned_vm);
> vhost_iotlb_map_free(iotlb, map);
> }
> }
> @@ -591,6 +1065,7 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova,
> struct vdpa_device *vdpa = v->vdpa;
> const struct vdpa_config_ops *ops = vdpa->config;
> int r = 0;
> + u64 size_count;
>
> r = vhost_iotlb_add_range_ctx(dev->iotlb, iova, iova + size - 1,
> pa, perm, opaque);
> @@ -610,9 +1085,11 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova,
> vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
> return r;
> }
> -
> - if (!vdpa->use_va)
> - atomic64_add(PFN_DOWN(size), &dev->mm->pinned_vm);
> + if (!vdpa->use_va) {
> + size_count = vhost_vdpa_search_range_add(dev->vdpa_mem_tree,
> + iova, iova + size - 1);
> + atomic64_add(PFN_DOWN(size_count), &dev->mm->pinned_vm);
> + }
>
> return 0;
> }
> @@ -946,6 +1423,58 @@ static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
> }
> }
>
> +struct root_for_vdpa_node {
> + struct hlist_node hlist;
> + struct rb_root_cached vdpa_mem_tree;
> + pid_t pid_using;
> +};
> +static DECLARE_HASHTABLE(root_for_vdpa_node_list, 8);
> +int status_for_vdpa_tree = 0;
> +
> +struct root_for_vdpa_node *vhost_vdpa_get_mem_tree(struct task_struct *task)
> +{

Any reason we get the tree via task_struct instead of mm_struct?
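
FWIW, an untested sketch of what keying by mm_struct could look like
(the field and helper names are adapted from the patch):

struct root_for_vdpa_node {
	struct hlist_node hlist;
	struct rb_root_cached vdpa_mem_tree;
	struct mm_struct *mm;	/* owner mm, used as the hash key */
};

static struct root_for_vdpa_node *vhost_vdpa_get_mem_tree(struct mm_struct *mm)
{
	struct root_for_vdpa_node *n;

	hash_for_each_possible(root_for_vdpa_node_list, n, hlist,
			       (unsigned long)mm) {
		if (n->mm == mm)
			return n;
	}

	n = kzalloc(sizeof(*n), GFP_KERNEL);
	if (!n)
		return NULL;

	n->mm = mm;
	n->vdpa_mem_tree = RB_ROOT_CACHED;
	hash_add(root_for_vdpa_node_list, &n->hlist, (unsigned long)n->mm);

	return n;
}

This also avoids the problem that pids are reused and that threads
sharing an mm get different pids, while the pinned pages (and
pinned_vm) really belong to the mm.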

> + struct root_for_vdpa_node *root_get_tmp = NULL;
> + pid_t pid_using = task_pid_nr(task);
> +
> + /* No hased table, init one */
> + if (status_for_vdpa_tree == 0) {
> + hash_init(root_for_vdpa_node_list);
> + status_for_vdpa_tree = 1;
> + }
> +
> + hash_for_each_possible (root_for_vdpa_node_list, root_get_tmp, hlist,
> + pid_using) {
> + if (root_get_tmp->pid_using == pid_using)
> + return root_get_tmp;
> + }
> +
> + root_get_tmp = kmalloc(sizeof(*root_get_tmp), GFP_KERNEL);
> + root_get_tmp->pid_using = pid_using;
> +
> + root_get_tmp->vdpa_mem_tree = RB_ROOT_CACHED;
> +
> + hash_add(root_for_vdpa_node_list, &root_get_tmp->hlist,
> + root_get_tmp->pid_using);
> +
> + return root_get_tmp;
> +}
> +
> +void vhost_vdpa_relase_mem_tree(struct task_struct *task)
> +{
> + struct root_for_vdpa_node *root_get_tmp = NULL;
> + pid_t pid_using = task_pid_nr(task);
> +
> + /* No hased table, init one */
> + hash_for_each_possible (root_for_vdpa_node_list, root_get_tmp, hlist,
> + pid_using) {
> + if (root_get_tmp->pid_using == pid_using)
> + kfree(root_get_tmp);
> + return;
> + }
> +
> + return;
> +}
> +
> static int vhost_vdpa_open(struct inode *inode, struct file *filep)
> {
> struct vhost_vdpa *v;
> @@ -991,10 +1520,13 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep)
> vhost_vdpa_set_iova_range(v);
>
> filep->private_data = v;
> + struct root_for_vdpa_node *tmp = vhost_vdpa_get_mem_tree(current);

This looks wrong: the mapping contains VAs, so it is bound to the owner.
This means

1) Need to get and put the accounting tree via mm_struct in set_owner
2) Need to release all mappings during reset owner
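
For 1), one possible shape (untested, assuming a
vhost_vdpa_get_mem_tree() keyed by mm as sketched above):

	/* in vhost_vdpa_unlocked_ioctl(), after vhost_dev_ioctl() has
	 * handled VHOST_SET_OWNER and populated dev->mm:
	 */
	if (cmd == VHOST_SET_OWNER && !r) {
		struct root_for_vdpa_node *n =
			vhost_vdpa_get_mem_tree(v->vdev.mm);

		if (n)
			v->vdev.vdpa_mem_tree = &n->vdpa_mem_tree;
		else
			r = -ENOMEM;
	}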

Thanks

> + dev->vdpa_mem_tree = &tmp->vdpa_mem_tree;
>
> return 0;
>
> err_init_iotlb:
> + vhost_vdpa_relase_mem_tree(current);
> vhost_dev_cleanup(&v->vdev);
> kfree(vqs);
> err:
> @@ -1016,6 +1548,8 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep)
> struct vhost_dev *d = &v->vdev;
>
> mutex_lock(&d->mutex);
> + vhost_vdpa_relase_mem_tree(current);
> +
> filep->private_data = NULL;
> vhost_vdpa_clean_irq(v);
> vhost_vdpa_reset(v);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 638bb640d6b4..d1c662eb4f26 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -161,6 +161,7 @@ struct vhost_dev {
> int byte_weight;
> u64 kcov_handle;
> bool use_worker;
> + struct rb_root_cached *vdpa_mem_tree;
> int (*msg_handler)(struct vhost_dev *dev,
> struct vhost_iotlb_msg *msg);
> };
> --
> 2.34.3
>