When a grant entry is still in use by the remote domain, Linux must put
it on a deferred list. Normally, this list is very short, because
the PV network and block protocols expect the backend to unmap the grant
first. However, Qubes OS's GUI protocol is subject to the constraints
of the X Window System, and as such winds up with the frontend unmapping
the window first. As a result, the list can grow very large, resulting
in a massive memory leak and eventual VM freeze.
To partially solve this problem, make the number of entries that the VM
will attempt to free at each iteration tunable. The default is still
10, but it can be overridden at compile-time (via Kconfig), boot-time
(via a kernel command-line option), or runtime (via sysfs).
This is Cc: stable because (when combined with appropriate userspace
changes) it fixes a severe performance and stability problem for Qubes
OS users.
Cc: [email protected]
Signed-off-by: Demi Marie Obenour <[email protected]>
---
drivers/xen/grant-table.c | 40 ++++++++++++++++++++++++++++-----------
1 file changed, 29 insertions(+), 11 deletions(-)
Changes since v2:
- use atomic_inc_return(x) and atomic_dec_return(x) instead of
atomic_add_return(1, x) and atomic_sub_return(1, x) respectively.
- move module_param macro closer to the definition of
free_per_iteration.
- add blank line between declarations and statements.
Changes since v1:
- drop setting default via Kconfig
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index e1ec725c2819d4d5dede063eb00d86a6d52944c0..f13c3b76ad1eb7110e2a2981e9fa4e504174e431 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -498,14 +498,21 @@ static LIST_HEAD(deferred_list);
static void gnttab_handle_deferred(struct timer_list *);
static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
+static atomic64_t deferred_count;
+static atomic64_t leaked_count;
+static unsigned int free_per_iteration = 10;
+module_param(free_per_iteration, uint, 0600);
+
static void gnttab_handle_deferred(struct timer_list *unused)
{
- unsigned int nr = 10;
+ unsigned int nr = READ_ONCE(free_per_iteration);
+ const bool ignore_limit = nr == 0;
struct deferred_entry *first = NULL;
unsigned long flags;
+ size_t freed = 0;
spin_lock_irqsave(&gnttab_list_lock, flags);
- while (nr--) {
+ while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
struct deferred_entry *entry
= list_first_entry(&deferred_list,
struct deferred_entry, list);
@@ -515,10 +522,14 @@ static void gnttab_handle_deferred(struct timer_list *unused)
list_del(&entry->list);
spin_unlock_irqrestore(&gnttab_list_lock, flags);
if (_gnttab_end_foreign_access_ref(entry->ref)) {
+ uint64_t ret = atomic64_dec_return(&deferred_count);
+
put_free_entry(entry->ref);
- pr_debug("freeing g.e. %#x (pfn %#lx)\n",
- entry->ref, page_to_pfn(entry->page));
+ pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
+ entry->ref, page_to_pfn(entry->page),
+ (unsigned long long)ret);
put_page(entry->page);
+ freed++;
kfree(entry);
entry = NULL;
} else {
@@ -530,21 +541,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
spin_lock_irqsave(&gnttab_list_lock, flags);
if (entry)
list_add_tail(&entry->list, &deferred_list);
- else if (list_empty(&deferred_list))
- break;
}
- if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+ if (list_empty(&deferred_list))
+ WARN_ON(atomic64_read(&deferred_count));
+ else if (!timer_pending(&deferred_timer)) {
deferred_timer.expires = jiffies + HZ;
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
+ pr_debug("Freed %zu references", freed);
}
static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
{
struct deferred_entry *entry;
gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
- const char *what = KERN_WARNING "leaking";
+ uint64_t leaked, deferred;
entry = kmalloc(sizeof(*entry), gfp);
if (!page) {
@@ -567,10 +579,16 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
- what = KERN_DEBUG "deferring";
+ deferred = atomic64_inc_return(&deferred_count);
+ leaked = atomic64_read(&leaked_count);
+ pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
+ } else {
+ deferred = atomic64_read(&deferred_count);
+ leaked = atomic64_inc_return(&leaked_count);
+ pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
}
- printk("%s g.e. %#x (pfn %#lx)\n",
- what, ref, page ? page_to_pfn(page) : -1);
}
int gnttab_try_end_foreign_access(grant_ref_t ref)
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab
On 27.06.23 19:22, Demi Marie Obenour wrote:
> When a grant entry is still in use by the remote domain, Linux must put
> it on a deferred list. Normally, this list is very short, because
> the PV network and block protocols expect the backend to unmap the grant
> first. However, Qubes OS's GUI protocol is subject to the constraints
> of the X Window System, and as such winds up with the frontend unmapping
> the window first. As a result, the list can grow very large, resulting
> in a massive memory leak and eventual VM freeze.
>
> To partially solve this problem, make the number of entries that the VM
> will attempt to free at each iteration tunable. The default is still
> 10, but it can be overridden at compile-time (via Kconfig), boot-time
> (via a kernel command-line option), or runtime (via sysfs).
You are still mentioning Kconfig.
For the sysfs entry you should add something under Documentation/ABI. Sorry
I didn't spot this omission previously.
Juergen
On 27.06.2023 19:22, Demi Marie Obenour wrote:
> When a grant entry is still in use by the remote domain, Linux must put
> it on a deferred list. Normally, this list is very short, because
> the PV network and block protocols expect the backend to unmap the grant
> first. However, Qubes OS's GUI protocol is subject to the constraints
> of the X Window System, and as such winds up with the frontend unmapping
> the window first. As a result, the list can grow very large, resulting
> in a massive memory leak and eventual VM freeze.
>
> To partially solve this problem, make the number of entries that the VM
> will attempt to free at each iteration tunable. The default is still
> 10, but it can be overridden at compile-time (via Kconfig), boot-time
> (via a kernel command-line option), or runtime (via sysfs).
>
> This is Cc: stable because (when combined with appropriate userspace
> changes) it fixes a severe performance and stability problem for Qubes
> OS users.
>
> Cc: [email protected]
> Signed-off-by: Demi Marie Obenour <[email protected]>
Why am I _still_ - after two earlier private questions to the same
effect - on the To: list of this submission? Please can you respect
other people's time and interests and properly follow patch
submission rules, applying common sense when (like has been the
case in the past for Linux) those rules result in overly broad sets
of people.
Jan
On Tue, Jul 04, 2023 at 02:07:47PM +0200, Jan Beulich wrote:
> On 27.06.2023 19:22, Demi Marie Obenour wrote:
> > When a grant entry is still in use by the remote domain, Linux must put
> > it on a deferred list. Normally, this list is very short, because
> > the PV network and block protocols expect the backend to unmap the grant
> > first. However, Qubes OS's GUI protocol is subject to the constraints
> > of the X Window System, and as such winds up with the frontend unmapping
> > the window first. As a result, the list can grow very large, resulting
> > in a massive memory leak and eventual VM freeze.
> >
> > To partially solve this problem, make the number of entries that the VM
> > will attempt to free at each iteration tunable. The default is still
> > 10, but it can be overridden at compile-time (via Kconfig), boot-time
> > (via a kernel command-line option), or runtime (via sysfs).
> >
> > This is Cc: stable because (when combined with appropriate userspace
> > changes) it fixes a severe performance and stability problem for Qubes
> > OS users.
> >
> > Cc: [email protected]
> > Signed-off-by: Demi Marie Obenour <[email protected]>
>
> Why am I _still_ - after two earlier private questions to the same
> effect - on the To: list of this submission? Please can you respect
> other people's time and interests and properly follow patch
> submission rules, applying common sense when (like has been the
> case in the past for Linux) those rules result in overly broad sets
> of people.
>
> Jan
Sorry, I somehow had that in an old version of the patch, and was
editing the patch by hand rather than generating it with git
format-patch.
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab
When a grant entry is still in use by the remote domain, Linux must put
it on a deferred list. Normally, this list is very short, because
the PV network and block protocols expect the backend to unmap the grant
first. However, Qubes OS's GUI protocol is subject to the constraints
of the X Window System, and as such winds up with the frontend unmapping
the window first. As a result, the list can grow very large, resulting
in a massive memory leak and eventual VM freeze.
To partially solve this problem, make the number of entries that the VM
will attempt to free at each iteration tunable. The default is still
10, but it can be overridden via a module parameter.
This is Cc: stable because (when combined with appropriate userspace
changes) it fixes a severe performance and stability problem for Qubes
OS users.
Cc: [email protected]
Signed-off-by: Demi Marie Obenour <[email protected]>
---
Documentation/ABI/testing/sysfs-module | 11 +++++++
drivers/xen/grant-table.c | 40 +++++++++++++++++++-------
2 files changed, 40 insertions(+), 11 deletions(-)
Changes since v3:
- do not mention Kconfig in the commit message.
- add entry to Documentation/ABI for the new sysfs entry.
Changes since v2:
- use atomic_inc_return(x) and atomic_dec_return(x) instead of
atomic_add_return(1, x) and atomic_sub_return(1, x) respectively.
- move module_param macro closer to the definition of
free_per_iteration.
- add blank line between declarations and statements.
Changes since v1:
- drop setting default via Kconfig
diff --git a/Documentation/ABI/testing/sysfs-module b/Documentation/ABI/testing/sysfs-module
index 08886367d0470e8d8922703a7d5174077801c2a8..62addab47d0c5908d26ec2f5d07db5ce21833566 100644
--- a/Documentation/ABI/testing/sysfs-module
+++ b/Documentation/ABI/testing/sysfs-module
@@ -60,3 +60,14 @@ Description: Module taint flags:
C staging driver module
E unsigned module
== =====================
+
+What: /sys/module/grant_table/parameters/free_per_iteration
+Date: July 2023
+KernelVersion: 6.5 but backported to all supported stable branches
+Contact: Xen developer discussion <[email protected]>
+Description: Read and write number of grant entries to attempt to free per iteration.
+
+ Note: Future versions of Xen and Linux may provide a better
+ interface for controlling the rate of deferred grant reclaim
+ or may not need it at all.
+Users: Qubes OS (https://www.qubes-os.org)
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index e1ec725c2819d4d5dede063eb00d86a6d52944c0..f13c3b76ad1eb7110e2a2981e9fa4e504174e431 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -498,14 +498,21 @@ static LIST_HEAD(deferred_list);
static void gnttab_handle_deferred(struct timer_list *);
static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
+static atomic64_t deferred_count;
+static atomic64_t leaked_count;
+static unsigned int free_per_iteration = 10;
+module_param(free_per_iteration, uint, 0600);
+
static void gnttab_handle_deferred(struct timer_list *unused)
{
- unsigned int nr = 10;
+ unsigned int nr = READ_ONCE(free_per_iteration);
+ const bool ignore_limit = nr == 0;
struct deferred_entry *first = NULL;
unsigned long flags;
+ size_t freed = 0;
spin_lock_irqsave(&gnttab_list_lock, flags);
- while (nr--) {
+ while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
struct deferred_entry *entry
= list_first_entry(&deferred_list,
struct deferred_entry, list);
@@ -515,10 +522,14 @@ static void gnttab_handle_deferred(struct timer_list *unused)
list_del(&entry->list);
spin_unlock_irqrestore(&gnttab_list_lock, flags);
if (_gnttab_end_foreign_access_ref(entry->ref)) {
+ uint64_t ret = atomic64_dec_return(&deferred_count);
+
put_free_entry(entry->ref);
- pr_debug("freeing g.e. %#x (pfn %#lx)\n",
- entry->ref, page_to_pfn(entry->page));
+ pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
+ entry->ref, page_to_pfn(entry->page),
+ (unsigned long long)ret);
put_page(entry->page);
+ freed++;
kfree(entry);
entry = NULL;
} else {
@@ -530,21 +541,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
spin_lock_irqsave(&gnttab_list_lock, flags);
if (entry)
list_add_tail(&entry->list, &deferred_list);
- else if (list_empty(&deferred_list))
- break;
}
- if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+ if (list_empty(&deferred_list))
+ WARN_ON(atomic64_read(&deferred_count));
+ else if (!timer_pending(&deferred_timer)) {
deferred_timer.expires = jiffies + HZ;
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
+ pr_debug("Freed %zu references", freed);
}
static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
{
struct deferred_entry *entry;
gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
- const char *what = KERN_WARNING "leaking";
+ uint64_t leaked, deferred;
entry = kmalloc(sizeof(*entry), gfp);
if (!page) {
@@ -567,10 +579,16 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
add_timer(&deferred_timer);
}
spin_unlock_irqrestore(&gnttab_list_lock, flags);
- what = KERN_DEBUG "deferring";
+ deferred = atomic64_inc_return(&deferred_count);
+ leaked = atomic64_read(&leaked_count);
+ pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
+ } else {
+ deferred = atomic64_read(&deferred_count);
+ leaked = atomic64_inc_return(&leaked_count);
+ pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
+ ref, page ? page_to_pfn(page) : -1, deferred, leaked);
}
- printk("%s g.e. %#x (pfn %#lx)\n",
- what, ref, page ? page_to_pfn(page) : -1);
}
int gnttab_try_end_foreign_access(grant_ref_t ref)
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab
On 26.07.23 18:52, Demi Marie Obenour wrote:
> When a grant entry is still in use by the remote domain, Linux must put
> it on a deferred list. Normally, this list is very short, because
> the PV network and block protocols expect the backend to unmap the grant
> first. However, Qubes OS's GUI protocol is subject to the constraints
> of the X Window System, and as such winds up with the frontend unmapping
> the window first. As a result, the list can grow very large, resulting
> in a massive memory leak and eventual VM freeze.
>
> To partially solve this problem, make the number of entries that the VM
> will attempt to free at each iteration tunable. The default is still
> 10, but it can be overridden via a module parameter.
>
> This is Cc: stable because (when combined with appropriate userspace
> changes) it fixes a severe performance and stability problem for Qubes
> OS users.
>
> Cc: [email protected]
> Signed-off-by: Demi Marie Obenour <[email protected]>
Reviewed-by: Juergen Gross <[email protected]>
Juergen