Add NR_SECONDARY_PAGETABLE memory stat and use it to account KVM mmu
usage as the first type of accounted secondary page tables. This stat
can later be extended to account for other types of secondary page
tables (e.g. iommu page tables).
The rationale for why this is useful, and a link to the extended
discussion, can be found in the first patch.
---
Changes in V6:
- Rebased on top of kvm/queue and fixed conflicts.
- Fixed docs spaces and tabs (Sean).
- More narrative commit logs (Sean and Oliver).
- Updated kvm_account_pgtable_pages() documentation to describe the
rules of using it more clearly (Sean).
- Collected Acks and Reviewed-by's by Shakeel and Oliver (Thanks!)
Changes in V5:
- Updated cover letter to explain more the rationale behind the change
(Thanks to contributions by Sean Christopherson).
- Removed extraneous + in arm64 patch (Oliver Upton, Marc Zyngier).
- Shortened secondary_pagetables to sec_pagetables (Shakeel Butt).
- Removed dependency on other patchsets (applies to queue branch).
Changes in V4:
- Changed accounting hooks in arm64 to only account s2 page tables and
refactored them to a much cleaner form, based on recommendations from
Oliver Upton and Marc Zyngier.
- Dropped patches for mips and riscv. I am not interested in those archs
anyway and don't have the resources to test them. I posted them for
completeness but it doesn't seem like anyone was interested.
Changes in V3:
- Added NR_SECONDARY_PAGETABLE instead of piggybacking on NR_PAGETABLE
stats.
Changes in V2:
- Added accounting stats for archs other than x86.
- Changed locations in the code where x86 KVM page table stats were
accounted based on suggestions from Sean Christopherson.
---
Yosry Ahmed (4):
mm: add NR_SECONDARY_PAGETABLE to count secondary page table uses.
KVM: mmu: add a helper to account memory used by KVM MMU.
KVM: x86/mmu: count KVM mmu usage in secondary pagetable stats.
KVM: arm64/mmu: count KVM s2 mmu usage in secondary pagetable stats
Documentation/admin-guide/cgroup-v2.rst | 5 ++++
Documentation/filesystems/proc.rst | 4 +++
arch/arm64/kvm/mmu.c | 36 ++++++++++++++++++++++---
arch/x86/kvm/mmu/mmu.c | 16 +++++++++--
arch/x86/kvm/mmu/tdp_mmu.c | 12 +++++++++
drivers/base/node.c | 2 ++
fs/proc/meminfo.c | 2 ++
include/linux/kvm_host.h | 10 +++++++
include/linux/mmzone.h | 1 +
mm/memcontrol.c | 1 +
mm/page_alloc.c | 6 ++++-
mm/vmstat.c | 1 +
12 files changed, 89 insertions(+), 7 deletions(-)
--
2.37.0.rc0.161.g10f37bed90-goog
Add a helper to account pages used by KVM for page tables in the
secondary pagetable memory stats. This function will be used by
subsequent patches in different archs.
Signed-off-by: Yosry Ahmed <[email protected]>
---
include/linux/kvm_host.h | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3b40f8d68fbb1..032821d77e920 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2241,6 +2241,16 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
}
#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
+/*
+ * If more than one page is being (un)accounted, @virt must be the address of
+ * the first page of a block of pages that were allocated together (i.e.
+ * accounted together).
+ */
+static inline void kvm_account_pgtable_pages(void *virt, int nr)
+{
+ mod_lruvec_page_state(virt_to_page(virt), NR_SECONDARY_PAGETABLE, nr);
+}
+
/*
* This defines how many reserved entries we want to keep before we
* kick the vcpu to the userspace to avoid dirty ring full. This
--
2.37.0.rc0.161.g10f37bed90-goog
Count the pages used by KVM mmu on x86 in memory stats under secondary
pagetable stats (e.g. "SecPageTables" in /proc/meminfo) to give better
visibility into the memory consumption of KVM mmu in a similar way to
how normal user page tables are accounted.
Signed-off-by: Yosry Ahmed <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 16 ++++++++++++++--
arch/x86/kvm/mmu/tdp_mmu.c | 12 ++++++++++++
2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f7fa4c31b7c52..b1645202658ab 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1664,6 +1664,18 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
+static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_mod_used_mmu_pages(kvm, +1);
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+}
+
+static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_mod_used_mmu_pages(kvm, -1);
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+}
+
static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
{
MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
@@ -2123,7 +2135,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
*/
sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
list_add(&sp->link, &kvm->arch.active_mmu_pages);
- kvm_mod_used_mmu_pages(kvm, +1);
+ kvm_account_mmu_page(kvm, sp);
sp->gfn = gfn;
sp->role = role;
@@ -2450,7 +2462,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
list_add(&sp->link, invalid_list);
else
list_move(&sp->link, invalid_list);
- kvm_mod_used_mmu_pages(kvm, -1);
+ kvm_unaccount_mmu_page(kvm, sp);
} else {
/*
* Remove the active root from the active page list, the root
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index f3a430d64975c..3c5cb6054819a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -372,6 +372,16 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
}
}
+static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+}
+
+static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+}
+
/**
* tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
*
@@ -384,6 +394,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
bool shared)
{
+ tdp_unaccount_mmu_page(kvm, sp);
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
else
@@ -1136,6 +1147,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
if (account_nx)
account_huge_nx_page(kvm, sp);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ tdp_account_mmu_page(kvm, sp);
return 0;
}
--
2.37.0.rc0.161.g10f37bed90-goog
We keep track of several kernel memory stats (total kernel memory, page
tables, stack, vmalloc, etc) on multiple levels (global, per-node,
per-memcg, etc). These stats give users insight into how much memory
is used by the kernel and for what purposes.
Currently, memory used by the KVM MMU is not accounted in any of those
kernel memory stats. This patch series accounts the memory pages
used by KVM for page tables in those stats in a new
NR_SECONDARY_PAGETABLE stat. This stat can be later extended to account
for other types of secondary page tables (e.g. iommu page tables).
KVM has a decent number of large allocations that aren't for page
tables, but for most of them, the number/size of those allocations
scales linearly with either the number of vCPUs or the amount of memory
assigned to the VM. KVM's secondary page table allocations do not scale
linearly, especially when nested virtualization is in use.
From a KVM perspective, NR_SECONDARY_PAGETABLE will scale with KVM's
per-VM pages_{4k,2m,1g} stats unless the guest is doing something
bizarre (e.g. accessing only 4kb chunks of 2mb pages so that KVM is
forced to allocate a large number of page tables even though the guest
isn't accessing that much memory). However, someone would need to either
understand how KVM works to make that connection, or know (or be told) to
go look at KVM's stats if they're running VMs to better decipher the stats.
Furthermore, having NR_PAGETABLE side-by-side with NR_SECONDARY_PAGETABLE
is informative. For example, when backing a VM with THP vs. HugeTLB,
NR_SECONDARY_PAGETABLE is roughly the same, but NR_PAGETABLE is an order
of magnitude higher with THP. So having this stat will at the very least
prove to be useful for understanding tradeoffs between VM backing types,
and likely even steer folks towards potential optimizations.
The original discussion with more details about the rationale:
https://lore.kernel.org/all/[email protected]
This stat will be used by subsequent patches to count KVM mmu
memory usage.
Signed-off-by: Yosry Ahmed <[email protected]>
Acked-by: Shakeel Butt <[email protected]>
---
Documentation/admin-guide/cgroup-v2.rst | 5 +++++
Documentation/filesystems/proc.rst | 4 ++++
drivers/base/node.c | 2 ++
fs/proc/meminfo.c | 2 ++
include/linux/mmzone.h | 1 +
mm/memcontrol.c | 1 +
mm/page_alloc.c | 6 +++++-
mm/vmstat.c | 1 +
8 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 176298f2f4def..e06db032bdbf3 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1340,6 +1340,11 @@ PAGE_SIZE multiple when read back.
pagetables
Amount of memory allocated for page tables.
+ sec_pagetables
+ Amount of memory allocated for secondary page tables,
+ this currently includes KVM mmu allocations on x86
+ and arm64.
+
percpu (npn)
Amount of memory used for storing per-cpu kernel
data structures.
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 1bc91fb8c321a..aa2a05b585772 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -977,6 +977,7 @@ Example output. You may not have all of these fields.
SUnreclaim: 142336 kB
KernelStack: 11168 kB
PageTables: 20540 kB
+ SecPageTables: 0 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
@@ -1085,6 +1086,9 @@ KernelStack
Memory consumed by the kernel stacks of all tasks
PageTables
Memory consumed by userspace page tables
+SecPageTables
+ Memory consumed by secondary page tables, this currently
+ includes KVM mmu allocations on x86 and arm64.
NFS_Unstable
Always zero. Previous counted pages which had been written to
the server, but has not been committed to stable storage.
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0ac6376ef7a10..5ad56a0cd5937 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -433,6 +433,7 @@ static ssize_t node_read_meminfo(struct device *dev,
"Node %d ShadowCallStack:%8lu kB\n"
#endif
"Node %d PageTables: %8lu kB\n"
+ "Node %d SecPageTables: %8lu kB\n"
"Node %d NFS_Unstable: %8lu kB\n"
"Node %d Bounce: %8lu kB\n"
"Node %d WritebackTmp: %8lu kB\n"
@@ -459,6 +460,7 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
nid, K(node_page_state(pgdat, NR_PAGETABLE)),
+ nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
nid, 0UL,
nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6e89f0e2fd20f..208efd4fa52c7 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -115,6 +115,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
show_val_kb(m, "PageTables: ",
global_node_page_state(NR_PAGETABLE));
+ show_val_kb(m, "SecPageTables: ",
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aab70355d64f3..13190d298c986 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -216,6 +216,7 @@ enum node_stat_item {
NR_KERNEL_SCS_KB, /* measured in KiB */
#endif
NR_PAGETABLE, /* used for pagetables */
+ NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. kvm shadow pagetables */
#ifdef CONFIG_SWAP
NR_SWAPCACHE,
#endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index abec50f31fe64..d8178395215d4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1394,6 +1394,7 @@ static const struct memory_stat memory_stats[] = {
{ "kernel", MEMCG_KMEM },
{ "kernel_stack", NR_KERNEL_STACK_KB },
{ "pagetables", NR_PAGETABLE },
+ { "sec_pagetables", NR_SECONDARY_PAGETABLE },
{ "percpu", MEMCG_PERCPU_B },
{ "sock", MEMCG_SOCK },
{ "vmalloc", MEMCG_VMALLOC },
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e008a3df0485c..41ba8942ccee6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5950,7 +5950,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu\n"
+ " sec_pagetables:%lu bounce:%lu\n"
" kernel_misc_reclaimable:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
@@ -5967,6 +5968,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_SECONDARY_PAGETABLE),
global_zone_page_state(NR_BOUNCE),
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
@@ -6000,6 +6002,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" shadow_call_stack:%lukB"
#endif
" pagetables:%lukB"
+ " sec_pagetables:%lukB"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -6025,6 +6028,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
+ K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 373d2730fcf21..b937eba681d15 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1240,6 +1240,7 @@ const char * const vmstat_text[] = {
"nr_shadow_call_stack",
#endif
"nr_page_table_pages",
+ "nr_sec_page_table_pages",
#ifdef CONFIG_SWAP
"nr_swapcached",
#endif
--
2.37.0.rc0.161.g10f37bed90-goog
On Tue, 28 Jun 2022 23:09:35 +0100,
Yosry Ahmed <[email protected]> wrote:
>
> We keep track of several kernel memory stats (total kernel memory, page
> tables, stack, vmalloc, etc) on multiple levels (global, per-node,
> per-memcg, etc). These stats give insights to users to how much memory
> is used by the kernel and for what purposes.
>
> Currently, memory used by kvm mmu is not accounted in any of those
> kernel memory stats. This patch series accounts the memory pages
> used by KVM for page tables in those stats in a new
> NR_SECONDARY_PAGETABLE stat. This stat can be later extended to account
> for other types of secondary pages tables (e.g. iommu page tables).
>
> KVM has a decent number of large allocations that aren't for page
> tables, but for most of them, the number/size of those allocations
> scales linearly with either the number of vCPUs or the amount of memory
> assigned to the VM. KVM's secondary page table allocations do not scale
> linearly, especially when nested virtualization is in use.
>
> From a KVM perspective, NR_SECONDARY_PAGETABLE will scale with KVM's
> per-VM pages_{4k,2m,1g} stats unless the guest is doing something
> bizarre (e.g. accessing only 4kb chunks of 2mb pages so that KVM is
> forced to allocate a large number of page tables even though the guest
> isn't accessing that much memory). However, someone would need to either
> understand how KVM works to make that connection, or know (or be told) to
> go look at KVM's stats if they're running VMs to better decipher the stats.
>
> Furthermore, having NR_PAGETABLE side-by-side with NR_SECONDARY_PAGETABLE
> is informative. For example, when backing a VM with THP vs. HugeTLB,
> NR_SECONDARY_PAGETABLE is roughly the same, but NR_PAGETABLE is an order
> of magnitude higher with THP. So having this stat will at the very least
> prove to be useful for understanding tradeoffs between VM backing types,
> and likely even steer folks towards potential optimizations.
>
> The original discussion with more details about the rationale:
> https://lore.kernel.org/all/[email protected]
>
> This stat will be used by subsequent patches to count KVM mmu
> memory usage.
>
> Signed-off-by: Yosry Ahmed <[email protected]>
> Acked-by: Shakeel Butt <[email protected]>
Acked-by: Marc Zyngier <[email protected]>
M.
--
Without deviation from the norm, progress is not possible.
On Tue, 28 Jun 2022 23:09:36 +0100,
Yosry Ahmed <[email protected]> wrote:
>
> Add a helper to account pages used by KVM for page tables in memory
> secondary pagetable stats. This function will be used by subsequent
> patches in different archs.
>
> Signed-off-by: Yosry Ahmed <[email protected]>
Acked-by: Marc Zyngier <[email protected]>
M.
--
Without deviation from the norm, progress is not possible.
If/when this patchset gets merged, would it be through the mm tree or
kvm tree? It is based on the kvm-queue branch so I am guessing it
could be easier to go through kvm but I am not sure what the policy is
here. Andrew or Paolo, do you mind clarifying the policy on such
patchsets? Thanks!
On Tue, Jun 28, 2022 at 3:09 PM Yosry Ahmed <[email protected]> wrote:
>
> Add NR_SECONDARY_PAGETABLE memory stat and use it to account KVM mmu
> usage as the first type of accounted secondary page tables. This stat
> can be later extended to account for other types of secondary pages
> tables (e.g. iommu page tables).
>
> Rationale behind why this is useful and link to extended discussion in
> the first patch.
>
> ---
>
> Changes in V6:
> - Rebased on top of kvm/queue and fixed conflicts.
> - Fixed docs spaces and tabs (Sean).
> - More narrative commit logs (Sean and Oliver).
> - Updated kvm_account_pgtable_pages() documentation to describe the
> rules of using it more clearly (Sean).
> - Collected Acks and Reviewed-by's by Shakeel and Oliver (Thanks!)
>
> Changes in V5:
> - Updated cover letter to explain more the rationale behind the change
> (Thanks to contributions by Sean Christopherson).
> - Removed extraneous + in arm64 patch (Oliver Upton, Marc Zyngier).
> - Shortened secondary_pagetables to sec_pagetables (Shakeel Butt).
> - Removed dependency on other patchsets (applies to queue branch).
>
> Changes in V4:
> - Changed accounting hooks in arm64 to only account s2 page tables and
> refactored them to a much cleaner form, based on recommendations from
> Oliver Upton and Marc Zyngier.
> - Dropped patches for mips and riscv. I am not interested in those archs
> anyway and don't have the resources to test them. I posted them for
> completeness but it doesn't seem like anyone was interested.
>
> Changes in V3:
> - Added NR_SECONDARY_PAGETABLE instead of piggybacking on NR_PAGETABLE
> stats.
>
> Changes in V2:
> - Added accounting stats for other archs than x86.
> - Changed locations in the code where x86 KVM page table stats were
> accounted based on suggestions from Sean Christopherson.
>
> ---
>
> Yosry Ahmed (4):
> mm: add NR_SECONDARY_PAGETABLE to count secondary page table uses.
> KVM: mmu: add a helper to account memory used by KVM MMU.
> KVM: x86/mmu: count KVM mmu usage in secondary pagetable stats.
> KVM: arm64/mmu: count KVM s2 mmu usage in secondary pagetable stats
>
> Documentation/admin-guide/cgroup-v2.rst | 5 ++++
> Documentation/filesystems/proc.rst | 4 +++
> arch/arm64/kvm/mmu.c | 36 ++++++++++++++++++++++---
> arch/x86/kvm/mmu/mmu.c | 16 +++++++++--
> arch/x86/kvm/mmu/tdp_mmu.c | 12 +++++++++
> drivers/base/node.c | 2 ++
> fs/proc/meminfo.c | 2 ++
> include/linux/kvm_host.h | 10 +++++++
> include/linux/mmzone.h | 1 +
> mm/memcontrol.c | 1 +
> mm/page_alloc.c | 6 ++++-
> mm/vmstat.c | 1 +
> 12 files changed, 89 insertions(+), 7 deletions(-)
>
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> We keep track of several kernel memory stats (total kernel memory, page
> tables, stack, vmalloc, etc) on multiple levels (global, per-node,
> per-memcg, etc). These stats give insights to users to how much memory
> is used by the kernel and for what purposes.
>
> Currently, memory used by kvm mmu is not accounted in any of those
Nit, capitalize KVM (mainly to be consistent).
> @@ -1085,6 +1086,9 @@ KernelStack
> Memory consumed by the kernel stacks of all tasks
> PageTables
> Memory consumed by userspace page tables
> +SecPageTables
> + Memory consumed by secondary page tables, this currently
> + currently includes KVM mmu allocations on x86 and arm64.
Nit, this line has a tab instead of eight spaces. Not sure if it actually matters,
there are plenty of tabs elsewhere in the file, but all the entries in this block
use only spaces.
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index aab70355d64f3..13190d298c986 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -216,6 +216,7 @@ enum node_stat_item {
> NR_KERNEL_SCS_KB, /* measured in KiB */
> #endif
> NR_PAGETABLE, /* used for pagetables */
> + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. kvm shadow pagetables */
Nit, s/kvm/KVM, and drop the "shadow", which might be misinterpreted as saying KVM
pagetables are only accounted when KVM is using shadow paging. KVM's usage of "shadow"
is messy, so I totally understand why you included it, but in this case it's unnecessary
and potentially confusing.
And finally, something that's not a nit. Should this be wrapped with CONFIG_KVM
(using IS_ENABLED() because KVM can be built as a module)? That could be removed
if another non-KVM secondary MMU user comes along, but until then, #ifdeffery for
stats that depend on a single feature seems to be the status quo for this code.
> #ifdef CONFIG_SWAP
> NR_SWAPCACHE,
> #endif
On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> Add a helper to account pages used by KVM for page tables in memory
> secondary pagetable stats. This function will be used by subsequent
> patches in different archs.
>
> Signed-off-by: Yosry Ahmed <[email protected]>
> ---
> include/linux/kvm_host.h | 10 ++++++++++
> 1 file changed, 10 insertions(+)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3b40f8d68fbb1..032821d77e920 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2241,6 +2241,16 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
> }
> #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
>
> +/*
> + * If more than one page is being (un)accounted, @virt must be the address of
> + * the first page of a block of pages what were allocated together (i.e
> + * accounted together).
Sorry for the belated thoughts...
If you spin a v7, can you add a note to call out that mod_lruvec_page_state() is
itself thread-safe? Caught my eye because the TDP MMU usage happens while holding
mmu_lock for read.
> + */
> +static inline void kvm_account_pgtable_pages(void *virt, int nr)
> +{
> + mod_lruvec_page_state(virt_to_page(virt), NR_SECONDARY_PAGETABLE, nr);
> +}
> +
> /*
> * This defines how many reserved entries we want to keep before we
> * kick the vcpu to the userspace to avoid dirty ring full. This
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> Count the pages used by KVM mmu on x86 in memory stats under secondary
> pagetable stats (e.g. "SecPageTables" in /proc/meminfo) to give better
> visibility into the memory consumption of KVM mmu in a similar way to
> how normal user page tables are accounted.
>
> Signed-off-by: Yosry Ahmed <[email protected]>
> ---
Reviewed-by: Sean Christopherson <[email protected]>
Thanks for taking another look at this!
On Thu, Jul 7, 2022 at 1:59 PM Sean Christopherson <[email protected]> wrote:
>
> On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> > We keep track of several kernel memory stats (total kernel memory, page
> > tables, stack, vmalloc, etc) on multiple levels (global, per-node,
> > per-memcg, etc). These stats give insights to users to how much memory
> > is used by the kernel and for what purposes.
> >
> > Currently, memory used by kvm mmu is not accounted in any of those
>
> Nit, capitalize KVM (mainly to be consistent).
>
> > @@ -1085,6 +1086,9 @@ KernelStack
> > Memory consumed by the kernel stacks of all tasks
> > PageTables
> > Memory consumed by userspace page tables
> > +SecPageTables
> > + Memory consumed by secondary page tables, this currently
> > + currently includes KVM mmu allocations on x86 and arm64.
>
> Nit, this line has a tab instead of eight spaces. Not sure if it actually matters,
> there are plenty of tabs elsewhere in the file, but all the entries in this block
> use only spaces.
>
Will fix it.
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index aab70355d64f3..13190d298c986 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -216,6 +216,7 @@ enum node_stat_item {
> > NR_KERNEL_SCS_KB, /* measured in KiB */
> > #endif
> > NR_PAGETABLE, /* used for pagetables */
> > + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. kvm shadow pagetables */
>
> Nit, s/kvm/KVM, and drop the "shadow", which might be misinterpreted as saying KVM
> pagetables are only accounted when KVM is using shadow paging. KVM's usage of "shadow"
> is messy, so I totally understand why you included it, but in this case it's unnecessary
> and potentially confusing.
>
> And finally, something that's not a nit. Should this be wrapped with CONFIG_KVM
> (using IS_ENABLED() because KVM can be built as a module)? That could be removed
> if another non-KVM secondary MMU user comes along, but until then, #ifdeffery for
> stats the depend on a single feature seems to be the status quo for this code.
>
I will #ifdef the stat, but I will emphasize in the docs that it is
currently *only* used for KVM so that it makes sense if users without
KVM don't see the stat at all. I will also remove the stat from
show_free_areas() in mm/page_alloc.c as it seems like none of the
#ifdefed stats show up there.
> > #ifdef CONFIG_SWAP
> > NR_SWAPCACHE,
> > #endif
On Thu, Jul 7, 2022 at 2:08 PM Sean Christopherson <[email protected]> wrote:
>
> On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> > Add a helper to account pages used by KVM for page tables in memory
> > secondary pagetable stats. This function will be used by subsequent
> > patches in different archs.
> >
> > Signed-off-by: Yosry Ahmed <[email protected]>
> > ---
> > include/linux/kvm_host.h | 10 ++++++++++
> > 1 file changed, 10 insertions(+)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 3b40f8d68fbb1..032821d77e920 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -2241,6 +2241,16 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
> > }
> > #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
> >
> > +/*
> > + * If more than one page is being (un)accounted, @virt must be the address of
> > + * the first page of a block of pages what were allocated together (i.e
> > + * accounted together).
>
> Sorry for the belated thoughts...
>
> If you spin a v7, can you add a note to call out that mod_lruvec_page_state() is
> itself thread-safe? Caught my eye because the TDP MMU usage happens while holding
> mmu_lock for read.
>
Sure! I will send a v7 anyway to address the comments on patch 1. Thanks!
> > + */
> > +static inline void kvm_account_pgtable_pages(void *virt, int nr)
> > +{
> > + mod_lruvec_page_state(virt_to_page(virt), NR_SECONDARY_PAGETABLE, nr);
> > +}
> > +
> > /*
> > * This defines how many reserved entries we want to keep before we
> > * kick the vcpu to the userspace to avoid dirty ring full. This
> > --
> > 2.37.0.rc0.161.g10f37bed90-goog
> >
On Tue, Jul 12, 2022, Yosry Ahmed wrote:
> Thanks for taking another look at this!
>
> On Thu, Jul 7, 2022 at 1:59 PM Sean Christopherson <[email protected]> wrote:
> >
> > On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > index aab70355d64f3..13190d298c986 100644
> > > --- a/include/linux/mmzone.h
> > > +++ b/include/linux/mmzone.h
> > > @@ -216,6 +216,7 @@ enum node_stat_item {
> > > NR_KERNEL_SCS_KB, /* measured in KiB */
> > > #endif
> > > NR_PAGETABLE, /* used for pagetables */
> > > + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. kvm shadow pagetables */
> >
> > Nit, s/kvm/KVM, and drop the "shadow", which might be misinterpreted as saying KVM
> > pagetables are only accounted when KVM is using shadow paging. KVM's usage of "shadow"
> > is messy, so I totally understand why you included it, but in this case it's unnecessary
> > and potentially confusing.
> >
> > And finally, something that's not a nit. Should this be wrapped with CONFIG_KVM
> > (using IS_ENABLED() because KVM can be built as a module)? That could be removed
> > if another non-KVM secondary MMU user comes along, but until then, #ifdeffery for
> > stats the depend on a single feature seems to be the status quo for this code.
> >
>
> I will #ifdef the stat, but I will emphasize in the docs that is
> currently *only* used for KVM so that it makes sense if users without
> KVM don't see the stat at all. I will also remove the stat from
> show_free_areas() in mm/page_alloc.c as it seems like none of the
> #ifdefed stats show up there.
It might be worth getting someone from mm/ to weigh in before going through the
trouble; my suggestion/question is based purely on the existing code.
On Tue, Jul 12, 2022 at 4:06 PM Sean Christopherson <[email protected]> wrote:
>
> On Tue, Jul 12, 2022, Yosry Ahmed wrote:
> > Thanks for taking another look at this!
> >
> > On Thu, Jul 7, 2022 at 1:59 PM Sean Christopherson <[email protected]> wrote:
> > >
> > > On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> > > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > > index aab70355d64f3..13190d298c986 100644
> > > > --- a/include/linux/mmzone.h
> > > > +++ b/include/linux/mmzone.h
> > > > @@ -216,6 +216,7 @@ enum node_stat_item {
> > > > NR_KERNEL_SCS_KB, /* measured in KiB */
> > > > #endif
> > > > NR_PAGETABLE, /* used for pagetables */
> > > > + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. kvm shadow pagetables */
> > >
> > > Nit, s/kvm/KVM, and drop the "shadow", which might be misinterpreted as saying KVM
> > > pagetables are only accounted when KVM is using shadow paging. KVM's usage of "shadow"
> > > is messy, so I totally understand why you included it, but in this case it's unnecessary
> > > and potentially confusing.
> > >
> > > And finally, something that's not a nit. Should this be wrapped with CONFIG_KVM
> > > (using IS_ENABLED() because KVM can be built as a module)? That could be removed
> > > if another non-KVM secondary MMU user comes along, but until then, #ifdeffery for
> > > stats the depend on a single feature seems to be the status quo for this code.
> > >
> >
> > I will #ifdef the stat, but I will emphasize in the docs that is
> > currently *only* used for KVM so that it makes sense if users without
> > KVM don't see the stat at all. I will also remove the stat from
> > show_free_areas() in mm/page_alloc.c as it seems like none of the
> > #ifdefed stats show up there.
>
> It's might be worth getting someone from mm/ to weigh in before going through the
> trouble, my suggestion/question is based purely on the existing code.
Any mm folks with an opinion about this?
Any preference on whether we should wrap NR_SECONDARY_PAGETABLE stats
with #ifdef CONFIG_KVM for now as it is currently the only source for
this stat?
On Mon, Jul 18, 2022 at 11:26 AM Yosry Ahmed <[email protected]> wrote:
>
> On Tue, Jul 12, 2022 at 4:06 PM Sean Christopherson <[email protected]> wrote:
> >
> > On Tue, Jul 12, 2022, Yosry Ahmed wrote:
> > > Thanks for taking another look at this!
> > >
> > > On Thu, Jul 7, 2022 at 1:59 PM Sean Christopherson <[email protected]> wrote:
> > > >
> > > > On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> > > > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > > > index aab70355d64f3..13190d298c986 100644
> > > > > --- a/include/linux/mmzone.h
> > > > > +++ b/include/linux/mmzone.h
> > > > > @@ -216,6 +216,7 @@ enum node_stat_item {
> > > > > NR_KERNEL_SCS_KB, /* measured in KiB */
> > > > > #endif
> > > > > NR_PAGETABLE, /* used for pagetables */
> > > > > + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. kvm shadow pagetables */
> > > >
> > > > Nit, s/kvm/KVM, and drop the "shadow", which might be misinterpreted as saying KVM
> > > > pagetables are only accounted when KVM is using shadow paging. KVM's usage of "shadow"
> > > > is messy, so I totally understand why you included it, but in this case it's unnecessary
> > > > and potentially confusing.
> > > >
> > > > And finally, something that's not a nit. Should this be wrapped with CONFIG_KVM
> > > > (using IS_ENABLED() because KVM can be built as a module)? That could be removed
> > > > if another non-KVM secondary MMU user comes along, but until then, #ifdeffery for
> > > > stats the depend on a single feature seems to be the status quo for this code.
> > > >
> > >
> > > I will #ifdef the stat, but I will emphasize in the docs that is
> > > currently *only* used for KVM so that it makes sense if users without
> > > KVM don't see the stat at all. I will also remove the stat from
> > > show_free_areas() in mm/page_alloc.c as it seems like none of the
> > > #ifdefed stats show up there.
> >
> > It's might be worth getting someone from mm/ to weigh in before going through the
> > trouble, my suggestion/question is based purely on the existing code.
>
> Any mm folks with an opinion about this?
>
> Any preference on whether we should wrap NR_SECONDARY_PAGETABLE stats
> with #ifdef CONFIG_KVM for now as it is currently the only source for
> this stat?
Any input here?
Johannes, you have been involved in discussions in earlier versions of
this series, any thoughts here?
On Mon, Aug 8, 2022 at 1:06 PM Yosry Ahmed <[email protected]> wrote:
>
> On Mon, Jul 18, 2022 at 11:26 AM Yosry Ahmed <[email protected]> wrote:
> >
> > On Tue, Jul 12, 2022 at 4:06 PM Sean Christopherson <[email protected]> wrote:
> > >
> > > On Tue, Jul 12, 2022, Yosry Ahmed wrote:
> > > > Thanks for taking another look at this!
> > > >
> > > > On Thu, Jul 7, 2022 at 1:59 PM Sean Christopherson <[email protected]> wrote:
> > > > >
> > > > > On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> > > > > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > > > > index aab70355d64f3..13190d298c986 100644
> > > > > > --- a/include/linux/mmzone.h
> > > > > > +++ b/include/linux/mmzone.h
> > > > > > @@ -216,6 +216,7 @@ enum node_stat_item {
> > > > > > NR_KERNEL_SCS_KB, /* measured in KiB */
> > > > > > #endif
> > > > > > NR_PAGETABLE, /* used for pagetables */
> > > > > > + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. kvm shadow pagetables */
> > > > >
> > > > > Nit, s/kvm/KVM, and drop the "shadow", which might be misinterpreted as saying KVM
> > > > > pagetables are only accounted when KVM is using shadow paging. KVM's usage of "shadow"
> > > > > is messy, so I totally understand why you included it, but in this case it's unnecessary
> > > > > and potentially confusing.
> > > > >
> > > > > And finally, something that's not a nit. Should this be wrapped with CONFIG_KVM
> > > > > (using IS_ENABLED() because KVM can be built as a module)? That could be removed
> > > > > if another non-KVM secondary MMU user comes along, but until then, #ifdeffery for
> > > > > stats the depend on a single feature seems to be the status quo for this code.
> > > > >
> > > >
> > > > I will #ifdef the stat, but I will emphasize in the docs that is
> > > > currently *only* used for KVM so that it makes sense if users without
> > > > KVM don't see the stat at all. I will also remove the stat from
> > > > show_free_areas() in mm/page_alloc.c as it seems like none of the
> > > > #ifdefed stats show up there.
> > >
> > > It's might be worth getting someone from mm/ to weigh in before going through the
> > > trouble, my suggestion/question is based purely on the existing code.
> >
> > Any mm folks with an opinion about this?
> >
> > Any preference on whether we should wrap NR_SECONDARY_PAGETABLE stats
> > with #ifdef CONFIG_KVM for now as it is currently the only source for
> > this stat?
>
> Any input here?
>
> Johannes, you have been involved in discussions in earlier versions of
> this series, any thoughts here?
Andrew, do you have an opinion on this? If not, I will send a v7 with
the nits discussed with Sean. I think otherwise this series has
sufficient ACKs.
Would this be merged through the mm tree or kvm tree? This was based
on the kvm/queue branch but I think I can rebase it on top of
mm-unstable, I think all dependencies that this would have added in
kvm/queue would have been fanned to mm by now.
On Mon, Aug 08, 2022 at 01:06:15PM -0700, Yosry Ahmed wrote:
> On Mon, Jul 18, 2022 at 11:26 AM Yosry Ahmed <[email protected]> wrote:
> >
> > On Tue, Jul 12, 2022 at 4:06 PM Sean Christopherson <[email protected]> wrote:
> > >
> > > On Tue, Jul 12, 2022, Yosry Ahmed wrote:
> > > > Thanks for taking another look at this!
> > > >
> > > > On Thu, Jul 7, 2022 at 1:59 PM Sean Christopherson <[email protected]> wrote:
> > > > >
> > > > > On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> > > > > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > > > > index aab70355d64f3..13190d298c986 100644
> > > > > > --- a/include/linux/mmzone.h
> > > > > > +++ b/include/linux/mmzone.h
> > > > > > @@ -216,6 +216,7 @@ enum node_stat_item {
> > > > > > NR_KERNEL_SCS_KB, /* measured in KiB */
> > > > > > #endif
> > > > > > NR_PAGETABLE, /* used for pagetables */
> > > > > > + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. kvm shadow pagetables */
> > > > >
> > > > > Nit, s/kvm/KVM, and drop the "shadow", which might be misinterpreted as saying KVM
> > > > > pagetables are only accounted when KVM is using shadow paging. KVM's usage of "shadow"
> > > > > is messy, so I totally understand why you included it, but in this case it's unnecessary
> > > > > and potentially confusing.
> > > > >
> > > > > And finally, something that's not a nit. Should this be wrapped with CONFIG_KVM
> > > > > (using IS_ENABLED() because KVM can be built as a module)? That could be removed
> > > > > if another non-KVM secondary MMU user comes along, but until then, #ifdeffery for
> > > > > stats the depend on a single feature seems to be the status quo for this code.
> > > > >
> > > >
> > > > I will #ifdef the stat, but I will emphasize in the docs that is
> > > > currently *only* used for KVM so that it makes sense if users without
> > > > KVM don't see the stat at all. I will also remove the stat from
> > > > show_free_areas() in mm/page_alloc.c as it seems like none of the
> > > > #ifdefed stats show up there.
> > >
> > > It's might be worth getting someone from mm/ to weigh in before going through the
> > > trouble, my suggestion/question is based purely on the existing code.
> >
> > Any mm folks with an opinion about this?
> >
> > Any preference on whether we should wrap NR_SECONDARY_PAGETABLE stats
> > with #ifdef CONFIG_KVM for now as it is currently the only source for
> > this stat?
>
> Any input here?
>
> Johannes, you have been involved in discussions in earlier versions of
> this series, any thoughts here?
No super strong feelings here. Most major distros have CONFIG_KVM=y/m,
so it'll be a common fixture anyway, and the ifdef is proooobably not
worth it for hiding it from people. OTOH, the ifdef is useful for
documenting the code.
If you've already ifdeffed it now, I'd say go ahead with
it. Otherwise, don't :) My 2c.
On Mon, Aug 15, 2022 at 8:13 AM Johannes Weiner <[email protected]> wrote:
>
> On Mon, Aug 08, 2022 at 01:06:15PM -0700, Yosry Ahmed wrote:
> > On Mon, Jul 18, 2022 at 11:26 AM Yosry Ahmed <[email protected]> wrote:
> > >
> > > On Tue, Jul 12, 2022 at 4:06 PM Sean Christopherson <[email protected]> wrote:
> > > >
> > > > On Tue, Jul 12, 2022, Yosry Ahmed wrote:
> > > > > Thanks for taking another look at this!
> > > > >
> > > > > On Thu, Jul 7, 2022 at 1:59 PM Sean Christopherson <[email protected]> wrote:
> > > > > >
> > > > > > On Tue, Jun 28, 2022, Yosry Ahmed wrote:
> > > > > > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > > > > > index aab70355d64f3..13190d298c986 100644
> > > > > > > --- a/include/linux/mmzone.h
> > > > > > > +++ b/include/linux/mmzone.h
> > > > > > > @@ -216,6 +216,7 @@ enum node_stat_item {
> > > > > > > NR_KERNEL_SCS_KB, /* measured in KiB */
> > > > > > > #endif
> > > > > > > NR_PAGETABLE, /* used for pagetables */
> > > > > > > + NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. kvm shadow pagetables */
> > > > > >
> > > > > > Nit, s/kvm/KVM, and drop the "shadow", which might be misinterpreted as saying KVM
> > > > > > pagetables are only accounted when KVM is using shadow paging. KVM's usage of "shadow"
> > > > > > is messy, so I totally understand why you included it, but in this case it's unnecessary
> > > > > > and potentially confusing.
> > > > > >
> > > > > > And finally, something that's not a nit. Should this be wrapped with CONFIG_KVM
> > > > > > (using IS_ENABLED() because KVM can be built as a module)? That could be removed
> > > > > > if another non-KVM secondary MMU user comes along, but until then, #ifdeffery for
> > > > > > stats the depend on a single feature seems to be the status quo for this code.
> > > > > >
> > > > >
> > > > > I will #ifdef the stat, but I will emphasize in the docs that is
> > > > > currently *only* used for KVM so that it makes sense if users without
> > > > > KVM don't see the stat at all. I will also remove the stat from
> > > > > show_free_areas() in mm/page_alloc.c as it seems like none of the
> > > > > #ifdefed stats show up there.
> > > >
> > > > It's might be worth getting someone from mm/ to weigh in before going through the
> > > > trouble, my suggestion/question is based purely on the existing code.
> > >
> > > Any mm folks with an opinion about this?
> > >
> > > Any preference on whether we should wrap NR_SECONDARY_PAGETABLE stats
> > > with #ifdef CONFIG_KVM for now as it is currently the only source for
> > > this stat?
> >
> > Any input here?
> >
> > Johannes, you have been involved in discussions in earlier versions of
> > this series, any thoughts here?
>
> No super strong feelings here. Most major distros have CONFIG_KVM=y/n,
> so it'll be a common fixture anyway, and the ifdef is proooobably not
> worth it for hiding it from people. OTOH, the ifdef is useful for
> documenting the code.
>
> If you've already ifdeffed it now, I'd say go ahead with
> it. Otherwise, don't :) My 2c.
Thanks a lot, Johannes! I haven't ifdeffed it yet so I'll send a v7
with a few nits and collect ACKs. Andrew, would you prefer me to
rebase on top of mm-unstable? Or will this go in through the kvm tree?
(currently it's based on an old-ish kvm/queue).
On Mon, 15 Aug 2022 08:39:23 -0700 Yosry Ahmed <[email protected]> wrote:
> Thanks a lot, Johannes! I haven't ifdeffed it yet so I'll send a v7
> with a few nits and collect ACKs. Andrew, would you prefer me to
> rebase on top of mm-unstable? Or will this go in through the kvm tree?
> (currently it's based on an old-ish kvm/queue).
Through KVM is OK by me, assuming there'll be ongoing work which is
dependent on this.
On Tue, 28 Jun 2022 22:09:35 +0000 Yosry Ahmed <[email protected]> wrote:
> We keep track of several kernel memory stats (total kernel memory, page
> tables, stack, vmalloc, etc) on multiple levels (global, per-node,
> per-memcg, etc). These stats give users insight into how much memory
> is used by the kernel and for what purposes.
>
> Currently, memory used by kvm mmu is not accounted in any of those
> kernel memory stats. This patch series accounts the memory pages
> used by KVM for page tables in those stats in a new
> NR_SECONDARY_PAGETABLE stat. This stat can be later extended to account
> for other types of secondary page tables (e.g. iommu page tables).
>
> KVM has a decent number of large allocations that aren't for page
> tables, but for most of them, the number/size of those allocations
> scales linearly with either the number of vCPUs or the amount of memory
> assigned to the VM. KVM's secondary page table allocations do not scale
> linearly, especially when nested virtualization is in use.
>
> From a KVM perspective, NR_SECONDARY_PAGETABLE will scale with KVM's
> per-VM pages_{4k,2m,1g} stats unless the guest is doing something
> bizarre (e.g. accessing only 4kb chunks of 2mb pages so that KVM is
> forced to allocate a large number of page tables even though the guest
> isn't accessing that much memory). However, someone would need to either
> understand how KVM works to make that connection, or know (or be told) to
> go look at KVM's stats if they're running VMs to better decipher the stats.
>
> Furthermore, having NR_PAGETABLE side-by-side with NR_SECONDARY_PAGETABLE
> is informative. For example, when backing a VM with THP vs. HugeTLB,
> NR_SECONDARY_PAGETABLE is roughly the same, but NR_PAGETABLE is an order
> of magnitude higher with THP. So having this stat will at the very least
> prove to be useful for understanding tradeoffs between VM backing types,
> and likely even steer folks towards potential optimizations.
>
> The original discussion with more details about the rationale:
> https://lore.kernel.org/all/[email protected]
>
> This stat will be used by subsequent patches to count KVM mmu
> memory usage.
Nits and triviata:
> --- a/Documentation/filesystems/proc.rst
> +++ b/Documentation/filesystems/proc.rst
> @@ -977,6 +977,7 @@ Example output. You may not have all of these fields.
> SUnreclaim: 142336 kB
> KernelStack: 11168 kB
> PageTables: 20540 kB
> + SecPageTables: 0 kB
> NFS_Unstable: 0 kB
> Bounce: 0 kB
> WritebackTmp: 0 kB
> @@ -1085,6 +1086,9 @@ KernelStack
> Memory consumed by the kernel stacks of all tasks
> PageTables
> Memory consumed by userspace page tables
> +SecPageTables
> + Memory consumed by secondary page tables, this currently
> + currently includes KVM mmu allocations on x86 and arm64.
Something happened to the whitespace there.
> + "Node %d SecPageTables: %8lu kB\n"
> ...
> + nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
The use of "sec" in the user-facing changes and "secondary" in the
programmer-facing changes is irksome. Can we be consistent? I'd
prefer "secondary" throughout.
On Wed, Aug 17, 2022 at 10:24 AM Andrew Morton
<[email protected]> wrote:
>
> On Tue, 28 Jun 2022 22:09:35 +0000 Yosry Ahmed <[email protected]> wrote:
>
> > We keep track of several kernel memory stats (total kernel memory, page
> > tables, stack, vmalloc, etc) on multiple levels (global, per-node,
> > per-memcg, etc). These stats give insights to users to how much memory
> > is used by the kernel and for what purposes.
> >
> > Currently, memory used by kvm mmu is not accounted in any of those
> > kernel memory stats. This patch series accounts the memory pages
> > used by KVM for page tables in those stats in a new
> > NR_SECONDARY_PAGETABLE stat. This stat can be later extended to account
> > for other types of secondary pages tables (e.g. iommu page tables).
> >
> > KVM has a decent number of large allocations that aren't for page
> > tables, but for most of them, the number/size of those allocations
> > scales linearly with either the number of vCPUs or the amount of memory
> > assigned to the VM. KVM's secondary page table allocations do not scale
> > linearly, especially when nested virtualization is in use.
> >
> > >From a KVM perspective, NR_SECONDARY_PAGETABLE will scale with KVM's
> > per-VM pages_{4k,2m,1g} stats unless the guest is doing something
> > bizarre (e.g. accessing only 4kb chunks of 2mb pages so that KVM is
> > forced to allocate a large number of page tables even though the guest
> > isn't accessing that much memory). However, someone would need to either
> > understand how KVM works to make that connection, or know (or be told) to
> > go look at KVM's stats if they're running VMs to better decipher the stats.
> >
> > Furthermore, having NR_PAGETABLE side-by-side with NR_SECONDARY_PAGETABLE
> > is informative. For example, when backing a VM with THP vs. HugeTLB,
> > NR_SECONDARY_PAGETABLE is roughly the same, but NR_PAGETABLE is an order
> > of magnitude higher with THP. So having this stat will at the very least
> > prove to be useful for understanding tradeoffs between VM backing types,
> > and likely even steer folks towards potential optimizations.
> >
> > The original discussion with more details about the rationale:
> > https://lore.kernel.org/all/[email protected]
> >
> > This stat will be used by subsequent patches to count KVM mmu
> > memory usage.
>
> Nits and triviata:
>
> > --- a/Documentation/filesystems/proc.rst
> > +++ b/Documentation/filesystems/proc.rst
> > @@ -977,6 +977,7 @@ Example output. You may not have all of these fields.
> > SUnreclaim: 142336 kB
> > KernelStack: 11168 kB
> > PageTables: 20540 kB
> > + SecPageTables: 0 kB
> > NFS_Unstable: 0 kB
> > Bounce: 0 kB
> > WritebackTmp: 0 kB
> > @@ -1085,6 +1086,9 @@ KernelStack
> > Memory consumed by the kernel stacks of all tasks
> > PageTables
> > Memory consumed by userspace page tables
> > +SecPageTables
> > + Memory consumed by secondary page tables, this currently
> > + currently includes KVM mmu allocations on x86 and arm64.
>
> Something happened to the whitespace there.
Yeah I have the fix for this queued for v7. Thanks!
>
> > + "Node %d SecPageTables: %8lu kB\n"
> > ...
> > + nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
>
> The use of "sec" in the user-facing changes and "secondary" in the
> programmer-facing changes is irksome. Can we be consistent? I'd
> prefer "secondary" throughout.
>
SecondaryPageTables is too long (unfortunately), it messes up the
formatting in node_read_meminfo() and meminfo_proc_show(). I would
prefer "secondary" as well, but I don't know if breaking the format in
this way is okay.
This is what I mean by breaking the format btw (the numbers become misaligned):
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5ad56a0cd593..4f85750a0f8e 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -433,7 +433,7 @@ static ssize_t node_read_meminfo(struct device *dev,
"Node %d ShadowCallStack:%8lu kB\n"
#endif
"Node %d PageTables: %8lu kB\n"
- "Node %d SecPageTables: %8lu kB\n"
+ "Node %d SecondaryPageTables: %8lu kB\n"
"Node %d NFS_Unstable: %8lu kB\n"
"Node %d Bounce: %8lu kB\n"
"Node %d WritebackTmp: %8lu kB\n"
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 208efd4fa52c..b7166d09a38f 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -115,7 +115,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
show_val_kb(m, "PageTables: ",
global_node_page_state(NR_PAGETABLE));
- show_val_kb(m, "SecPageTables: ",
+ show_val_kb(m, "SecondaryPageTables: ",
global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
On Mon, 22 Aug 2022 17:04:57 -0700 Yosry Ahmed <[email protected]> wrote:
> > SecondaryPageTables is too long (unfortunately), it messes up the
> > formatting in node_read_meminfo() and meminfo_proc_show(). I would
> > prefer "secondary" as well, but I don't know if breaking the format in
> > this way is okay.
>
> Any thoughts here Andrew? Change to SecondaryPageTables anyway? Change
> all to use "sec" instead of "secondary"? Leave as-is?
Leave as-is, I guess.
On Wed, Aug 17, 2022 at 3:27 PM Yosry Ahmed <[email protected]> wrote:
>
> On Wed, Aug 17, 2022 at 10:24 AM Andrew Morton
> <[email protected]> wrote:
> >
> > On Tue, 28 Jun 2022 22:09:35 +0000 Yosry Ahmed <[email protected]> wrote:
> >
> > > We keep track of several kernel memory stats (total kernel memory, page
> > > tables, stack, vmalloc, etc) on multiple levels (global, per-node,
> > > per-memcg, etc). These stats give insights to users to how much memory
> > > is used by the kernel and for what purposes.
> > >
> > > Currently, memory used by kvm mmu is not accounted in any of those
> > > kernel memory stats. This patch series accounts the memory pages
> > > used by KVM for page tables in those stats in a new
> > > NR_SECONDARY_PAGETABLE stat. This stat can be later extended to account
> > > for other types of secondary pages tables (e.g. iommu page tables).
> > >
> > > KVM has a decent number of large allocations that aren't for page
> > > tables, but for most of them, the number/size of those allocations
> > > scales linearly with either the number of vCPUs or the amount of memory
> > > assigned to the VM. KVM's secondary page table allocations do not scale
> > > linearly, especially when nested virtualization is in use.
> > >
> > > >From a KVM perspective, NR_SECONDARY_PAGETABLE will scale with KVM's
> > > per-VM pages_{4k,2m,1g} stats unless the guest is doing something
> > > bizarre (e.g. accessing only 4kb chunks of 2mb pages so that KVM is
> > > forced to allocate a large number of page tables even though the guest
> > > isn't accessing that much memory). However, someone would need to either
> > > understand how KVM works to make that connection, or know (or be told) to
> > > go look at KVM's stats if they're running VMs to better decipher the stats.
> > >
> > > Furthermore, having NR_PAGETABLE side-by-side with NR_SECONDARY_PAGETABLE
> > > is informative. For example, when backing a VM with THP vs. HugeTLB,
> > > NR_SECONDARY_PAGETABLE is roughly the same, but NR_PAGETABLE is an order
> > > of magnitude higher with THP. So having this stat will at the very least
> > > prove to be useful for understanding tradeoffs between VM backing types,
> > > and likely even steer folks towards potential optimizations.
> > >
> > > The original discussion with more details about the rationale:
> > > https://lore.kernel.org/all/[email protected]
> > >
> > > This stat will be used by subsequent patches to count KVM mmu
> > > memory usage.
> >
> > Nits and triviata:
> >
> > > --- a/Documentation/filesystems/proc.rst
> > > +++ b/Documentation/filesystems/proc.rst
> > > @@ -977,6 +977,7 @@ Example output. You may not have all of these fields.
> > > SUnreclaim: 142336 kB
> > > KernelStack: 11168 kB
> > > PageTables: 20540 kB
> > > + SecPageTables: 0 kB
> > > NFS_Unstable: 0 kB
> > > Bounce: 0 kB
> > > WritebackTmp: 0 kB
> > > @@ -1085,6 +1086,9 @@ KernelStack
> > > Memory consumed by the kernel stacks of all tasks
> > > PageTables
> > > Memory consumed by userspace page tables
> > > +SecPageTables
> > > + Memory consumed by secondary page tables, this currently
> > > + currently includes KVM mmu allocations on x86 and arm64.
> >
> > Something happened to the whitespace there.
>
> Yeah I have the fix for this queued for v7. Thanks!
>
> >
> > > + "Node %d SecPageTables: %8lu kB\n"
> > > ...
> > > + nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
> >
> > The use of "sec" in the user-facing changes and "secondary" in the
> > programmer-facing changes is irksome. Can we be consistent? I'd
> > prefer "secondary" throughout.
> >
>
> SecondaryPageTables is too long (unfortunately), it messes up the
> formatting in node_read_meminfo() and meminfo_proc_show(). I would
> prefer "secondary" as well, but I don't know if breaking the format in
> this way is okay.
Any thoughts here Andrew? Change to SecondaryPageTables anyway? Change
all to use "sec" instead of "secondary"? Leave as-is?
>
> This is what I mean by breaking the format btw (the numbers become misaligned):
>
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 5ad56a0cd593..4f85750a0f8e 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -433,7 +433,7 @@ static ssize_t node_read_meminfo(struct device *dev,
> "Node %d ShadowCallStack:%8lu kB\n"
> #endif
> "Node %d PageTables: %8lu kB\n"
> - "Node %d SecPageTables: %8lu kB\n"
> + "Node %d SecondaryPageTables: %8lu kB\n"
> "Node %d NFS_Unstable: %8lu kB\n"
> "Node %d Bounce: %8lu kB\n"
> "Node %d WritebackTmp: %8lu kB\n"
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 208efd4fa52c..b7166d09a38f 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -115,7 +115,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> #endif
> show_val_kb(m, "PageTables: ",
> global_node_page_state(NR_PAGETABLE));
> - show_val_kb(m, "SecPageTables: ",
> + show_val_kb(m, "SecondaryPageTables: ",
> global_node_page_state(NR_SECONDARY_PAGETABLE));
>
> show_val_kb(m, "NFS_Unstable: ", 0);