From this discussion:
https://www.spinics.net/lists/linux-mm/msg145604.html
I investigated whether it is feasible to re-enable deferred page
initialization on xen's para-vitalized domains. After studying the
code, I found non-intrusive way to do just that.
All we need to do is to assume that page-table's pages are pinned early in
boot, which is always true, and add a new PV OP call to notify guests that
boot allocator is finished, so we can set all the necessary fields in
already initialized struct pages.
I have tested this on my laptop with 64-bit kernel, but I would appreciate
if someone could provide more xen testing.
Apply against: linux-next. Enable the following configs:
CONFIG_XEN_PV=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
The above two are needed to test deferred page initialization on PV Xen
domains. If fix is applied correctly, dmesg should output line(s) like this
during boot:
[ 0.266180] node 0 initialised, 717570 pages in 36ms
CONFIG_DEBUG_VM=y
This is needed to poison struct page's memory, otherwise it would be all
zero.
CONFIG_DEBUG_VM_PGFLAGS=y
Verifies that we do not access struct pages flags while memory is still
poisoned (struct pages are not initialized yet).
Pavel Tatashin (1):
xen, mm: Allow deferred page initalization for xen pv domains
arch/x86/include/asm/paravirt.h | 9 +++++++++
arch/x86/include/asm/paravirt_types.h | 3 +++
arch/x86/kernel/paravirt.c | 1 +
arch/x86/mm/init_32.c | 1 +
arch/x86/mm/init_64.c | 1 +
arch/x86/xen/mmu_pv.c | 38 ++++++++++++++++++++++++-----------
mm/page_alloc.c | 4 ----
7 files changed, 41 insertions(+), 16 deletions(-)
--
2.16.2
Juergen Gross noticed that commit
f7f99100d8d ("mm: stop zeroing memory during allocation in vmemmap")
broke XEN PV domains when deferred struct page initialization is enabled.
This is because the xen's PagePinned() flag is getting erased from struct
pages when they are initialized later in boot.
Juergen fixed this problem by disabling deferred pages on xen pv domains.
However, it is desirable to have this feature available, as it reduces boot
time. This fix re-enables the feature for pv-dmains, and fixes the problem
the following way:
The fix is to delay setting PagePinned flag until struct pages for all
allocated memory are initialized (until free_all_bootmem()).
A new hypervisor op pv_init_ops.after_bootmem() is called to let xen know
that boot allocator is done, and hence struct pages for all the allocated
memory are now initialized. If deferred page initialization is enabled, the
rest of struct pages are going to be initialized later in boot once
page_alloc_init_late() is called.
xen_after_bootmem() is xen's implementation of pv_init_ops.after_bootmem(),
we walk page table and mark every page as pinned.
Signed-off-by: Pavel Tatashin <[email protected]>
---
arch/x86/include/asm/paravirt.h | 9 +++++++++
arch/x86/include/asm/paravirt_types.h | 3 +++
arch/x86/kernel/paravirt.c | 1 +
arch/x86/mm/init_32.c | 1 +
arch/x86/mm/init_64.c | 1 +
arch/x86/xen/mmu_pv.c | 38 ++++++++++++++++++++++++-----------
mm/page_alloc.c | 4 ----
7 files changed, 41 insertions(+), 16 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9be2bf13825b..737e596a9836 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -820,6 +820,11 @@ static inline notrace unsigned long arch_local_irq_save(void)
extern void default_banner(void);
+static inline void paravirt_after_bootmem(void)
+{
+ pv_init_ops.after_bootmem();
+}
+
#else /* __ASSEMBLY__ */
#define _PVSITE(ptype, clobbers, ops, word, algn) \
@@ -964,6 +969,10 @@ static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
}
+
+static inline void paravirt_after_bootmem(void)
+{
+}
#endif /* __ASSEMBLY__ */
#endif /* !CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 180bc0bff0fb..da78a3610168 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -86,6 +86,9 @@ struct pv_init_ops {
*/
unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
unsigned long addr, unsigned len);
+
+ /* called right after we finish boot allocator */
+ void (*after_bootmem)(void);
} __no_randomize_layout;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 99dc79e76bdc..7b5f931e2e3a 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -315,6 +315,7 @@ struct pv_info pv_info = {
struct pv_init_ops pv_init_ops = {
.patch = native_patch,
+ .after_bootmem = paravirt_nop,
};
struct pv_time_ops pv_time_ops = {
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 79cb066f40c0..6096d0d9ecbc 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -763,6 +763,7 @@ void __init mem_init(void)
free_all_bootmem();
after_bootmem = 1;
+ paravirt_after_bootmem();
mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 332f6e25977a..70b7b5093d07 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1189,6 +1189,7 @@ void __init mem_init(void)
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
+ paravirt_after_bootmem();
/*
* Must be done after boot memory is put on freelist, because here we
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index d20763472920..603589809334 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -116,6 +116,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
static phys_addr_t xen_pt_base, xen_pt_size __initdata;
+static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready);
+
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
* redzone above it, so round it up to a PGD boundary.
@@ -155,11 +157,18 @@ void make_lowmem_page_readwrite(void *vaddr)
}
+/*
+ * During early boot all pages are pinned, but we do not have struct pages,
+ * so return true until struct pages are ready.
+ */
static bool xen_page_pinned(void *ptr)
{
- struct page *page = virt_to_page(ptr);
+ if (static_branch_likely(&xen_struct_pages_ready)) {
+ struct page *page = virt_to_page(ptr);
- return PagePinned(page);
+ return PagePinned(page);
+ }
+ return true;
}
static void xen_extend_mmu_update(const struct mmu_update *update)
@@ -836,11 +845,6 @@ void xen_mm_pin_all(void)
spin_unlock(&pgd_lock);
}
-/*
- * The init_mm pagetable is really pinned as soon as its created, but
- * that's before we have page structures to store the bits. So do all
- * the book-keeping now.
- */
static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
enum pt_level level)
{
@@ -848,8 +852,18 @@ static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
return 0;
}
-static void __init xen_mark_init_mm_pinned(void)
+/*
+ * The init_mm pagetable is really pinned as soon as its created, but
+ * that's before we have page structures to store the bits. So do all
+ * the book-keeping now once struct pages for allocated pages are
+ * initialized. This happens only after free_all_bootmem() is called.
+ */
+static void __init xen_after_bootmem(void)
{
+ static_branch_enable(&xen_struct_pages_ready);
+#ifdef CONFIG_X86_64
+ SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}
@@ -1623,14 +1637,15 @@ static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
unsigned level)
{
- bool pinned = PagePinned(virt_to_page(mm->pgd));
+ bool pinned = xen_page_pinned(mm->pgd);
trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
if (pinned) {
struct page *page = pfn_to_page(pfn);
- SetPagePinned(page);
+ if (static_branch_likely(&xen_struct_pages_ready))
+ SetPagePinned(page);
if (!PageHighMem(page)) {
xen_mc_batch();
@@ -2364,9 +2379,8 @@ static void __init xen_post_allocator_init(void)
#ifdef CONFIG_X86_64
pv_mmu_ops.write_cr3 = &xen_write_cr3;
- SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
- xen_mark_init_mm_pinned();
+ pv_init_ops.after_bootmem = &xen_after_bootmem;
}
static void xen_leave_lazy_mmu(void)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2836bc9e0999..6f9d34bdd071 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,7 +46,6 @@
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
-#include <xen/xen.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
@@ -317,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
/* Always populate low zones for address-constrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
- /* Xen PV domains need page structures early */
- if (xen_pv_domain())
- return true;
(*nr_initialised)++;
if ((*nr_initialised > pgdat->static_init_pgcnt) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
--
2.16.2
On 24/02/18 00:25, Pavel Tatashin wrote:
> Juergen Gross noticed that commit
> f7f99100d8d ("mm: stop zeroing memory during allocation in vmemmap")
> broke XEN PV domains when deferred struct page initialization is enabled.
>
> This is because the xen's PagePinned() flag is getting erased from struct
> pages when they are initialized later in boot.
>
> Juergen fixed this problem by disabling deferred pages on xen pv domains.
> However, it is desirable to have this feature available, as it reduces boot
> time. This fix re-enables the feature for pv-dmains, and fixes the problem
> the following way:
>
> The fix is to delay setting PagePinned flag until struct pages for all
> allocated memory are initialized (until free_all_bootmem()).
>
> A new hypervisor op pv_init_ops.after_bootmem() is called to let xen know
> that boot allocator is done, and hence struct pages for all the allocated
> memory are now initialized. If deferred page initialization is enabled, the
> rest of struct pages are going to be initialized later in boot once
> page_alloc_init_late() is called.
>
> xen_after_bootmem() is xen's implementation of pv_init_ops.after_bootmem(),
> we walk page table and mark every page as pinned.
>
> Signed-off-by: Pavel Tatashin <[email protected]>
> ---
> arch/x86/include/asm/paravirt.h | 9 +++++++++
> arch/x86/include/asm/paravirt_types.h | 3 +++
> arch/x86/kernel/paravirt.c | 1 +
> arch/x86/mm/init_32.c | 1 +
> arch/x86/mm/init_64.c | 1 +
> arch/x86/xen/mmu_pv.c | 38 ++++++++++++++++++++++++-----------
> mm/page_alloc.c | 4 ----
> 7 files changed, 41 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index 9be2bf13825b..737e596a9836 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -820,6 +820,11 @@ static inline notrace unsigned long arch_local_irq_save(void)
>
> extern void default_banner(void);
>
> +static inline void paravirt_after_bootmem(void)
> +{
> + pv_init_ops.after_bootmem();
> +}
> +
Putting this in the paravirt framework is overkill IMO. There is no need
to patch the callsites for optimal performance.
I'd put it into struct x86_hyper_init and pre-init it with x86_init_noop
> #else /* __ASSEMBLY__ */
>
> #define _PVSITE(ptype, clobbers, ops, word, algn) \
> @@ -964,6 +969,10 @@ static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
> static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
> {
> }
> +
> +static inline void paravirt_after_bootmem(void)
> +{
> +}
> #endif /* __ASSEMBLY__ */
> #endif /* !CONFIG_PARAVIRT */
> #endif /* _ASM_X86_PARAVIRT_H */
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index 180bc0bff0fb..da78a3610168 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -86,6 +86,9 @@ struct pv_init_ops {
> */
> unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
> unsigned long addr, unsigned len);
> +
> + /* called right after we finish boot allocator */
> + void (*after_bootmem)(void);
> } __no_randomize_layout;
>
>
> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 99dc79e76bdc..7b5f931e2e3a 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -315,6 +315,7 @@ struct pv_info pv_info = {
>
> struct pv_init_ops pv_init_ops = {
> .patch = native_patch,
> + .after_bootmem = paravirt_nop,
> };
>
> struct pv_time_ops pv_time_ops = {
> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
> index 79cb066f40c0..6096d0d9ecbc 100644
> --- a/arch/x86/mm/init_32.c
> +++ b/arch/x86/mm/init_32.c
> @@ -763,6 +763,7 @@ void __init mem_init(void)
> free_all_bootmem();
>
> after_bootmem = 1;
> + paravirt_after_bootmem();
>
> mem_init_print_info(NULL);
> printk(KERN_INFO "virtual kernel memory layout:\n"
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 332f6e25977a..70b7b5093d07 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1189,6 +1189,7 @@ void __init mem_init(void)
> /* this will put all memory onto the freelists */
> free_all_bootmem();
> after_bootmem = 1;
> + paravirt_after_bootmem();
>
> /*
> * Must be done after boot memory is put on freelist, because here we
> diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
> index d20763472920..603589809334 100644
> --- a/arch/x86/xen/mmu_pv.c
> +++ b/arch/x86/xen/mmu_pv.c
> @@ -116,6 +116,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
>
> static phys_addr_t xen_pt_base, xen_pt_size __initdata;
>
> +static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready);
> +
> /*
> * Just beyond the highest usermode address. STACK_TOP_MAX has a
> * redzone above it, so round it up to a PGD boundary.
> @@ -155,11 +157,18 @@ void make_lowmem_page_readwrite(void *vaddr)
> }
>
>
> +/*
> + * During early boot all pages are pinned, but we do not have struct pages,
> + * so return true until struct pages are ready.
> + */
Uuh, this comment is just not true.
The "pinned" state for Xen means it is a pv pagetable known to Xen. Such
pages are read-only for the guest and can be modified via hypercalls
only.
So either the "pinned" state will be tested for page tables only, in
which case the comment needs adjustment, or the code is wrong.
Juergen
Hi Juergen,
Thank you for taking a look at this patch, I will address your
comments, and send out an updated patch.
>> extern void default_banner(void);
>>
>> +static inline void paravirt_after_bootmem(void)
>> +{
>> + pv_init_ops.after_bootmem();
>> +}
>> +
>
> Putting this in the paravirt framework is overkill IMO. There is no need
> to patch the callsites for optimal performance.
>
> I'd put it into struct x86_hyper_init and pre-init it with x86_init_noop
Sure, I will move it into x86_hyper_init.
>>
>> +/*
>> + * During early boot all pages are pinned, but we do not have struct pages,
>> + * so return true until struct pages are ready.
>> + */
>
> Uuh, this comment is just not true.
>
> The "pinned" state for Xen means it is a pv pagetable known to Xen. Such
> pages are read-only for the guest and can be modified via hypercalls
> only.
>
> So either the "pinned" state will be tested for page tables only, in
> which case the comment needs adjustment, or the code is wrong.
The comment should state: During early boot all _page table_ pages are pinned
Thank you,
Pavel