During boot, all non-reserved memblock memory is exposed to the buddy
allocator. Poisoning all that memory with KASAN lengthens boot time,
especially on systems with large amount of RAM. This patch makes
page_alloc to not call kasan_free_pages() on all new memory.
__free_pages_core() is used when exposing fresh memory during system
boot and when onlining memory during hotplug. This patch adds a new
FPI_SKIP_KASAN_POISON flag and passes it to __free_pages_ok() through
free_pages_prepare() from __free_pages_core().
This has little impact on KASAN memory tracking.
Assuming that there are no references to newly exposed pages before they
are ever allocated, there won't be any intended (but buggy) accesses to
that memory that KASAN would normally detect.
However, with this patch, KASAN stops detecting wild and large
out-of-bounds accesses that happen to land on a fresh memory page that
was never allocated. This is taken as an acceptable trade-off.
All memory allocated normally when the boot is over keeps getting
poisoned as usual.
Signed-off-by: Andrey Konovalov <[email protected]>
---
Resending with Change-Id dropped.
---
mm/page_alloc.c | 43 ++++++++++++++++++++++++++++++++-----------
1 file changed, 32 insertions(+), 11 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b55c9c95364..f10966e3b4a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -108,6 +108,17 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/*
+ * Don't poison memory with KASAN.
+ * During boot, all non-reserved memblock memory is exposed to the buddy
+ * allocator. Poisoning all that memory lengthens boot time, especially on
+ * systems with large amount of RAM. This flag is used to skip that poisoning.
+ * Assuming that there are no references to those newly exposed pages before
+ * they are ever allocated, this has little effect on KASAN memory tracking.
+ * All memory allocated normally after boot gets poisoned as usual.
+ */
+#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
@@ -384,10 +395,14 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
-static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+static inline void kasan_free_nondeferred_pages(struct page *page, int order,
+ fpi_t fpi_flags)
{
- if (!static_branch_unlikely(&deferred_pages))
- kasan_free_pages(page, order);
+ if (static_branch_unlikely(&deferred_pages))
+ return;
+ if (fpi_flags & FPI_SKIP_KASAN_POISON)
+ return;
+ kasan_free_pages(page, order);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -438,7 +453,13 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+static inline void kasan_free_nondeferred_pages(struct page *page, int order,
+ fpi_t fpi_flags)
+{
+ if (fpi_flags & FPI_SKIP_KASAN_POISON)
+ return;
+ kasan_free_pages(page, order);
+}
static inline bool early_page_uninitialised(unsigned long pfn)
{
@@ -1216,7 +1237,7 @@ static void kernel_init_free_pages(struct page *page, int numpages)
}
static __always_inline bool free_pages_prepare(struct page *page,
- unsigned int order, bool check_free)
+ unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
@@ -1290,7 +1311,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_pagealloc_unmap_pages(page, 1 << order);
- kasan_free_nondeferred_pages(page, order);
+ kasan_free_nondeferred_pages(page, order, fpi_flags);
return true;
}
@@ -1303,7 +1324,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
*/
static bool free_pcp_prepare(struct page *page)
{
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, 0, true, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1323,9 +1344,9 @@ static bool bulkfree_pcp_prepare(struct page *page)
static bool free_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
- return free_pages_prepare(page, 0, true);
+ return free_pages_prepare(page, 0, true, FPI_NONE);
else
- return free_pages_prepare(page, 0, false);
+ return free_pages_prepare(page, 0, false, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1533,7 +1554,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
int migratetype;
unsigned long pfn = page_to_pfn(page);
- if (!free_pages_prepare(page, order, true))
+ if (!free_pages_prepare(page, order, true, fpi_flags))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1570,7 +1591,7 @@ void __free_pages_core(struct page *page, unsigned int order)
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
*/
- __free_pages_ok(page, order, FPI_TO_TAIL);
+ __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
--
2.30.0.478.g8a0d178c01-goog
On Wed, Feb 17, 2021 at 09:59:24PM +0100, Andrey Konovalov wrote:
> During boot, all non-reserved memblock memory is exposed to the buddy
> allocator. Poisoning all that memory with KASAN lengthens boot time,
> especially on systems with large amount of RAM. This patch makes
> page_alloc to not call kasan_free_pages() on all new memory.
>
> __free_pages_core() is used when exposing fresh memory during system
> boot and when onlining memory during hotplug. This patch adds a new
> FPI_SKIP_KASAN_POISON flag and passes it to __free_pages_ok() through
> free_pages_prepare() from __free_pages_core().
>
> This has little impact on KASAN memory tracking.
>
> Assuming that there are no references to newly exposed pages before they
> are ever allocated, there won't be any intended (but buggy) accesses to
> that memory that KASAN would normally detect.
>
> However, with this patch, KASAN stops detecting wild and large
> out-of-bounds accesses that happen to land on a fresh memory page that
> was never allocated. This is taken as an acceptable trade-off.
>
> All memory allocated normally when the boot is over keeps getting
> poisoned as usual.
>
> Signed-off-by: Andrey Konovalov <[email protected]>
The approach looks fine to me. If you don't like the trade-off, I think
you could still leave the kasan poisoning in if CONFIG_DEBUG_KERNEL.
Reviewed-by: Catalin Marinas <[email protected]>
Just curious, have you noticed any issue booting a KASAN_SW_TAGS-enabled
kernel on a system with sufficiently large RAM? Is the boot slow-down
significant?
For MTE, we could look at optimising the poisoning code for page size to
use STGM or DC GZVA but I don't think we can make it unnoticeable for
large systems (especially with DC GZVA, that's like zeroing the whole
RAM at boot).
--
Catalin
On Thu, Feb 18, 2021 at 11:46 AM Catalin Marinas
<[email protected]> wrote:
>
> The approach looks fine to me. If you don't like the trade-off, I think
> you could still leave the kasan poisoning in if CONFIG_DEBUG_KERNEL.
This won't work, Android enables CONFIG_DEBUG_KERNEL in GKI as it turns out :)
> Reviewed-by: Catalin Marinas <[email protected]>
>
> Just curious, have you noticed any issue booting a KASAN_SW_TAGS-enabled
> kernel on a system with sufficiently large RAM? Is the boot slow-down
> significant?
When booting KASAN_SW_TAGS in QEMU with 40G there's a noticeable
start-up delay compared to 2G, but it doesn't seem to be caused by
this memblock->page_alloc poisoning, as removing it makes no
noticeable difference.
I also don't see a noticeable "hang" when booting KASAN_SW_TAGS in
FVP, compared to the one I see with KASAN_HW_TAGS. But I do see a
"hang" in QEMU when going from 2G to 40G with KASAN_HW_TAGS.
It seems that doing STG is much more expensive than writing to the
shadow memory.
> For MTE, we could look at optimising the poisoning code for page size to
> use STGM or DC GZVA but I don't think we can make it unnoticeable for
> large systems (especially with DC GZVA, that's like zeroing the whole
> RAM at boot).
https://bugzilla.kernel.org/show_bug.cgi?id=211817
On Thu, Feb 18, 2021 at 09:24:49PM +0100, Andrey Konovalov wrote:
> On Thu, Feb 18, 2021 at 11:46 AM Catalin Marinas
> <[email protected]> wrote:
> >
> > The approach looks fine to me. If you don't like the trade-off, I think
> > you could still leave the kasan poisoning in if CONFIG_DEBUG_KERNEL.
>
> This won't work, Android enables CONFIG_DEBUG_KERNEL in GKI as it
> turns out :)
And does this option go into production kernels?
> > For MTE, we could look at optimising the poisoning code for page size to
> > use STGM or DC GZVA but I don't think we can make it unnoticeable for
> > large systems (especially with DC GZVA, that's like zeroing the whole
> > RAM at boot).
>
> https://bugzilla.kernel.org/show_bug.cgi?id=211817
A quick hack here if you can give it a try. It can be made more optimal,
maybe calling the set_mem_tag_page directly from kasan:
diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index 7ab500e2ad17..b9b9ca1976eb 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -48,6 +48,20 @@ static inline u8 mte_get_random_tag(void)
return mte_get_ptr_tag(addr);
}
+static inline void __mte_set_mem_tag_page(u64 curr, u64 end)
+{
+ u64 bs = 4 << (read_cpuid(DCZID_EL0) & 0xf);
+
+ do {
+ asm volatile(__MTE_PREAMBLE "dc gva, %0"
+ :
+ : "r" (curr)
+ : "memory");
+
+ curr += bs;
+ } while (curr != end);
+}
+
/*
* Assign allocation tags for a region of memory based on the pointer tag.
* Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
@@ -63,6 +77,11 @@ static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag)
curr = (u64)__tag_set(addr, tag);
end = curr + size;
+ if (IS_ALIGNED((unsigned long)addr, PAGE_SIZE) && size == PAGE_SIZE) {
+ __mte_set_mem_tag_page(curr, end);
+ return;
+ }
+
do {
/*
* 'asm volatile' is required to prevent the compiler to move