Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::1:20 as permitted sender) client-ip=2620:137:e000::1:20;
Date:   Thu, 29 Jun 2023 12:21:32 +0300
From:   Mike Rapoport <rppt@kernel.org>
To:     Liam Ni <zhiguangni01@gmail.com>
Cc:     dave.hansen@linux.intel.com, luto@kernel.org, peterz@infradead.org,
        tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, x86@kernel.org,
        hpa@zytor.com, akpm@linux-foundation.org,
        linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: Re: [PATCH V2] x86,NUMA:improve the execution efficiency of
 numa_meminfo_cover_memory()
Message-ID: <20230629092132.GP52412@kernel.org>
References: <20230619075315.49114-1-zhiguangni01@gmail.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20230619075315.49114-1-zhiguangni01@gmail.com>
Precedence: bulk

On Mon, Jun 19, 2023 at 03:53:15PM +0800, Liam Ni wrote:
> The number of pages in memblock that doesn't have the node
> assigned,which also means that these pages are not in numa_info.
> So these pages can represent the number of lose pages.
> 
> Signed-off-by: Liam Ni <zhiguangni01@gmail.com>
> ---
>  arch/x86/mm/numa.c | 25 ++++++-------------------
>  include/linux/mm.h |  2 ++
>  mm/mm_init.c       | 26 ++++++++++++++++++++++++++
>  3 files changed, 34 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 2aadb2019b4f..ffe3b771eac7 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -451,28 +451,15 @@ EXPORT_SYMBOL(__node_distance);
>   * Sanity check to catch more bad NUMA configurations (they are amazingly
>   * common).  Make sure the nodes cover all memory.
>   */
> -static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
> +static bool __init numa_meminfo_cover_memory(void)
>  {
> -	u64 numaram, e820ram;
> -	int i;
> -
> -	numaram = 0;
> -	for (i = 0; i < mi->nr_blks; i++) {
> -		u64 s = mi->blk[i].start >> PAGE_SHIFT;
> -		u64 e = mi->blk[i].end >> PAGE_SHIFT;
> -		numaram += e - s;
> -		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
> -		if ((s64)numaram < 0)
> -			numaram = 0;
> -	}
> +	u64 lo_pages;
>  
> -	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
> +	lo_pages = without_node_pages_in_range(0, max_pfn);

The entire validation of NUMA coverage can be done in memblock; it's enough
to pass the threshold to that function and it can verify that the amount of
memory without the node id set is less than that threshold.
Then the call to numa_meminfo_cover_memory() can be replaced with, say,
memblock_validate_numa_coverage(SZ_1M).

This applies not only to x86, but to loongarch as well.

And once architecture-specific code no longer uses *absent_pages_in_range(),
those functions can be made static in mm/mm_init.c.

>  
>  	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
> -	if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
> -		printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
> -		       (numaram << PAGE_SHIFT) >> 20,
> -		       (e820ram << PAGE_SHIFT) >> 20);
> +	if (lo_pages >= (1 << (20 - PAGE_SHIFT))) {
> +		printk(KERN_ERR "NUMA: we lose more than 1M pages\n");
>  		return false;
>  	}
>  	return true;
> @@ -583,7 +570,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
>  			return -EINVAL;
>  		}
>  	}
> -	if (!numa_meminfo_cover_memory(mi))
> +	if (!numa_meminfo_cover_memory())
>  		return -EINVAL;
>  
>  	/* Finally register nodes. */
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 3ddd18a89b66..1d5085a8f93b 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1132,6 +1132,32 @@ static void __init adjust_zone_range_for_zone_movable(int nid,
>  	}
>  }
>  
> +/**
> + * @start_pfn: The start PFN to start searching for holes
> + * @end_pfn: The end PFN to stop searching for holes
> + *
> + * Return: Return the number of page frames without node assigned within a range.
> + */
> +unsigned long __init without_node_pages_in_range(unsigned long start_pfn,
> +						unsigned long end_pfn)
> +{
> +	struct memblock_type *type = &memblock.memory;
> +	struct memblock_region *r;
> +	unsigned long num_pages;
> +	int i, nid;
> +
> +	for (i = 0; i < type->cnt; i++) {

Please use for_each_mem_pfn_range() or for_each_mem_region() loop here.

> +		r = &type->regions[i];
> +		nid = memblock_get_region_node(r);
> +
> +		if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
> +			continue;
> +		if (nid == NUMA_NO_NODE)
> +			num_pages += PFN_DOWN(r->base + r->size) - PFN_UP(r->base);
> +	}
> +	return num_pages;
> +}
> +
>  /*
>   * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
>   * then all holes in the requested range will be accounted for.
> -- 
> 2.25.1
> 

-- 
Sincerely yours,
Mike.