Using bin_attributes with a 0 size causes fstat and friends to return that 0 size.
This breaks userspace code that retrieves the size before reading the file. Rather
than reverting 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size
limitation of cpumap ABI") let's put in a size value at compile time. Use direct
comparison and a worst-case maximum to ensure compile time constants. For cpulist the
max is on the order of NR_CPUS * (ceil(log10(NR_CPUS)) + 1) which for 8192 is 40960
(8192 * 5). In order to get near that you'd need a system with every other CPU on one
node or something similar. e.g. (0,2,4,8, ... ). To simplify the math and support
larger NR_CPUS in the future we are using NR_CPUS * 7. We also set it to a min of
PAGE_SIZE to retain the older behavior for smaller NR_CPUS. The cpumap file wants to
be something like NR_CPUS/4 + NR_CPUS/32, for the ","s so for simplicity we are using
NR_CPUS/2.
Add a set of macros for these values to cpumask.h so they can be used in multiple places.
Apply these to the handful of such files in drivers/base/topology.c as well as node.c.
On an 80 cpu 4-node sytem (NR_CPUS == 8192)
before:
-r--r--r--. 1 root root 0 Jul 12 14:08 /sys/devices/system/node/node0/cpulist
-r--r--r--. 1 root root 0 Jul 11 17:25 /sys/devices/system/node/node0/cpumap
after:
-r--r--r--. 1 root root 57344 Jul 13 11:32 /sys/devices/system/node/node0/cpulist
-r--r--r--. 1 root root 4096 Jul 13 11:31 /sys/devices/system/node/node0/cpumap
CONFIG_NR_CPUS = 16384
-r--r--r--. 1 root root 114688 Jul 13 14:03 /sys/devices/system/node/node0/cpulist
-r--r--r--. 1 root root 8192 Jul 13 14:02 /sys/devices/system/node/node0/cpumap
Fixes: 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI")
Fixes: bb9ec13d156 ("topology: use bin_attribute to break the size limitation of cpumap ABI")
Cc: Greg Kroah-Hartman <[email protected]>
Cc: "Rafael J. Wysocki" <[email protected]>
Signed-off-by: Phil Auld <[email protected]>
---
v2: Fix cpumap size calculation. Increase multiplier for cpulist size.
v3: Add comments in code.
v4: Define constants in cpumask.h. Move comments there. Also fix
topology.c.
drivers/base/node.c | 4 ++--
drivers/base/topology.c | 32 ++++++++++++++++----------------
include/linux/cpumask.h | 16 ++++++++++++++++
3 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0ac6376ef7a1..eb0f43784c2b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -45,7 +45,7 @@ static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj,
return n;
}
-static BIN_ATTR_RO(cpumap, 0);
+static BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES);
static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
@@ -66,7 +66,7 @@ static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
return n;
}
-static BIN_ATTR_RO(cpulist, 0);
+static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
/**
* struct node_access_nodes - Access class device to hold user visible
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index ac6ad9ab67f9..89f98be5c5b9 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -62,47 +62,47 @@ define_id_show_func(ppin, "0x%llx");
static DEVICE_ATTR_ADMIN_RO(ppin);
define_siblings_read_func(thread_siblings, sibling_cpumask);
-static BIN_ATTR_RO(thread_siblings, 0);
-static BIN_ATTR_RO(thread_siblings_list, 0);
+static BIN_ATTR_RO(thread_siblings, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(thread_siblings_list, CPULIST_FILE_MAX_BYTES);
define_siblings_read_func(core_cpus, sibling_cpumask);
-static BIN_ATTR_RO(core_cpus, 0);
-static BIN_ATTR_RO(core_cpus_list, 0);
+static BIN_ATTR_RO(core_cpus, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(core_cpus_list, CPULIST_FILE_MAX_BYTES);
define_siblings_read_func(core_siblings, core_cpumask);
-static BIN_ATTR_RO(core_siblings, 0);
-static BIN_ATTR_RO(core_siblings_list, 0);
+static BIN_ATTR_RO(core_siblings, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(core_siblings_list, CPULIST_FILE_MAX_BYTES);
#ifdef TOPOLOGY_CLUSTER_SYSFS
define_siblings_read_func(cluster_cpus, cluster_cpumask);
-static BIN_ATTR_RO(cluster_cpus, 0);
-static BIN_ATTR_RO(cluster_cpus_list, 0);
+static BIN_ATTR_RO(cluster_cpus, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(cluster_cpus_list, CPULIST_FILE_MAX_BYTES);
#endif
#ifdef TOPOLOGY_DIE_SYSFS
define_siblings_read_func(die_cpus, die_cpumask);
-static BIN_ATTR_RO(die_cpus, 0);
-static BIN_ATTR_RO(die_cpus_list, 0);
+static BIN_ATTR_RO(die_cpus, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(die_cpus_list, CPULIST_FILE_MAX_BYTES);
#endif
define_siblings_read_func(package_cpus, core_cpumask);
-static BIN_ATTR_RO(package_cpus, 0);
-static BIN_ATTR_RO(package_cpus_list, 0);
+static BIN_ATTR_RO(package_cpus, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(package_cpus_list, CPULIST_FILE_MAX_BYTES);
#ifdef TOPOLOGY_BOOK_SYSFS
define_id_show_func(book_id, "%d");
static DEVICE_ATTR_RO(book_id);
define_siblings_read_func(book_siblings, book_cpumask);
-static BIN_ATTR_RO(book_siblings, 0);
-static BIN_ATTR_RO(book_siblings_list, 0);
+static BIN_ATTR_RO(book_siblings, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(book_siblings_list, CPULIST_FILE_MAX_BYTES);
#endif
#ifdef TOPOLOGY_DRAWER_SYSFS
define_id_show_func(drawer_id, "%d");
static DEVICE_ATTR_RO(drawer_id);
define_siblings_read_func(drawer_siblings, drawer_cpumask);
-static BIN_ATTR_RO(drawer_siblings, 0);
-static BIN_ATTR_RO(drawer_siblings_list, 0);
+static BIN_ATTR_RO(drawer_siblings, CPUMAP_FILE_MAX_BYTES);
+static BIN_ATTR_RO(drawer_siblings_list, CPULIST_FILE_MAX_BYTES);
#endif
static struct bin_attribute *bin_attrs[] = {
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fe29ac7cc469..007acdb462bd 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1071,4 +1071,20 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
[0] = 1UL \
} }
+/*
+ * Provide a valid theoretical max size for cpumap ands cpulist sysfs files to
+ * avoid breaking userspace which may allocate a buffer based on the size
+ * reported by e.g. fstat.
+ *
+ * For cpumap NR_CPUS/2 is a simplification of NR_CPUS/4 + NR_CPUS/32.
+ *
+ * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up to
+ * 2 orders of magnitude larger than 8192. This covers a worst-case of every
+ * other cpu being on one of two nodes for a very large NR_CPUS.
+ *
+ * Use PAGE_SIZE as a minimum for smaller configurations.
+ */
+#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS >> 1) > PAGE_SIZE) ? NR_CPUS >> 1 : PAGE_SIZE)
+#define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7) > PAGE_SIZE) ? NR_CPUS * 7 : PAGE_SIZE)
+
#endif /* __LINUX_CPUMASK_H */
--
2.31.1
On Thu, Jul 14, 2022 at 02:30:21PM -0400, Phil Auld wrote:
> Using bin_attributes with a 0 size causes fstat and friends to return that 0 size.
> This breaks userspace code that retrieves the size before reading the file. Rather
> than reverting 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size
> limitation of cpumap ABI") let's put in a size value at compile time. Use direct
> comparison and a worst-case maximum to ensure compile time constants. For cpulist the
> max is on the order of NR_CPUS * (ceil(log10(NR_CPUS)) + 1) which for 8192 is 40960
> (8192 * 5). In order to get near that you'd need a system with every other CPU on one
> node or something similar. e.g. (0,2,4,8, ... ). To simplify the math and support
> larger NR_CPUS in the future we are using NR_CPUS * 7. We also set it to a min of
> PAGE_SIZE to retain the older behavior for smaller NR_CPUS. The cpumap file wants to
> be something like NR_CPUS/4 + NR_CPUS/32, for the ","s so for simplicity we are using
> NR_CPUS/2.
>
> Add a set of macros for these values to cpumask.h so they can be used in multiple places.
> Apply these to the handful of such files in drivers/base/topology.c as well as node.c.
Git should have asked you to round at 72 columns, right?
And that's one huge wall of text for the first paragraph, can you make
that more readable?
>
> On an 80 cpu 4-node sytem (NR_CPUS == 8192)
>
> before:
>
> -r--r--r--. 1 root root 0 Jul 12 14:08 /sys/devices/system/node/node0/cpulist
> -r--r--r--. 1 root root 0 Jul 11 17:25 /sys/devices/system/node/node0/cpumap
>
> after:
>
> -r--r--r--. 1 root root 57344 Jul 13 11:32 /sys/devices/system/node/node0/cpulist
> -r--r--r--. 1 root root 4096 Jul 13 11:31 /sys/devices/system/node/node0/cpumap
>
> CONFIG_NR_CPUS = 16384
> -r--r--r--. 1 root root 114688 Jul 13 14:03 /sys/devices/system/node/node0/cpulist
> -r--r--r--. 1 root root 8192 Jul 13 14:02 /sys/devices/system/node/node0/cpumap
>
> Fixes: 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI")
> Fixes: bb9ec13d156 ("topology: use bin_attribute to break the size limitation of cpumap ABI")
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: "Rafael J. Wysocki" <[email protected]>
> Signed-off-by: Phil Auld <[email protected]>
You want to cc: stable too, right?
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -1071,4 +1071,20 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
> [0] = 1UL \
> } }
>
> +/*
> + * Provide a valid theoretical max size for cpumap ands cpulist sysfs files to
> + * avoid breaking userspace which may allocate a buffer based on the size
> + * reported by e.g. fstat.
> + *
> + * For cpumap NR_CPUS/2 is a simplification of NR_CPUS/4 + NR_CPUS/32.
> + *
> + * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up to
> + * 2 orders of magnitude larger than 8192. This covers a worst-case of every
> + * other cpu being on one of two nodes for a very large NR_CPUS.
> + *
> + * Use PAGE_SIZE as a minimum for smaller configurations.
> + */
Please run checkpatch on your patches before sending them out and
getting grumpy emails from maintainers asking you to run checkpatch on
your patches...
greg k-h
On Thu, Jul 14, 2022 at 02:30:21PM -0400, Phil Auld wrote:
> Using bin_attributes with a 0 size causes fstat and friends to return that 0 size.
> This breaks userspace code that retrieves the size before reading the file. Rather
> than reverting 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size
> limitation of cpumap ABI") let's put in a size value at compile time. Use direct
> comparison and a worst-case maximum to ensure compile time constants. For cpulist the
> max is on the order of NR_CPUS * (ceil(log10(NR_CPUS)) + 1) which for 8192 is 40960
> (8192 * 5). In order to get near that you'd need a system with every other CPU on one
> node or something similar. e.g. (0,2,4,8, ... ).
My python says:
>>> len(str(list(range(0, 8194, 2))).replace(" ", ''))
19931
Where the list looks like:
[0,2,4,6,8,10,...,8190,8192]
So excluding open and close braces, max length of the cpu list is
19929 bytes, which is almost3 times smaller than your estimation
(8192 * 7 = 57344).
For NR_CPUS == 16x8192:
>>> len(str(list(range(0, 8194 * 16, 2))).replace(" ", '')) - 2
403308
>>> 8192 * 16 * 7
917504
For NR_CPUS == 128x8192:
>>> len(str(list(range(0, 8194 * 128, 2))).replace(" ", '')) - 2
3639774
>>> 8192 * 16 * 7
7340032
Looks like it 2x overestimates for large lists, and 4x for standard
256-bit mask. Before, it was possible to fit ~1800 cpus into 4k page,
after - 585.
> To simplify the math and support
> larger NR_CPUS in the future we are using NR_CPUS * 7. We also set it to a min of
> PAGE_SIZE to retain the older behavior for smaller NR_CPUS. The cpumap file wants to
> be something like NR_CPUS/4 + NR_CPUS/32, for the ","s so for simplicity we are using
> NR_CPUS/2.
This again overestimates almost twice. In this case, NR_CPUS * 9/32 - 1
is a precise value, if I didn't screw up. Why don't you just use it?
> Add a set of macros for these values to cpumask.h so they can be used in multiple places.
> Apply these to the handful of such files in drivers/base/topology.c as well as node.c.
>
> On an 80 cpu 4-node sytem (NR_CPUS == 8192)
>
> before:
>
> -r--r--r--. 1 root root 0 Jul 12 14:08 /sys/devices/system/node/node0/cpulist
> -r--r--r--. 1 root root 0 Jul 11 17:25 /sys/devices/system/node/node0/cpumap
>
> after:
>
> -r--r--r--. 1 root root 57344 Jul 13 11:32 /sys/devices/system/node/node0/cpulist
> -r--r--r--. 1 root root 4096 Jul 13 11:31 /sys/devices/system/node/node0/cpumap
>
> CONFIG_NR_CPUS = 16384
> -r--r--r--. 1 root root 114688 Jul 13 14:03 /sys/devices/system/node/node0/cpulist
> -r--r--r--. 1 root root 8192 Jul 13 14:02 /sys/devices/system/node/node0/cpumap
>
> Fixes: 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI")
> Fixes: bb9ec13d156 ("topology: use bin_attribute to break the size limitation of cpumap ABI")
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: "Rafael J. Wysocki" <[email protected]>
> Signed-off-by: Phil Auld <[email protected]>
> ---
>
> v2: Fix cpumap size calculation. Increase multiplier for cpulist size.
>
> v3: Add comments in code.
>
> v4: Define constants in cpumask.h. Move comments there. Also fix
> topology.c.
>
>
> drivers/base/node.c | 4 ++--
> drivers/base/topology.c | 32 ++++++++++++++++----------------
> include/linux/cpumask.h | 16 ++++++++++++++++
> 3 files changed, 34 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 0ac6376ef7a1..eb0f43784c2b 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -45,7 +45,7 @@ static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj,
> return n;
> }
>
> -static BIN_ATTR_RO(cpumap, 0);
> +static BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES);
>
> static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
> struct bin_attribute *attr, char *buf,
> @@ -66,7 +66,7 @@ static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
> return n;
> }
>
> -static BIN_ATTR_RO(cpulist, 0);
> +static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
>
> /**
> * struct node_access_nodes - Access class device to hold user visible
> diff --git a/drivers/base/topology.c b/drivers/base/topology.c
> index ac6ad9ab67f9..89f98be5c5b9 100644
> --- a/drivers/base/topology.c
> +++ b/drivers/base/topology.c
> @@ -62,47 +62,47 @@ define_id_show_func(ppin, "0x%llx");
> static DEVICE_ATTR_ADMIN_RO(ppin);
>
> define_siblings_read_func(thread_siblings, sibling_cpumask);
> -static BIN_ATTR_RO(thread_siblings, 0);
> -static BIN_ATTR_RO(thread_siblings_list, 0);
> +static BIN_ATTR_RO(thread_siblings, CPUMAP_FILE_MAX_BYTES);
> +static BIN_ATTR_RO(thread_siblings_list, CPULIST_FILE_MAX_BYTES);
>
> define_siblings_read_func(core_cpus, sibling_cpumask);
> -static BIN_ATTR_RO(core_cpus, 0);
> -static BIN_ATTR_RO(core_cpus_list, 0);
> +static BIN_ATTR_RO(core_cpus, CPUMAP_FILE_MAX_BYTES);
> +static BIN_ATTR_RO(core_cpus_list, CPULIST_FILE_MAX_BYTES);
>
> define_siblings_read_func(core_siblings, core_cpumask);
> -static BIN_ATTR_RO(core_siblings, 0);
> -static BIN_ATTR_RO(core_siblings_list, 0);
> +static BIN_ATTR_RO(core_siblings, CPUMAP_FILE_MAX_BYTES);
> +static BIN_ATTR_RO(core_siblings_list, CPULIST_FILE_MAX_BYTES);
>
> #ifdef TOPOLOGY_CLUSTER_SYSFS
> define_siblings_read_func(cluster_cpus, cluster_cpumask);
> -static BIN_ATTR_RO(cluster_cpus, 0);
> -static BIN_ATTR_RO(cluster_cpus_list, 0);
> +static BIN_ATTR_RO(cluster_cpus, CPUMAP_FILE_MAX_BYTES);
> +static BIN_ATTR_RO(cluster_cpus_list, CPULIST_FILE_MAX_BYTES);
> #endif
>
> #ifdef TOPOLOGY_DIE_SYSFS
> define_siblings_read_func(die_cpus, die_cpumask);
> -static BIN_ATTR_RO(die_cpus, 0);
> -static BIN_ATTR_RO(die_cpus_list, 0);
> +static BIN_ATTR_RO(die_cpus, CPUMAP_FILE_MAX_BYTES);
> +static BIN_ATTR_RO(die_cpus_list, CPULIST_FILE_MAX_BYTES);
> #endif
>
> define_siblings_read_func(package_cpus, core_cpumask);
> -static BIN_ATTR_RO(package_cpus, 0);
> -static BIN_ATTR_RO(package_cpus_list, 0);
> +static BIN_ATTR_RO(package_cpus, CPUMAP_FILE_MAX_BYTES);
> +static BIN_ATTR_RO(package_cpus_list, CPULIST_FILE_MAX_BYTES);
>
> #ifdef TOPOLOGY_BOOK_SYSFS
> define_id_show_func(book_id, "%d");
> static DEVICE_ATTR_RO(book_id);
> define_siblings_read_func(book_siblings, book_cpumask);
> -static BIN_ATTR_RO(book_siblings, 0);
> -static BIN_ATTR_RO(book_siblings_list, 0);
> +static BIN_ATTR_RO(book_siblings, CPUMAP_FILE_MAX_BYTES);
> +static BIN_ATTR_RO(book_siblings_list, CPULIST_FILE_MAX_BYTES);
> #endif
>
> #ifdef TOPOLOGY_DRAWER_SYSFS
> define_id_show_func(drawer_id, "%d");
> static DEVICE_ATTR_RO(drawer_id);
> define_siblings_read_func(drawer_siblings, drawer_cpumask);
> -static BIN_ATTR_RO(drawer_siblings, 0);
> -static BIN_ATTR_RO(drawer_siblings_list, 0);
> +static BIN_ATTR_RO(drawer_siblings, CPUMAP_FILE_MAX_BYTES);
> +static BIN_ATTR_RO(drawer_siblings_list, CPULIST_FILE_MAX_BYTES);
> #endif
>
> static struct bin_attribute *bin_attrs[] = {
> diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
> index fe29ac7cc469..007acdb462bd 100644
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -1071,4 +1071,20 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
> [0] = 1UL \
> } }
>
> +/*
> + * Provide a valid theoretical max size for cpumap ands cpulist sysfs files to
s/ands/and
> + * avoid breaking userspace which may allocate a buffer based on the size
> + * reported by e.g. fstat.
> + *
> + * For cpumap NR_CPUS/2 is a simplification of NR_CPUS/4 + NR_CPUS/32.
> + *
> + * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up to
> + * 2 orders of magnitude larger than 8192. This covers a worst-case of every
> + * other cpu being on one of two nodes for a very large NR_CPUS.
> + *
> + * Use PAGE_SIZE as a minimum for smaller configurations.
> + */
> +#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS >> 1) > PAGE_SIZE) ? NR_CPUS >> 1 : PAGE_SIZE)
> +#define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7) > PAGE_SIZE) ? NR_CPUS * 7 : PAGE_SIZE)
> +
> #endif /* __LINUX_CPUMASK_H */
> --
> 2.31.1
Hi Yury,
On Thu, Jul 14, 2022 at 12:32:17PM -0700 Yury Norov wrote:
> On Thu, Jul 14, 2022 at 02:30:21PM -0400, Phil Auld wrote:
> > Using bin_attributes with a 0 size causes fstat and friends to return that 0 size.
> > This breaks userspace code that retrieves the size before reading the file. Rather
> > than reverting 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size
> > limitation of cpumap ABI") let's put in a size value at compile time. Use direct
> > comparison and a worst-case maximum to ensure compile time constants. For cpulist the
> > max is on the order of NR_CPUS * (ceil(log10(NR_CPUS)) + 1) which for 8192 is 40960
> > (8192 * 5). In order to get near that you'd need a system with every other CPU on one
> > node or something similar. e.g. (0,2,4,8, ... ).
>
> My python says:
> >>> len(str(list(range(0, 8194, 2))).replace(" ", ''))
> 19931
>
> Where the list looks like:
> [0,2,4,6,8,10,...,8190,8192]
>
> So excluding open and close braces, max length of the cpu list is
> 19929 bytes, which is almost3 times smaller than your estimation
> (8192 * 7 = 57344).
>
>
> For NR_CPUS == 16x8192:
> >>> len(str(list(range(0, 8194 * 16, 2))).replace(" ", '')) - 2
> 403308
> >>> 8192 * 16 * 7
> 917504
>
> For NR_CPUS == 128x8192:
> >>> len(str(list(range(0, 8194 * 128, 2))).replace(" ", '')) - 2
> 3639774
> >>> 8192 * 16 * 7
> 7340032
>
> Looks like it 2x overestimates for large lists, and 4x for standard
> 256-bit mask.
Thanks! It's totally possible my math is wrong. But I'm not seeing a
suggestion here. What would you like the formula to be?
I don't have any attachment to the numbers that are in there now. I'm not
surprised it's 2x since it counts all of them. I could just divide it by 2
I suppose.
> Before, it was possible to fit ~1800 cpus into 4k page, after - 585.
>
I don't understand this part. Nothing I did changes how the files are
actually built I think.
> > To simplify the math and support
> > larger NR_CPUS in the future we are using NR_CPUS * 7. We also set it to a min of
> > PAGE_SIZE to retain the older behavior for smaller NR_CPUS. The cpumap file wants to
> > be something like NR_CPUS/4 + NR_CPUS/32, for the ","s so for simplicity we are using
> > NR_CPUS/2.
>
> This again overestimates almost twice. In this case, NR_CPUS * 9/32 - 1
> is a precise value, if I didn't screw up. Why don't you just use it?
>
I was just keeping it simple since it's used twice. But I can do it this way.
Cheers,
Phil
> > Add a set of macros for these values to cpumask.h so they can be used in multiple places.
> > Apply these to the handful of such files in drivers/base/topology.c as well as node.c.
> >
> > On an 80 cpu 4-node sytem (NR_CPUS == 8192)
> >
> > before:
> >
> > -r--r--r--. 1 root root 0 Jul 12 14:08 /sys/devices/system/node/node0/cpulist
> > -r--r--r--. 1 root root 0 Jul 11 17:25 /sys/devices/system/node/node0/cpumap
> >
> > after:
> >
> > -r--r--r--. 1 root root 57344 Jul 13 11:32 /sys/devices/system/node/node0/cpulist
> > -r--r--r--. 1 root root 4096 Jul 13 11:31 /sys/devices/system/node/node0/cpumap
> >
> > CONFIG_NR_CPUS = 16384
> > -r--r--r--. 1 root root 114688 Jul 13 14:03 /sys/devices/system/node/node0/cpulist
> > -r--r--r--. 1 root root 8192 Jul 13 14:02 /sys/devices/system/node/node0/cpumap
> >
> > Fixes: 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI")
> > Fixes: bb9ec13d156 ("topology: use bin_attribute to break the size limitation of cpumap ABI")
> > Cc: Greg Kroah-Hartman <[email protected]>
> > Cc: "Rafael J. Wysocki" <[email protected]>
> > Signed-off-by: Phil Auld <[email protected]>
> > ---
> >
> > v2: Fix cpumap size calculation. Increase multiplier for cpulist size.
> >
> > v3: Add comments in code.
> >
> > v4: Define constants in cpumask.h. Move comments there. Also fix
> > topology.c.
> >
> >
> > drivers/base/node.c | 4 ++--
> > drivers/base/topology.c | 32 ++++++++++++++++----------------
> > include/linux/cpumask.h | 16 ++++++++++++++++
> > 3 files changed, 34 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/base/node.c b/drivers/base/node.c
> > index 0ac6376ef7a1..eb0f43784c2b 100644
> > --- a/drivers/base/node.c
> > +++ b/drivers/base/node.c
> > @@ -45,7 +45,7 @@ static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj,
> > return n;
> > }
> >
> > -static BIN_ATTR_RO(cpumap, 0);
> > +static BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES);
> >
> > static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
> > struct bin_attribute *attr, char *buf,
> > @@ -66,7 +66,7 @@ static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
> > return n;
> > }
> >
> > -static BIN_ATTR_RO(cpulist, 0);
> > +static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
> >
> > /**
> > * struct node_access_nodes - Access class device to hold user visible
> > diff --git a/drivers/base/topology.c b/drivers/base/topology.c
> > index ac6ad9ab67f9..89f98be5c5b9 100644
> > --- a/drivers/base/topology.c
> > +++ b/drivers/base/topology.c
> > @@ -62,47 +62,47 @@ define_id_show_func(ppin, "0x%llx");
> > static DEVICE_ATTR_ADMIN_RO(ppin);
> >
> > define_siblings_read_func(thread_siblings, sibling_cpumask);
> > -static BIN_ATTR_RO(thread_siblings, 0);
> > -static BIN_ATTR_RO(thread_siblings_list, 0);
> > +static BIN_ATTR_RO(thread_siblings, CPUMAP_FILE_MAX_BYTES);
> > +static BIN_ATTR_RO(thread_siblings_list, CPULIST_FILE_MAX_BYTES);
> >
> > define_siblings_read_func(core_cpus, sibling_cpumask);
> > -static BIN_ATTR_RO(core_cpus, 0);
> > -static BIN_ATTR_RO(core_cpus_list, 0);
> > +static BIN_ATTR_RO(core_cpus, CPUMAP_FILE_MAX_BYTES);
> > +static BIN_ATTR_RO(core_cpus_list, CPULIST_FILE_MAX_BYTES);
> >
> > define_siblings_read_func(core_siblings, core_cpumask);
> > -static BIN_ATTR_RO(core_siblings, 0);
> > -static BIN_ATTR_RO(core_siblings_list, 0);
> > +static BIN_ATTR_RO(core_siblings, CPUMAP_FILE_MAX_BYTES);
> > +static BIN_ATTR_RO(core_siblings_list, CPULIST_FILE_MAX_BYTES);
> >
> > #ifdef TOPOLOGY_CLUSTER_SYSFS
> > define_siblings_read_func(cluster_cpus, cluster_cpumask);
> > -static BIN_ATTR_RO(cluster_cpus, 0);
> > -static BIN_ATTR_RO(cluster_cpus_list, 0);
> > +static BIN_ATTR_RO(cluster_cpus, CPUMAP_FILE_MAX_BYTES);
> > +static BIN_ATTR_RO(cluster_cpus_list, CPULIST_FILE_MAX_BYTES);
> > #endif
> >
> > #ifdef TOPOLOGY_DIE_SYSFS
> > define_siblings_read_func(die_cpus, die_cpumask);
> > -static BIN_ATTR_RO(die_cpus, 0);
> > -static BIN_ATTR_RO(die_cpus_list, 0);
> > +static BIN_ATTR_RO(die_cpus, CPUMAP_FILE_MAX_BYTES);
> > +static BIN_ATTR_RO(die_cpus_list, CPULIST_FILE_MAX_BYTES);
> > #endif
> >
> > define_siblings_read_func(package_cpus, core_cpumask);
> > -static BIN_ATTR_RO(package_cpus, 0);
> > -static BIN_ATTR_RO(package_cpus_list, 0);
> > +static BIN_ATTR_RO(package_cpus, CPUMAP_FILE_MAX_BYTES);
> > +static BIN_ATTR_RO(package_cpus_list, CPULIST_FILE_MAX_BYTES);
> >
> > #ifdef TOPOLOGY_BOOK_SYSFS
> > define_id_show_func(book_id, "%d");
> > static DEVICE_ATTR_RO(book_id);
> > define_siblings_read_func(book_siblings, book_cpumask);
> > -static BIN_ATTR_RO(book_siblings, 0);
> > -static BIN_ATTR_RO(book_siblings_list, 0);
> > +static BIN_ATTR_RO(book_siblings, CPUMAP_FILE_MAX_BYTES);
> > +static BIN_ATTR_RO(book_siblings_list, CPULIST_FILE_MAX_BYTES);
> > #endif
> >
> > #ifdef TOPOLOGY_DRAWER_SYSFS
> > define_id_show_func(drawer_id, "%d");
> > static DEVICE_ATTR_RO(drawer_id);
> > define_siblings_read_func(drawer_siblings, drawer_cpumask);
> > -static BIN_ATTR_RO(drawer_siblings, 0);
> > -static BIN_ATTR_RO(drawer_siblings_list, 0);
> > +static BIN_ATTR_RO(drawer_siblings, CPUMAP_FILE_MAX_BYTES);
> > +static BIN_ATTR_RO(drawer_siblings_list, CPULIST_FILE_MAX_BYTES);
> > #endif
> >
> > static struct bin_attribute *bin_attrs[] = {
> > diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
> > index fe29ac7cc469..007acdb462bd 100644
> > --- a/include/linux/cpumask.h
> > +++ b/include/linux/cpumask.h
> > @@ -1071,4 +1071,20 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
> > [0] = 1UL \
> > } }
> >
> > +/*
> > + * Provide a valid theoretical max size for cpumap ands cpulist sysfs files to
>
> s/ands/and
>
> > + * avoid breaking userspace which may allocate a buffer based on the size
> > + * reported by e.g. fstat.
> > + *
> > + * For cpumap NR_CPUS/2 is a simplification of NR_CPUS/4 + NR_CPUS/32.
> > + *
> > + * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up to
> > + * 2 orders of magnitude larger than 8192. This covers a worst-case of every
> > + * other cpu being on one of two nodes for a very large NR_CPUS.
> > + *
> > + * Use PAGE_SIZE as a minimum for smaller configurations.
> > + */
> > +#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS >> 1) > PAGE_SIZE) ? NR_CPUS >> 1 : PAGE_SIZE)
> > +#define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7) > PAGE_SIZE) ? NR_CPUS * 7 : PAGE_SIZE)
> > +
> > #endif /* __LINUX_CPUMASK_H */
> > --
> > 2.31.1
>
--
Hi Greg,
On Thu, Jul 14, 2022 at 08:44:19PM +0200 Greg Kroah-Hartman wrote:
> On Thu, Jul 14, 2022 at 02:30:21PM -0400, Phil Auld wrote:
> > Using bin_attributes with a 0 size causes fstat and friends to return that 0 size.
> > This breaks userspace code that retrieves the size before reading the file. Rather
> > than reverting 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size
> > limitation of cpumap ABI") let's put in a size value at compile time. Use direct
> > comparison and a worst-case maximum to ensure compile time constants. For cpulist the
> > max is on the order of NR_CPUS * (ceil(log10(NR_CPUS)) + 1) which for 8192 is 40960
> > (8192 * 5). In order to get near that you'd need a system with every other CPU on one
> > node or something similar. e.g. (0,2,4,8, ... ). To simplify the math and support
> > larger NR_CPUS in the future we are using NR_CPUS * 7. We also set it to a min of
> > PAGE_SIZE to retain the older behavior for smaller NR_CPUS. The cpumap file wants to
> > be something like NR_CPUS/4 + NR_CPUS/32, for the ","s so for simplicity we are using
> > NR_CPUS/2.
> >
> > Add a set of macros for these values to cpumask.h so they can be used in multiple places.
> > Apply these to the handful of such files in drivers/base/topology.c as well as node.c.
>
> Git should have asked you to round at 72 columns, right?
>
My git doesn't do that. Maybe I'm missing a useful config.
> And that's one huge wall of text for the first paragraph, can you make
> that more readable?
Fixed.
>
> >
> > On an 80 cpu 4-node sytem (NR_CPUS == 8192)
> >
> > before:
> >
> > -r--r--r--. 1 root root 0 Jul 12 14:08 /sys/devices/system/node/node0/cpulist
> > -r--r--r--. 1 root root 0 Jul 11 17:25 /sys/devices/system/node/node0/cpumap
> >
> > after:
> >
> > -r--r--r--. 1 root root 57344 Jul 13 11:32 /sys/devices/system/node/node0/cpulist
> > -r--r--r--. 1 root root 4096 Jul 13 11:31 /sys/devices/system/node/node0/cpumap
> >
> > CONFIG_NR_CPUS = 16384
> > -r--r--r--. 1 root root 114688 Jul 13 14:03 /sys/devices/system/node/node0/cpulist
> > -r--r--r--. 1 root root 8192 Jul 13 14:02 /sys/devices/system/node/node0/cpumap
> >
> > Fixes: 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI")
> > Fixes: bb9ec13d156 ("topology: use bin_attribute to break the size limitation of cpumap ABI")
> > Cc: Greg Kroah-Hartman <[email protected]>
> > Cc: "Rafael J. Wysocki" <[email protected]>
> > Signed-off-by: Phil Auld <[email protected]>
>
> You want to cc: stable too, right?
>
Will do.
> > --- a/include/linux/cpumask.h
> > +++ b/include/linux/cpumask.h
> > @@ -1071,4 +1071,20 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
> > [0] = 1UL \
> > } }
> >
> > +/*
> > + * Provide a valid theoretical max size for cpumap ands cpulist sysfs files to
> > + * avoid breaking userspace which may allocate a buffer based on the size
> > + * reported by e.g. fstat.
> > + *
> > + * For cpumap NR_CPUS/2 is a simplification of NR_CPUS/4 + NR_CPUS/32.
> > + *
> > + * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up to
> > + * 2 orders of magnitude larger than 8192. This covers a worst-case of every
> > + * other cpu being on one of two nodes for a very large NR_CPUS.
> > + *
> > + * Use PAGE_SIZE as a minimum for smaller configurations.
> > + */
>
> Please run checkpatch on your patches before sending them out and
> getting grumpy emails from maintainers asking you to run checkpatch on
> your patches...
>
Aargh, yes, sorry. I used to use the RH internal version all the time but the workflow
has changed and it's not needed anymore. Totally forget about the uptream version.
You'd think this was my first upstream patch...
I'll fix up the numbers and clean it up for the next version.
Cheers,
Phil
> greg k-h
>
--
On Thu, Jul 14, 2022 at 02:33:20PM -0700 Yury Norov wrote:
> On Thu, Jul 14, 2022 at 05:07:25PM -0400, Phil Auld wrote:
> > Hi Yury,
> >
> > On Thu, Jul 14, 2022 at 12:32:17PM -0700 Yury Norov wrote:
> > > On Thu, Jul 14, 2022 at 02:30:21PM -0400, Phil Auld wrote:
> > > > Using bin_attributes with a 0 size causes fstat and friends to return that 0 size.
> > > > This breaks userspace code that retrieves the size before reading the file. Rather
> > > > than reverting 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size
> > > > limitation of cpumap ABI") let's put in a size value at compile time. Use direct
> > > > comparison and a worst-case maximum to ensure compile time constants. For cpulist the
> > > > max is on the order of NR_CPUS * (ceil(log10(NR_CPUS)) + 1) which for 8192 is 40960
> > > > (8192 * 5). In order to get near that you'd need a system with every other CPU on one
> > > > node or something similar. e.g. (0,2,4,8, ... ).
> > >
> > > My python says:
> > > >>> len(str(list(range(0, 8194, 2))).replace(" ", ''))
> > > 19931
> > >
> > > Where the list looks like:
> > > [0,2,4,6,8,10,...,8190,8192]
> > >
> > > So excluding open and close braces, max length of the cpu list is
> > > 19929 bytes, which is almost3 times smaller than your estimation
> > > (8192 * 7 = 57344).
> > >
> > >
> > > For NR_CPUS == 16x8192:
> > > >>> len(str(list(range(0, 8194 * 16, 2))).replace(" ", '')) - 2
> > > 403308
> > > >>> 8192 * 16 * 7
> > > 917504
> > >
> > > For NR_CPUS == 128x8192:
> > > >>> len(str(list(range(0, 8194 * 128, 2))).replace(" ", '')) - 2
> > > 3639774
> > > >>> 8192 * 16 * 7
> > > 7340032
> > >
> > > Looks like it 2x overestimates for large lists, and 4x for standard
> > > 256-bit mask.
> >
> > Thanks! It's totally possible my math is wrong. But I'm not seeing a
> > suggestion here. What would you like the formula to be?
> >
> > I don't have any attachment to the numbers that are in there now. I'm not
> > surprised it's 2x since it counts all of them. I could just divide it by 2
> > I suppose.
>
> Agree.
>
> Actually, I accidentally removed the sentence below from the previous email:
>
> I think there's no simple formula for this, so dividing by 2 would
> probably be a simplest solution.
>
Right! That's what I've got now :) I'll have a new version after I get
it built and sanity tested.
Thanks again,
Phil
> > > Before, it was possible to fit ~1800 cpus into 4k page, after - 585.
> > >
> >
> > I don't understand this part. Nothing I did changes how the files are
> > actually built I think.
>
> I just meant that we'll significantly increase requirements for the
> size.
>
> > > > To simplify the math and support
> > > > larger NR_CPUS in the future we are using NR_CPUS * 7. We also set it to a min of
> > > > PAGE_SIZE to retain the older behavior for smaller NR_CPUS. The cpumap file wants to
> > > > be something like NR_CPUS/4 + NR_CPUS/32, for the ","s so for simplicity we are using
> > > > NR_CPUS/2.
> > >
> > > This again overestimates almost twice. In this case, NR_CPUS * 9/32 - 1
> > > is a precise value, if I didn't screw up. Why don't you just use it?
> > >
> >
> > I was just keeping it simple since it's used twice. But I can do it this way.
> >
> >
> > Cheers,
> > Phil
> >
> > > > Add a set of macros for these values to cpumask.h so they can be used in multiple places.
> > > > Apply these to the handful of such files in drivers/base/topology.c as well as node.c.
> > > >
> > > > On an 80 cpu 4-node sytem (NR_CPUS == 8192)
> > > >
> > > > before:
> > > >
> > > > -r--r--r--. 1 root root 0 Jul 12 14:08 /sys/devices/system/node/node0/cpulist
> > > > -r--r--r--. 1 root root 0 Jul 11 17:25 /sys/devices/system/node/node0/cpumap
> > > >
> > > > after:
> > > >
> > > > -r--r--r--. 1 root root 57344 Jul 13 11:32 /sys/devices/system/node/node0/cpulist
> > > > -r--r--r--. 1 root root 4096 Jul 13 11:31 /sys/devices/system/node/node0/cpumap
> > > >
> > > > CONFIG_NR_CPUS = 16384
> > > > -r--r--r--. 1 root root 114688 Jul 13 14:03 /sys/devices/system/node/node0/cpulist
> > > > -r--r--r--. 1 root root 8192 Jul 13 14:02 /sys/devices/system/node/node0/cpumap
> > > >
> > > > Fixes: 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI")
> > > > Fixes: bb9ec13d156 ("topology: use bin_attribute to break the size limitation of cpumap ABI")
> > > > Cc: Greg Kroah-Hartman <[email protected]>
> > > > Cc: "Rafael J. Wysocki" <[email protected]>
> > > > Signed-off-by: Phil Auld <[email protected]>
> > > > ---
> > > >
> > > > v2: Fix cpumap size calculation. Increase multiplier for cpulist size.
> > > >
> > > > v3: Add comments in code.
> > > >
> > > > v4: Define constants in cpumask.h. Move comments there. Also fix
> > > > topology.c.
> > > >
> > > >
> > > > drivers/base/node.c | 4 ++--
> > > > drivers/base/topology.c | 32 ++++++++++++++++----------------
> > > > include/linux/cpumask.h | 16 ++++++++++++++++
> > > > 3 files changed, 34 insertions(+), 18 deletions(-)
> > > >
> > > > diff --git a/drivers/base/node.c b/drivers/base/node.c
> > > > index 0ac6376ef7a1..eb0f43784c2b 100644
> > > > --- a/drivers/base/node.c
> > > > +++ b/drivers/base/node.c
> > > > @@ -45,7 +45,7 @@ static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj,
> > > > return n;
> > > > }
> > > >
> > > > -static BIN_ATTR_RO(cpumap, 0);
> > > > +static BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES);
> > > >
> > > > static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
> > > > struct bin_attribute *attr, char *buf,
> > > > @@ -66,7 +66,7 @@ static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
> > > > return n;
> > > > }
> > > >
> > > > -static BIN_ATTR_RO(cpulist, 0);
> > > > +static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
> > > >
> > > > /**
> > > > * struct node_access_nodes - Access class device to hold user visible
> > > > diff --git a/drivers/base/topology.c b/drivers/base/topology.c
> > > > index ac6ad9ab67f9..89f98be5c5b9 100644
> > > > --- a/drivers/base/topology.c
> > > > +++ b/drivers/base/topology.c
> > > > @@ -62,47 +62,47 @@ define_id_show_func(ppin, "0x%llx");
> > > > static DEVICE_ATTR_ADMIN_RO(ppin);
> > > >
> > > > define_siblings_read_func(thread_siblings, sibling_cpumask);
> > > > -static BIN_ATTR_RO(thread_siblings, 0);
> > > > -static BIN_ATTR_RO(thread_siblings_list, 0);
> > > > +static BIN_ATTR_RO(thread_siblings, CPUMAP_FILE_MAX_BYTES);
> > > > +static BIN_ATTR_RO(thread_siblings_list, CPULIST_FILE_MAX_BYTES);
> > > >
> > > > define_siblings_read_func(core_cpus, sibling_cpumask);
> > > > -static BIN_ATTR_RO(core_cpus, 0);
> > > > -static BIN_ATTR_RO(core_cpus_list, 0);
> > > > +static BIN_ATTR_RO(core_cpus, CPUMAP_FILE_MAX_BYTES);
> > > > +static BIN_ATTR_RO(core_cpus_list, CPULIST_FILE_MAX_BYTES);
> > > >
> > > > define_siblings_read_func(core_siblings, core_cpumask);
> > > > -static BIN_ATTR_RO(core_siblings, 0);
> > > > -static BIN_ATTR_RO(core_siblings_list, 0);
> > > > +static BIN_ATTR_RO(core_siblings, CPUMAP_FILE_MAX_BYTES);
> > > > +static BIN_ATTR_RO(core_siblings_list, CPULIST_FILE_MAX_BYTES);
> > > >
> > > > #ifdef TOPOLOGY_CLUSTER_SYSFS
> > > > define_siblings_read_func(cluster_cpus, cluster_cpumask);
> > > > -static BIN_ATTR_RO(cluster_cpus, 0);
> > > > -static BIN_ATTR_RO(cluster_cpus_list, 0);
> > > > +static BIN_ATTR_RO(cluster_cpus, CPUMAP_FILE_MAX_BYTES);
> > > > +static BIN_ATTR_RO(cluster_cpus_list, CPULIST_FILE_MAX_BYTES);
> > > > #endif
> > > >
> > > > #ifdef TOPOLOGY_DIE_SYSFS
> > > > define_siblings_read_func(die_cpus, die_cpumask);
> > > > -static BIN_ATTR_RO(die_cpus, 0);
> > > > -static BIN_ATTR_RO(die_cpus_list, 0);
> > > > +static BIN_ATTR_RO(die_cpus, CPUMAP_FILE_MAX_BYTES);
> > > > +static BIN_ATTR_RO(die_cpus_list, CPULIST_FILE_MAX_BYTES);
> > > > #endif
> > > >
> > > > define_siblings_read_func(package_cpus, core_cpumask);
> > > > -static BIN_ATTR_RO(package_cpus, 0);
> > > > -static BIN_ATTR_RO(package_cpus_list, 0);
> > > > +static BIN_ATTR_RO(package_cpus, CPUMAP_FILE_MAX_BYTES);
> > > > +static BIN_ATTR_RO(package_cpus_list, CPULIST_FILE_MAX_BYTES);
> > > >
> > > > #ifdef TOPOLOGY_BOOK_SYSFS
> > > > define_id_show_func(book_id, "%d");
> > > > static DEVICE_ATTR_RO(book_id);
> > > > define_siblings_read_func(book_siblings, book_cpumask);
> > > > -static BIN_ATTR_RO(book_siblings, 0);
> > > > -static BIN_ATTR_RO(book_siblings_list, 0);
> > > > +static BIN_ATTR_RO(book_siblings, CPUMAP_FILE_MAX_BYTES);
> > > > +static BIN_ATTR_RO(book_siblings_list, CPULIST_FILE_MAX_BYTES);
> > > > #endif
> > > >
> > > > #ifdef TOPOLOGY_DRAWER_SYSFS
> > > > define_id_show_func(drawer_id, "%d");
> > > > static DEVICE_ATTR_RO(drawer_id);
> > > > define_siblings_read_func(drawer_siblings, drawer_cpumask);
> > > > -static BIN_ATTR_RO(drawer_siblings, 0);
> > > > -static BIN_ATTR_RO(drawer_siblings_list, 0);
> > > > +static BIN_ATTR_RO(drawer_siblings, CPUMAP_FILE_MAX_BYTES);
> > > > +static BIN_ATTR_RO(drawer_siblings_list, CPULIST_FILE_MAX_BYTES);
> > > > #endif
> > > >
> > > > static struct bin_attribute *bin_attrs[] = {
> > > > diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
> > > > index fe29ac7cc469..007acdb462bd 100644
> > > > --- a/include/linux/cpumask.h
> > > > +++ b/include/linux/cpumask.h
> > > > @@ -1071,4 +1071,20 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
> > > > [0] = 1UL \
> > > > } }
> > > >
> > > > +/*
> > > > + * Provide a valid theoretical max size for cpumap ands cpulist sysfs files to
> > >
> > > s/ands/and
> > >
> > > > + * avoid breaking userspace which may allocate a buffer based on the size
> > > > + * reported by e.g. fstat.
> > > > + *
> > > > + * For cpumap NR_CPUS/2 is a simplification of NR_CPUS/4 + NR_CPUS/32.
> > > > + *
> > > > + * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up to
> > > > + * 2 orders of magnitude larger than 8192. This covers a worst-case of every
> > > > + * other cpu being on one of two nodes for a very large NR_CPUS.
> > > > + *
> > > > + * Use PAGE_SIZE as a minimum for smaller configurations.
> > > > + */
> > > > +#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS >> 1) > PAGE_SIZE) ? NR_CPUS >> 1 : PAGE_SIZE)
> > > > +#define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7) > PAGE_SIZE) ? NR_CPUS * 7 : PAGE_SIZE)
> > > > +
> > > > #endif /* __LINUX_CPUMASK_H */
> > > > --
> > > > 2.31.1
> > >
> >
> > --
>
--
On Thu, Jul 14, 2022 at 05:07:25PM -0400, Phil Auld wrote:
> Hi Yury,
>
> On Thu, Jul 14, 2022 at 12:32:17PM -0700 Yury Norov wrote:
> > On Thu, Jul 14, 2022 at 02:30:21PM -0400, Phil Auld wrote:
> > > Using bin_attributes with a 0 size causes fstat and friends to return that 0 size.
> > > This breaks userspace code that retrieves the size before reading the file. Rather
> > > than reverting 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size
> > > limitation of cpumap ABI") let's put in a size value at compile time. Use direct
> > > comparison and a worst-case maximum to ensure compile time constants. For cpulist the
> > > max is on the order of NR_CPUS * (ceil(log10(NR_CPUS)) + 1) which for 8192 is 40960
> > > (8192 * 5). In order to get near that you'd need a system with every other CPU on one
> > > node or something similar. e.g. (0,2,4,8, ... ).
> >
> > My python says:
> > >>> len(str(list(range(0, 8194, 2))).replace(" ", ''))
> > 19931
> >
> > Where the list looks like:
> > [0,2,4,6,8,10,...,8190,8192]
> >
> > So excluding open and close braces, max length of the cpu list is
> > 19929 bytes, which is almost3 times smaller than your estimation
> > (8192 * 7 = 57344).
> >
> >
> > For NR_CPUS == 16x8192:
> > >>> len(str(list(range(0, 8194 * 16, 2))).replace(" ", '')) - 2
> > 403308
> > >>> 8192 * 16 * 7
> > 917504
> >
> > For NR_CPUS == 128x8192:
> > >>> len(str(list(range(0, 8194 * 128, 2))).replace(" ", '')) - 2
> > 3639774
> > >>> 8192 * 16 * 7
> > 7340032
> >
> > Looks like it 2x overestimates for large lists, and 4x for standard
> > 256-bit mask.
>
> Thanks! It's totally possible my math is wrong. But I'm not seeing a
> suggestion here. What would you like the formula to be?
>
> I don't have any attachment to the numbers that are in there now. I'm not
> surprised it's 2x since it counts all of them. I could just divide it by 2
> I suppose.
Agree.
Actually, I accidentally removed the sentence below from the previous email:
I think there's no simple formula for this, so dividing by 2 would
probably be a simplest solution.
> > Before, it was possible to fit ~1800 cpus into 4k page, after - 585.
> >
>
> I don't understand this part. Nothing I did changes how the files are
> actually built I think.
I just meant that we'll significantly increase requirements for the
size.
> > > To simplify the math and support
> > > larger NR_CPUS in the future we are using NR_CPUS * 7. We also set it to a min of
> > > PAGE_SIZE to retain the older behavior for smaller NR_CPUS. The cpumap file wants to
> > > be something like NR_CPUS/4 + NR_CPUS/32, for the ","s so for simplicity we are using
> > > NR_CPUS/2.
> >
> > This again overestimates almost twice. In this case, NR_CPUS * 9/32 - 1
> > is a precise value, if I didn't screw up. Why don't you just use it?
> >
>
> I was just keeping it simple since it's used twice. But I can do it this way.
>
>
> Cheers,
> Phil
>
> > > Add a set of macros for these values to cpumask.h so they can be used in multiple places.
> > > Apply these to the handful of such files in drivers/base/topology.c as well as node.c.
> > >
> > > On an 80 cpu 4-node sytem (NR_CPUS == 8192)
> > >
> > > before:
> > >
> > > -r--r--r--. 1 root root 0 Jul 12 14:08 /sys/devices/system/node/node0/cpulist
> > > -r--r--r--. 1 root root 0 Jul 11 17:25 /sys/devices/system/node/node0/cpumap
> > >
> > > after:
> > >
> > > -r--r--r--. 1 root root 57344 Jul 13 11:32 /sys/devices/system/node/node0/cpulist
> > > -r--r--r--. 1 root root 4096 Jul 13 11:31 /sys/devices/system/node/node0/cpumap
> > >
> > > CONFIG_NR_CPUS = 16384
> > > -r--r--r--. 1 root root 114688 Jul 13 14:03 /sys/devices/system/node/node0/cpulist
> > > -r--r--r--. 1 root root 8192 Jul 13 14:02 /sys/devices/system/node/node0/cpumap
> > >
> > > Fixes: 75bd50fa841 ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI")
> > > Fixes: bb9ec13d156 ("topology: use bin_attribute to break the size limitation of cpumap ABI")
> > > Cc: Greg Kroah-Hartman <[email protected]>
> > > Cc: "Rafael J. Wysocki" <[email protected]>
> > > Signed-off-by: Phil Auld <[email protected]>
> > > ---
> > >
> > > v2: Fix cpumap size calculation. Increase multiplier for cpulist size.
> > >
> > > v3: Add comments in code.
> > >
> > > v4: Define constants in cpumask.h. Move comments there. Also fix
> > > topology.c.
> > >
> > >
> > > drivers/base/node.c | 4 ++--
> > > drivers/base/topology.c | 32 ++++++++++++++++----------------
> > > include/linux/cpumask.h | 16 ++++++++++++++++
> > > 3 files changed, 34 insertions(+), 18 deletions(-)
> > >
> > > diff --git a/drivers/base/node.c b/drivers/base/node.c
> > > index 0ac6376ef7a1..eb0f43784c2b 100644
> > > --- a/drivers/base/node.c
> > > +++ b/drivers/base/node.c
> > > @@ -45,7 +45,7 @@ static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj,
> > > return n;
> > > }
> > >
> > > -static BIN_ATTR_RO(cpumap, 0);
> > > +static BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES);
> > >
> > > static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
> > > struct bin_attribute *attr, char *buf,
> > > @@ -66,7 +66,7 @@ static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
> > > return n;
> > > }
> > >
> > > -static BIN_ATTR_RO(cpulist, 0);
> > > +static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
> > >
> > > /**
> > > * struct node_access_nodes - Access class device to hold user visible
> > > diff --git a/drivers/base/topology.c b/drivers/base/topology.c
> > > index ac6ad9ab67f9..89f98be5c5b9 100644
> > > --- a/drivers/base/topology.c
> > > +++ b/drivers/base/topology.c
> > > @@ -62,47 +62,47 @@ define_id_show_func(ppin, "0x%llx");
> > > static DEVICE_ATTR_ADMIN_RO(ppin);
> > >
> > > define_siblings_read_func(thread_siblings, sibling_cpumask);
> > > -static BIN_ATTR_RO(thread_siblings, 0);
> > > -static BIN_ATTR_RO(thread_siblings_list, 0);
> > > +static BIN_ATTR_RO(thread_siblings, CPUMAP_FILE_MAX_BYTES);
> > > +static BIN_ATTR_RO(thread_siblings_list, CPULIST_FILE_MAX_BYTES);
> > >
> > > define_siblings_read_func(core_cpus, sibling_cpumask);
> > > -static BIN_ATTR_RO(core_cpus, 0);
> > > -static BIN_ATTR_RO(core_cpus_list, 0);
> > > +static BIN_ATTR_RO(core_cpus, CPUMAP_FILE_MAX_BYTES);
> > > +static BIN_ATTR_RO(core_cpus_list, CPULIST_FILE_MAX_BYTES);
> > >
> > > define_siblings_read_func(core_siblings, core_cpumask);
> > > -static BIN_ATTR_RO(core_siblings, 0);
> > > -static BIN_ATTR_RO(core_siblings_list, 0);
> > > +static BIN_ATTR_RO(core_siblings, CPUMAP_FILE_MAX_BYTES);
> > > +static BIN_ATTR_RO(core_siblings_list, CPULIST_FILE_MAX_BYTES);
> > >
> > > #ifdef TOPOLOGY_CLUSTER_SYSFS
> > > define_siblings_read_func(cluster_cpus, cluster_cpumask);
> > > -static BIN_ATTR_RO(cluster_cpus, 0);
> > > -static BIN_ATTR_RO(cluster_cpus_list, 0);
> > > +static BIN_ATTR_RO(cluster_cpus, CPUMAP_FILE_MAX_BYTES);
> > > +static BIN_ATTR_RO(cluster_cpus_list, CPULIST_FILE_MAX_BYTES);
> > > #endif
> > >
> > > #ifdef TOPOLOGY_DIE_SYSFS
> > > define_siblings_read_func(die_cpus, die_cpumask);
> > > -static BIN_ATTR_RO(die_cpus, 0);
> > > -static BIN_ATTR_RO(die_cpus_list, 0);
> > > +static BIN_ATTR_RO(die_cpus, CPUMAP_FILE_MAX_BYTES);
> > > +static BIN_ATTR_RO(die_cpus_list, CPULIST_FILE_MAX_BYTES);
> > > #endif
> > >
> > > define_siblings_read_func(package_cpus, core_cpumask);
> > > -static BIN_ATTR_RO(package_cpus, 0);
> > > -static BIN_ATTR_RO(package_cpus_list, 0);
> > > +static BIN_ATTR_RO(package_cpus, CPUMAP_FILE_MAX_BYTES);
> > > +static BIN_ATTR_RO(package_cpus_list, CPULIST_FILE_MAX_BYTES);
> > >
> > > #ifdef TOPOLOGY_BOOK_SYSFS
> > > define_id_show_func(book_id, "%d");
> > > static DEVICE_ATTR_RO(book_id);
> > > define_siblings_read_func(book_siblings, book_cpumask);
> > > -static BIN_ATTR_RO(book_siblings, 0);
> > > -static BIN_ATTR_RO(book_siblings_list, 0);
> > > +static BIN_ATTR_RO(book_siblings, CPUMAP_FILE_MAX_BYTES);
> > > +static BIN_ATTR_RO(book_siblings_list, CPULIST_FILE_MAX_BYTES);
> > > #endif
> > >
> > > #ifdef TOPOLOGY_DRAWER_SYSFS
> > > define_id_show_func(drawer_id, "%d");
> > > static DEVICE_ATTR_RO(drawer_id);
> > > define_siblings_read_func(drawer_siblings, drawer_cpumask);
> > > -static BIN_ATTR_RO(drawer_siblings, 0);
> > > -static BIN_ATTR_RO(drawer_siblings_list, 0);
> > > +static BIN_ATTR_RO(drawer_siblings, CPUMAP_FILE_MAX_BYTES);
> > > +static BIN_ATTR_RO(drawer_siblings_list, CPULIST_FILE_MAX_BYTES);
> > > #endif
> > >
> > > static struct bin_attribute *bin_attrs[] = {
> > > diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
> > > index fe29ac7cc469..007acdb462bd 100644
> > > --- a/include/linux/cpumask.h
> > > +++ b/include/linux/cpumask.h
> > > @@ -1071,4 +1071,20 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
> > > [0] = 1UL \
> > > } }
> > >
> > > +/*
> > > + * Provide a valid theoretical max size for cpumap ands cpulist sysfs files to
> >
> > s/ands/and
> >
> > > + * avoid breaking userspace which may allocate a buffer based on the size
> > > + * reported by e.g. fstat.
> > > + *
> > > + * For cpumap NR_CPUS/2 is a simplification of NR_CPUS/4 + NR_CPUS/32.
> > > + *
> > > + * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up to
> > > + * 2 orders of magnitude larger than 8192. This covers a worst-case of every
> > > + * other cpu being on one of two nodes for a very large NR_CPUS.
> > > + *
> > > + * Use PAGE_SIZE as a minimum for smaller configurations.
> > > + */
> > > +#define CPUMAP_FILE_MAX_BYTES (((NR_CPUS >> 1) > PAGE_SIZE) ? NR_CPUS >> 1 : PAGE_SIZE)
> > > +#define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7) > PAGE_SIZE) ? NR_CPUS * 7 : PAGE_SIZE)
> > > +
> > > #endif /* __LINUX_CPUMASK_H */
> > > --
> > > 2.31.1
> >
>
> --