Subject: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation of cpumap ABI

From: Tian Tao <[email protected]>

Reading /sys/devices/system/cpu/cpuX/topology/ returns cpu topology.
However, the size of this file is limited to PAGE_SIZE because of
the limitation for sysfs attribute.
This patch moves to use bin_attribute to extend the ABI to be more
than one page so that cpumap bitmask and list won't be potentially
trimmed.

Signed-off-by: Tian Tao <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: "Rafael J. Wysocki" <[email protected]>
Signed-off-by: Barry Song <[email protected]>
---
drivers/base/topology.c | 115 ++++++++++++++++++++++------------------
1 file changed, 63 insertions(+), 52 deletions(-)

diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 4d254fcc93d1..dd3980124e33 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -21,25 +21,27 @@ static ssize_t name##_show(struct device *dev, \
return sysfs_emit(buf, "%d\n", topology_##name(dev->id)); \
}

-#define define_siblings_show_map(name, mask) \
-static ssize_t name##_show(struct device *dev, \
- struct device_attribute *attr, char *buf) \
-{ \
- return cpumap_print_to_pagebuf(false, buf, topology_##mask(dev->id));\
+#define define_siblings_read_func(name, mask) \
+static ssize_t name##_read(struct file *file, struct kobject *kobj, \
+ struct bin_attribute *attr, char *buf, \
+ loff_t off, size_t count) \
+{ \
+ struct device *dev = kobj_to_dev(kobj); \
+ \
+ return cpumap_print_to_buf(false, buf, topology_##mask(dev->id), \
+ off, count); \
+} \
+ \
+static ssize_t name##_list_read(struct file *file, struct kobject *kobj, \
+ struct bin_attribute *attr, char *buf, \
+ loff_t off, size_t count) \
+{ \
+ struct device *dev = kobj_to_dev(kobj); \
+ \
+ return cpumap_print_to_buf(true, buf, topology_##mask(dev->id), \
+ off, count); \
}

-#define define_siblings_show_list(name, mask) \
-static ssize_t name##_list_show(struct device *dev, \
- struct device_attribute *attr, \
- char *buf) \
-{ \
- return cpumap_print_to_pagebuf(true, buf, topology_##mask(dev->id));\
-}
-
-#define define_siblings_show_func(name, mask) \
- define_siblings_show_map(name, mask); \
- define_siblings_show_list(name, mask)
-
define_id_show_func(physical_package_id);
static DEVICE_ATTR_RO(physical_package_id);

@@ -49,71 +51,80 @@ static DEVICE_ATTR_RO(die_id);
define_id_show_func(core_id);
static DEVICE_ATTR_RO(core_id);

-define_siblings_show_func(thread_siblings, sibling_cpumask);
-static DEVICE_ATTR_RO(thread_siblings);
-static DEVICE_ATTR_RO(thread_siblings_list);
+define_siblings_read_func(thread_siblings, sibling_cpumask);
+static BIN_ATTR_RO(thread_siblings, 0);
+static BIN_ATTR_RO(thread_siblings_list, 0);

-define_siblings_show_func(core_cpus, sibling_cpumask);
-static DEVICE_ATTR_RO(core_cpus);
-static DEVICE_ATTR_RO(core_cpus_list);
+define_siblings_read_func(core_cpus, sibling_cpumask);
+static BIN_ATTR_RO(core_cpus, 0);
+static BIN_ATTR_RO(core_cpus_list, 0);

-define_siblings_show_func(core_siblings, core_cpumask);
-static DEVICE_ATTR_RO(core_siblings);
-static DEVICE_ATTR_RO(core_siblings_list);
+define_siblings_read_func(core_siblings, core_cpumask);
+static BIN_ATTR_RO(core_siblings, 0);
+static BIN_ATTR_RO(core_siblings_list, 0);

-define_siblings_show_func(die_cpus, die_cpumask);
-static DEVICE_ATTR_RO(die_cpus);
-static DEVICE_ATTR_RO(die_cpus_list);
+define_siblings_read_func(die_cpus, die_cpumask);
+static BIN_ATTR_RO(die_cpus, 0);
+static BIN_ATTR_RO(die_cpus_list, 0);

-define_siblings_show_func(package_cpus, core_cpumask);
-static DEVICE_ATTR_RO(package_cpus);
-static DEVICE_ATTR_RO(package_cpus_list);
+define_siblings_read_func(package_cpus, core_cpumask);
+static BIN_ATTR_RO(package_cpus, 0);
+static BIN_ATTR_RO(package_cpus_list, 0);

#ifdef CONFIG_SCHED_BOOK
define_id_show_func(book_id);
static DEVICE_ATTR_RO(book_id);
-define_siblings_show_func(book_siblings, book_cpumask);
-static DEVICE_ATTR_RO(book_siblings);
-static DEVICE_ATTR_RO(book_siblings_list);
+define_siblings_read_func(book_siblings, book_cpumask);
+static BIN_ATTR_RO(book_siblings, 0);
+static BIN_ATTR_RO(book_siblings_list, 0);
#endif

#ifdef CONFIG_SCHED_DRAWER
define_id_show_func(drawer_id);
static DEVICE_ATTR_RO(drawer_id);
-define_siblings_show_func(drawer_siblings, drawer_cpumask);
-static DEVICE_ATTR_RO(drawer_siblings);
-static DEVICE_ATTR_RO(drawer_siblings_list);
+define_siblings_read_func(drawer_siblings, drawer_cpumask);
+static BIN_ATTR_RO(drawer_siblings, 0);
+static BIN_ATTR_RO(drawer_siblings_list, 0);
#endif

+static struct bin_attribute *bin_attrs[] = {
+ &bin_attr_core_cpus,
+ &bin_attr_core_cpus_list,
+ &bin_attr_thread_siblings,
+ &bin_attr_thread_siblings_list,
+ &bin_attr_core_siblings,
+ &bin_attr_core_siblings_list,
+ &bin_attr_die_cpus,
+ &bin_attr_die_cpus_list,
+ &bin_attr_package_cpus,
+ &bin_attr_package_cpus_list,
+#ifdef CONFIG_SCHED_BOOK
+ &bin_attr_book_siblings,
+ &bin_attr_book_siblings_list,
+#endif
+#ifdef CONFIG_SCHED_DRAWER
+ &bin_attr_drawer_siblings,
+ &bin_attr_drawer_siblings_list,
+#endif
+ NULL
+};
+
static struct attribute *default_attrs[] = {
&dev_attr_physical_package_id.attr,
&dev_attr_die_id.attr,
&dev_attr_core_id.attr,
- &dev_attr_thread_siblings.attr,
- &dev_attr_thread_siblings_list.attr,
- &dev_attr_core_cpus.attr,
- &dev_attr_core_cpus_list.attr,
- &dev_attr_core_siblings.attr,
- &dev_attr_core_siblings_list.attr,
- &dev_attr_die_cpus.attr,
- &dev_attr_die_cpus_list.attr,
- &dev_attr_package_cpus.attr,
- &dev_attr_package_cpus_list.attr,
#ifdef CONFIG_SCHED_BOOK
&dev_attr_book_id.attr,
- &dev_attr_book_siblings.attr,
- &dev_attr_book_siblings_list.attr,
#endif
#ifdef CONFIG_SCHED_DRAWER
&dev_attr_drawer_id.attr,
- &dev_attr_drawer_siblings.attr,
- &dev_attr_drawer_siblings_list.attr,
#endif
NULL
};

static const struct attribute_group topology_attr_group = {
.attrs = default_attrs,
+ .bin_attrs = bin_attrs,
.name = "topology"
};

--
2.25.1


Subject: RE: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation of cpumap ABI

Hi Yury,
Not sure if I have totally got your idea. But please see if the below
is closer to what you prefer.

I haven't really tested it. But this approach can somehow solve the
problem you mentioned(malloc/free and printing is done 1000times for
a 1MB buffer which is read 1K each time).

Bitmap provides some API to alloc and return print_buf:

+ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long *maskp,
+ int nmaskbits)
+{
+ const char *fmt = list ? "%*pbl\n" : "%*pb\n";
+ ssize_t size;
+
+ size = snprintf(NULL, 0, fmt, nmaskbits, maskp);
+ *buf = kmalloc_track_caller(size + 1, GFP_KERNEL);
+ scnprintf(*buf, size + 1, fmt, nmaskbits, maskp);
+
+ return size + 1;
+}
+
+static inline ssize_t
+cpumap_get_print_buf(bool list, char **buf, const struct cpumask *mask)
+{
+ return bitmap_get_print_buf(list, buf, cpumask_bits(mask),
+ nr_cpu_ids);
+}
+
+struct bitmap_print_buf
+{
+ char *buf;
+ ssize_t size;
+};
+

In bin_attribute, move to get and save the buffer while sysfs entry is
read at the first time and free it when file arrives EOF:

#define define_id_show_func(name) \
static ssize_t name##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -27,9 +53,27 @@ static ssize_t name##_read(struct file *file, struct kobject *kobj, \
loff_t off, size_t count) \
{ \
struct device *dev = kobj_to_dev(kobj); \
- \
- return cpumap_print_to_buf(false, buf, topology_##mask(dev->id), \
- off, count); \
+ struct bitmap_print_buf *bmb = dev_get_drvdata(dev); \
+ if (!bmb) { \
+ bmb = devm_kzalloc(dev, sizeof(*bmb), GFP_KERNEL); \
+ if (!bmb) \
+ return -ENOMEM; \
+ dev_set_drvdata(dev, bmb); \
+ } \
+ /* for the 1st time, getting the printed buffer */ \
+ if (!bmb->buf) \
+ bmb->size = cpumap_get_print_buf(false, &bmb->buf, \
+ topology_##mask(dev->id)); \
+ /* when we arrive EOF, free the printed buffer */ \
+ if (off >= bmb->size) { \
+ kfree(bmb->buf); bmb->buf = NULL; \
+ return 0; \
+ } \
+ /* while a large printed buffer is read many times, we reuse \
+ * the buffer we get at the 1st time \
+ */ \
+ strncpy(buf, bmb->buf + off, count); \
+ return min(count, bmb->size - off); \
} \
\
This means a huge change in drivers though I am not sure Greg is
a fan of this approach. Anyway, "1000 times" is not a real case.
Typically we will arrive EOF after one time.

Thanks
Barry

> -----Original Message-----
> From: Song Bao Hua (Barry Song)
> Sent: Thursday, July 15, 2021 11:59 PM
> To: [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; tangchengchang
> <[email protected]>; Zengtao (B) <[email protected]>;
> yangyicong <[email protected]>; [email protected]; Linuxarm
> <[email protected]>; tiantao (H) <[email protected]>; Song Bao Hua
> (Barry Song) <[email protected]>
> Subject: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation
> of cpumap ABI
>
> From: Tian Tao <[email protected]>
>
> Reading /sys/devices/system/cpu/cpuX/topology/ returns cpu topology.
> However, the size of this file is limited to PAGE_SIZE because of
> the limitation for sysfs attribute.
> This patch moves to use bin_attribute to extend the ABI to be more
> than one page so that cpumap bitmask and list won't be potentially
> trimmed.
>
> Signed-off-by: Tian Tao <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: "Rafael J. Wysocki" <[email protected]>
> Signed-off-by: Barry Song <[email protected]>
> ---
> drivers/base/topology.c | 115 ++++++++++++++++++++++------------------
> 1 file changed, 63 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/base/topology.c b/drivers/base/topology.c
> index 4d254fcc93d1..dd3980124e33 100644
> --- a/drivers/base/topology.c
> +++ b/drivers/base/topology.c
> @@ -21,25 +21,27 @@ static ssize_t name##_show(struct device *dev,
> \
> return sysfs_emit(buf, "%d\n", topology_##name(dev->id)); \
> }
>
> -#define define_siblings_show_map(name, mask) \
> -static ssize_t name##_show(struct device *dev, \
> - struct device_attribute *attr, char *buf) \
> -{ \
> - return cpumap_print_to_pagebuf(false, buf, topology_##mask(dev->id));\
> +#define define_siblings_read_func(name, mask) \
> +static ssize_t name##_read(struct file *file, struct kobject *kobj, \
> + struct bin_attribute *attr, char *buf, \
> + loff_t off, size_t count) \
> +{ \
> + struct device *dev = kobj_to_dev(kobj); \
> + \
> + return cpumap_print_to_buf(false, buf, topology_##mask(dev->id), \
> + off, count); \
> +} \
> + \
> +static ssize_t name##_list_read(struct file *file, struct kobject *kobj, \
> + struct bin_attribute *attr, char *buf, \
> + loff_t off, size_t count) \
> +{ \
> + struct device *dev = kobj_to_dev(kobj); \
> + \
> + return cpumap_print_to_buf(true, buf, topology_##mask(dev->id), \
> + off, count); \
> }
>
> -#define define_siblings_show_list(name, mask) \
> -static ssize_t name##_list_show(struct device *dev, \
> - struct device_attribute *attr, \
> - char *buf) \
> -{ \
> - return cpumap_print_to_pagebuf(true, buf, topology_##mask(dev->id));\
> -}
> -
> -#define define_siblings_show_func(name, mask) \
> - define_siblings_show_map(name, mask); \
> - define_siblings_show_list(name, mask)
> -
> define_id_show_func(physical_package_id);
> static DEVICE_ATTR_RO(physical_package_id);
>
> @@ -49,71 +51,80 @@ static DEVICE_ATTR_RO(die_id);
> define_id_show_func(core_id);
> static DEVICE_ATTR_RO(core_id);
>
> -define_siblings_show_func(thread_siblings, sibling_cpumask);
> -static DEVICE_ATTR_RO(thread_siblings);
> -static DEVICE_ATTR_RO(thread_siblings_list);
> +define_siblings_read_func(thread_siblings, sibling_cpumask);
> +static BIN_ATTR_RO(thread_siblings, 0);
> +static BIN_ATTR_RO(thread_siblings_list, 0);
>
> -define_siblings_show_func(core_cpus, sibling_cpumask);
> -static DEVICE_ATTR_RO(core_cpus);
> -static DEVICE_ATTR_RO(core_cpus_list);
> +define_siblings_read_func(core_cpus, sibling_cpumask);
> +static BIN_ATTR_RO(core_cpus, 0);
> +static BIN_ATTR_RO(core_cpus_list, 0);
>
> -define_siblings_show_func(core_siblings, core_cpumask);
> -static DEVICE_ATTR_RO(core_siblings);
> -static DEVICE_ATTR_RO(core_siblings_list);
> +define_siblings_read_func(core_siblings, core_cpumask);
> +static BIN_ATTR_RO(core_siblings, 0);
> +static BIN_ATTR_RO(core_siblings_list, 0);
>
> -define_siblings_show_func(die_cpus, die_cpumask);
> -static DEVICE_ATTR_RO(die_cpus);
> -static DEVICE_ATTR_RO(die_cpus_list);
> +define_siblings_read_func(die_cpus, die_cpumask);
> +static BIN_ATTR_RO(die_cpus, 0);
> +static BIN_ATTR_RO(die_cpus_list, 0);
>
> -define_siblings_show_func(package_cpus, core_cpumask);
> -static DEVICE_ATTR_RO(package_cpus);
> -static DEVICE_ATTR_RO(package_cpus_list);
> +define_siblings_read_func(package_cpus, core_cpumask);
> +static BIN_ATTR_RO(package_cpus, 0);
> +static BIN_ATTR_RO(package_cpus_list, 0);
>
> #ifdef CONFIG_SCHED_BOOK
> define_id_show_func(book_id);
> static DEVICE_ATTR_RO(book_id);
> -define_siblings_show_func(book_siblings, book_cpumask);
> -static DEVICE_ATTR_RO(book_siblings);
> -static DEVICE_ATTR_RO(book_siblings_list);
> +define_siblings_read_func(book_siblings, book_cpumask);
> +static BIN_ATTR_RO(book_siblings, 0);
> +static BIN_ATTR_RO(book_siblings_list, 0);
> #endif
>
> #ifdef CONFIG_SCHED_DRAWER
> define_id_show_func(drawer_id);
> static DEVICE_ATTR_RO(drawer_id);
> -define_siblings_show_func(drawer_siblings, drawer_cpumask);
> -static DEVICE_ATTR_RO(drawer_siblings);
> -static DEVICE_ATTR_RO(drawer_siblings_list);
> +define_siblings_read_func(drawer_siblings, drawer_cpumask);
> +static BIN_ATTR_RO(drawer_siblings, 0);
> +static BIN_ATTR_RO(drawer_siblings_list, 0);
> #endif
>
> +static struct bin_attribute *bin_attrs[] = {
> + &bin_attr_core_cpus,
> + &bin_attr_core_cpus_list,
> + &bin_attr_thread_siblings,
> + &bin_attr_thread_siblings_list,
> + &bin_attr_core_siblings,
> + &bin_attr_core_siblings_list,
> + &bin_attr_die_cpus,
> + &bin_attr_die_cpus_list,
> + &bin_attr_package_cpus,
> + &bin_attr_package_cpus_list,
> +#ifdef CONFIG_SCHED_BOOK
> + &bin_attr_book_siblings,
> + &bin_attr_book_siblings_list,
> +#endif
> +#ifdef CONFIG_SCHED_DRAWER
> + &bin_attr_drawer_siblings,
> + &bin_attr_drawer_siblings_list,
> +#endif
> + NULL
> +};
> +
> static struct attribute *default_attrs[] = {
> &dev_attr_physical_package_id.attr,
> &dev_attr_die_id.attr,
> &dev_attr_core_id.attr,
> - &dev_attr_thread_siblings.attr,
> - &dev_attr_thread_siblings_list.attr,
> - &dev_attr_core_cpus.attr,
> - &dev_attr_core_cpus_list.attr,
> - &dev_attr_core_siblings.attr,
> - &dev_attr_core_siblings_list.attr,
> - &dev_attr_die_cpus.attr,
> - &dev_attr_die_cpus_list.attr,
> - &dev_attr_package_cpus.attr,
> - &dev_attr_package_cpus_list.attr,
> #ifdef CONFIG_SCHED_BOOK
> &dev_attr_book_id.attr,
> - &dev_attr_book_siblings.attr,
> - &dev_attr_book_siblings_list.attr,
> #endif
> #ifdef CONFIG_SCHED_DRAWER
> &dev_attr_drawer_id.attr,
> - &dev_attr_drawer_siblings.attr,
> - &dev_attr_drawer_siblings_list.attr,
> #endif
> NULL
> };
>
> static const struct attribute_group topology_attr_group = {
> .attrs = default_attrs,
> + .bin_attrs = bin_attrs,
> .name = "topology"
> };
>
> --
> 2.25.1

2021-07-16 20:04:52

by Yury Norov

[permalink] [raw]
Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation of cpumap ABI

On Fri, Jul 16, 2021 at 08:49:58AM +0000, Song Bao Hua (Barry Song) wrote:
> Hi Yury,
> Not sure if I have totally got your idea. But please see if the below
> is closer to what you prefer.
>
> I haven't really tested it. But this approach can somehow solve the
> problem you mentioned(malloc/free and printing is done 1000times for
> a 1MB buffer which is read 1K each time).
>
> Bitmap provides some API to alloc and return print_buf:
>
> +ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long *maskp,
> + int nmaskbits)
> +{
> + const char *fmt = list ? "%*pbl\n" : "%*pb\n";
> + ssize_t size;
> +
> + size = snprintf(NULL, 0, fmt, nmaskbits, maskp);
> + *buf = kmalloc_track_caller(size + 1, GFP_KERNEL);
> + scnprintf(*buf, size + 1, fmt, nmaskbits, maskp);
> +
> + return size + 1;
> +}
> +
> +static inline ssize_t
> +cpumap_get_print_buf(bool list, char **buf, const struct cpumask *mask)
> +{
> + return bitmap_get_print_buf(list, buf, cpumask_bits(mask),
> + nr_cpu_ids);
> +}
> +
> +struct bitmap_print_buf
> +{
> + char *buf;
> + ssize_t size;
> +};
> +
>
> In bin_attribute, move to get and save the buffer while sysfs entry is
> read at the first time and free it when file arrives EOF:
>
> #define define_id_show_func(name) \
> static ssize_t name##_show(struct device *dev, \
> struct device_attribute *attr, char *buf) \
> @@ -27,9 +53,27 @@ static ssize_t name##_read(struct file *file, struct kobject *kobj, \
> loff_t off, size_t count) \
> { \
> struct device *dev = kobj_to_dev(kobj); \
> - \
> - return cpumap_print_to_buf(false, buf, topology_##mask(dev->id), \
> - off, count); \
> + struct bitmap_print_buf *bmb = dev_get_drvdata(dev); \
> + if (!bmb) { \
> + bmb = devm_kzalloc(dev, sizeof(*bmb), GFP_KERNEL); \
> + if (!bmb) \
> + return -ENOMEM; \
> + dev_set_drvdata(dev, bmb); \
> + } \
> + /* for the 1st time, getting the printed buffer */ \
> + if (!bmb->buf) \
> + bmb->size = cpumap_get_print_buf(false, &bmb->buf, \
> + topology_##mask(dev->id)); \
> + /* when we arrive EOF, free the printed buffer */ \
> + if (off >= bmb->size) { \
> + kfree(bmb->buf); bmb->buf = NULL; \
> + return 0; \
> + } \
> + /* while a large printed buffer is read many times, we reuse \
> + * the buffer we get at the 1st time \
> + */ \
> + strncpy(buf, bmb->buf + off, count); \
> + return min(count, bmb->size - off); \
> } \
> \
> This means a huge change in drivers though I am not sure Greg is
> a fan of this approach. Anyway, "1000 times" is not a real case.
> Typically we will arrive EOF after one time.

Not a real case in your driver doesn't mean not a real case at all.
You're adding your code to the very basic core layer, and so you'd
consider all possible scenarios, not only your particular one.

On the other hand, if you add your function(s) at drivers/base/node.c
and explain that O(nbits**2) 'is not a real case' in _this_ driver -
I think it will be acceptable. Maybe this is your choice...

> Thanks
> Barry

> Not sure if I have totally got your idea. But please see if the below
> is closer to what you prefer.
>
> I haven't really tested it. But this approach can somehow solve the
> problem you mentioned(malloc/free and printing is done 1000times for
> a 1MB buffer which is read 1K each time).
>
> Bitmap provides some API to alloc and return print_buf:

I'm not too familar to the topology things, and in fact never exposed
anything thru the sysfs.

From general knowledge, it's better to allocate memory for the string
on file creation to avoid situation when kernel has no memory to allocate
it when user tries to read.

So from my perspective, the most straightforward solution would be:

register(cpumask)
{
size_t max_size = cpumap_max_string_size(list, cpumask)
void *buf = kmalloc(max_size, ...);

sysfs_create_file(..., buf)
}

unregister()
{
kfree(buf);
}

show()
{
snprintf(buf, max_size, "*pbl", cpumask);
}

This would require to add bitmap_max_string_size(list, bitmap, nbits),
but it's O(1), and I think, others will find it helpful.

Again, I'm not professional with sysfs, and fully admit that it might
be wrong.

Subject: RE: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation of cpumap ABI



> -----Original Message-----
> From: Yury Norov [mailto:[email protected]]
> Sent: Saturday, July 17, 2021 8:04 AM
> To: Song Bao Hua (Barry Song) <[email protected]>
> Cc: [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; tangchengchang
> <[email protected]>; Zengtao (B) <[email protected]>;
> yangyicong <[email protected]>; [email protected]; Linuxarm
> <[email protected]>; tiantao (H) <[email protected]>
> Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size
> limitation of cpumap ABI
>
> On Fri, Jul 16, 2021 at 08:49:58AM +0000, Song Bao Hua (Barry Song) wrote:
> > Hi Yury,
> > Not sure if I have totally got your idea. But please see if the below
> > is closer to what you prefer.
> >
> > I haven't really tested it. But this approach can somehow solve the
> > problem you mentioned(malloc/free and printing is done 1000times for
> > a 1MB buffer which is read 1K each time).
> >
> > Bitmap provides some API to alloc and return print_buf:
> >
> > +ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long
> *maskp,
> > + int nmaskbits)
> > +{
> > + const char *fmt = list ? "%*pbl\n" : "%*pb\n";
> > + ssize_t size;
> > +
> > + size = snprintf(NULL, 0, fmt, nmaskbits, maskp);
> > + *buf = kmalloc_track_caller(size + 1, GFP_KERNEL);
> > + scnprintf(*buf, size + 1, fmt, nmaskbits, maskp);
> > +
> > + return size + 1;
> > +}
> > +
> > +static inline ssize_t
> > +cpumap_get_print_buf(bool list, char **buf, const struct cpumask *mask)
> > +{
> > + return bitmap_get_print_buf(list, buf, cpumask_bits(mask),
> > + nr_cpu_ids);
> > +}
> > +
> > +struct bitmap_print_buf
> > +{
> > + char *buf;
> > + ssize_t size;
> > +};
> > +
> >
> > In bin_attribute, move to get and save the buffer while sysfs entry is
> > read at the first time and free it when file arrives EOF:
> >
> > #define define_id_show_func(name) \
> > static ssize_t name##_show(struct device *dev, \
> > struct device_attribute *attr, char *buf) \
> > @@ -27,9 +53,27 @@ static ssize_t name##_read(struct file *file, struct kobject
> *kobj, \
> > loff_t off, size_t count) \
> > { \
> > struct device *dev = kobj_to_dev(kobj); \
> > - \
> > - return cpumap_print_to_buf(false, buf, topology_##mask(dev->id),
> \
> > - off, count); \
> > + struct bitmap_print_buf *bmb = dev_get_drvdata(dev); \
> > + if (!bmb) { \
> > + bmb = devm_kzalloc(dev, sizeof(*bmb), GFP_KERNEL); \
> > + if (!bmb) \
> > + return -ENOMEM; \
> > + dev_set_drvdata(dev, bmb); \
> > + } \
> > + /* for the 1st time, getting the printed buffer */ \
> > + if (!bmb->buf) \
> > + bmb->size = cpumap_get_print_buf(false, &bmb->buf, \
> > + topology_##mask(dev->id)); \
> > + /* when we arrive EOF, free the printed buffer */ \
> > + if (off >= bmb->size) { \
> > + kfree(bmb->buf); bmb->buf = NULL;
> \
> > + return 0; \
> > + } \
> > + /* while a large printed buffer is read many times, we reuse \
> > + * the buffer we get at the 1st time \
> > + */ \
> > + strncpy(buf, bmb->buf + off, count); \
> > + return min(count, bmb->size - off); \
> > } \
> > \
> > This means a huge change in drivers though I am not sure Greg is
> > a fan of this approach. Anyway, "1000 times" is not a real case.
> > Typically we will arrive EOF after one time.
>
> Not a real case in your driver doesn't mean not a real case at all.
> You're adding your code to the very basic core layer, and so you'd
> consider all possible scenarios, not only your particular one.
>

Generally yes. And generally I agree with your point. That point is
exactly what I have always adhered to in software design. But my point
I have been arguing is that the new APIs are for sysfs ABI only. So it
is not particular one, it is common.

In my understanding, only ABI things to users will need to print bitmap
to mask or list. Rarely a kernel module will need it. Kernel modules
would be running real and/or/andnot bit operations on binary bitmap
directly.

Anyway, I'm glad to take a step in the way you prefer.

> On the other hand, if you add your function(s) at drivers/base/node.c
> and explain that O(nbits**2) 'is not a real case' in _this_ driver -
> I think it will be acceptable. Maybe this is your choice...
>
> > Thanks
> > Barry
>
> > Not sure if I have totally got your idea. But please see if the below
> > is closer to what you prefer.
> >
> > I haven't really tested it. But this approach can somehow solve the
> > problem you mentioned(malloc/free and printing is done 1000times for
> > a 1MB buffer which is read 1K each time).
> >
> > Bitmap provides some API to alloc and return print_buf:
>
> I'm not too familar to the topology things, and in fact never exposed
> anything thru the sysfs.
>
> From general knowledge, it's better to allocate memory for the string
> on file creation to avoid situation when kernel has no memory to allocate
> it when user tries to read.
>
> So from my perspective, the most straightforward solution would be:
>
> register(cpumask)
> {
> size_t max_size = cpumap_max_string_size(list, cpumask)
> void *buf = kmalloc(max_size, ...);
>
> sysfs_create_file(..., buf)
> }
>
> unregister()
> {
> kfree(buf);
> }
>
> show()
> {
> snprintf(buf, max_size, "*pbl", cpumask);
> }
>

Generally good idea. However, for sysfs ABI entries, it might not be
that true.

A sysfs entry might never be read for its whole life. As I explained
before, a sysfs entry - especially for list, is randomly "cat" by users.
Many of them won't be read forever. And after they are read once, they
will probably never be read again. The operations to read ABI could be
random and rare. Performance wouldn't be a concern.

To avoid holding the memory which might never be used, it is better to
allocate and free the memory during runtime. I mean to allocate in show()
and free in show(), aka, to do it on demand.

For example, for a server with 256CPU and each cpu has dozens of sysfs ABI
entries, only a few of sysfs list entries might be randomly "cat" by users.
Holding 256*entries memory doesn't look good.

> This would require to add bitmap_max_string_size(list, bitmap, nbits),
> but it's O(1), and I think, others will find it helpful.

What about getting size and memory at the same time?

ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long
*maskp, int nmaskbits)

ssize_t cpumap_get_print_buf(bool list, char **buf, const struct cpumask *mask);

This API returns the size of printed buffer, and it also gets the
printed result saved in *buf. Then drivers don't need to do three
steps:

1. get cpumap buffer size which is your cpumap_max_string_size()
2. allocate memory for buffer according to size got in step 1
3. print bitmap(cpumap) to buffer by "pbl"

It will only need to call bitmap_get_print_buf() and all three
things are done inside bitmap_get_print_buf().

How to use the size and memory allocated in cpumap_get_print_buf
will be totally up to users.

The other benefit for this is that if we get string size during initialization,
and then we print in show() entries, the size got at the beginning might be not
enough as system topology might have changed. Sysfs ABI reflects the status of
system at this moment.

>
> Again, I'm not professional with sysfs, and fully admit that it might
> be wrong.

Never mind.

Thanks
Barry

2021-07-17 01:14:30

by Yury Norov

[permalink] [raw]
Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation of cpumap ABI

On Sat, Jul 17, 2021 at 12:16:48AM +0000, Song Bao Hua (Barry Song) wrote:
>
>
> > -----Original Message-----
> > From: Yury Norov [mailto:[email protected]]
> > Sent: Saturday, July 17, 2021 8:04 AM
> > To: Song Bao Hua (Barry Song) <[email protected]>
> > Cc: [email protected]; [email protected];
> > [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; tangchengchang
> > <[email protected]>; Zengtao (B) <[email protected]>;
> > yangyicong <[email protected]>; [email protected]; Linuxarm
> > <[email protected]>; tiantao (H) <[email protected]>
> > Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size
> > limitation of cpumap ABI
> >
> > On Fri, Jul 16, 2021 at 08:49:58AM +0000, Song Bao Hua (Barry Song) wrote:
> > > Hi Yury,
> > > Not sure if I have totally got your idea. But please see if the below
> > > is closer to what you prefer.
> > >
> > > I haven't really tested it. But this approach can somehow solve the
> > > problem you mentioned(malloc/free and printing is done 1000times for
> > > a 1MB buffer which is read 1K each time).
> > >
> > > Bitmap provides some API to alloc and return print_buf:
> > >
> > > +ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long
> > *maskp,
> > > + int nmaskbits)
> > > +{
> > > + const char *fmt = list ? "%*pbl\n" : "%*pb\n";
> > > + ssize_t size;
> > > +
> > > + size = snprintf(NULL, 0, fmt, nmaskbits, maskp);
> > > + *buf = kmalloc_track_caller(size + 1, GFP_KERNEL);
> > > + scnprintf(*buf, size + 1, fmt, nmaskbits, maskp);
> > > +
> > > + return size + 1;
> > > +}
> > > +
> > > +static inline ssize_t
> > > +cpumap_get_print_buf(bool list, char **buf, const struct cpumask *mask)
> > > +{
> > > + return bitmap_get_print_buf(list, buf, cpumask_bits(mask),
> > > + nr_cpu_ids);
> > > +}
> > > +
> > > +struct bitmap_print_buf
> > > +{
> > > + char *buf;
> > > + ssize_t size;
> > > +};
> > > +
> > >
> > > In bin_attribute, move to get and save the buffer while sysfs entry is
> > > read at the first time and free it when file arrives EOF:
> > >
> > > #define define_id_show_func(name) \
> > > static ssize_t name##_show(struct device *dev, \
> > > struct device_attribute *attr, char *buf) \
> > > @@ -27,9 +53,27 @@ static ssize_t name##_read(struct file *file, struct kobject
> > *kobj, \
> > > loff_t off, size_t count) \
> > > { \
> > > struct device *dev = kobj_to_dev(kobj); \
> > > - \
> > > - return cpumap_print_to_buf(false, buf, topology_##mask(dev->id),
> > \
> > > - off, count); \
> > > + struct bitmap_print_buf *bmb = dev_get_drvdata(dev); \
> > > + if (!bmb) { \
> > > + bmb = devm_kzalloc(dev, sizeof(*bmb), GFP_KERNEL); \
> > > + if (!bmb) \
> > > + return -ENOMEM; \
> > > + dev_set_drvdata(dev, bmb); \
> > > + } \
> > > + /* for the 1st time, getting the printed buffer */ \
> > > + if (!bmb->buf) \
> > > + bmb->size = cpumap_get_print_buf(false, &bmb->buf, \
> > > + topology_##mask(dev->id)); \
> > > + /* when we arrive EOF, free the printed buffer */ \
> > > + if (off >= bmb->size) { \
> > > + kfree(bmb->buf); bmb->buf = NULL;
> > \
> > > + return 0; \
> > > + } \
> > > + /* while a large printed buffer is read many times, we reuse \
> > > + * the buffer we get at the 1st time \
> > > + */ \
> > > + strncpy(buf, bmb->buf + off, count); \
> > > + return min(count, bmb->size - off); \
> > > } \
> > > \
> > > This means a huge change in drivers though I am not sure Greg is
> > > a fan of this approach. Anyway, "1000 times" is not a real case.
> > > Typically we will arrive EOF after one time.
> >
> > Not a real case in your driver doesn't mean not a real case at all.
> > You're adding your code to the very basic core layer, and so you'd
> > consider all possible scenarios, not only your particular one.
> >
>
> Generally yes. And generally I agree with your point. That point is
> exactly what I have always adhered to in software design. But my point
> I have been arguing is that the new APIs are for sysfs ABI only. So it
> is not particular one, it is common.
>
> In my understanding, only ABI things to users will need to print bitmap
> to mask or list. Rarely a kernel module will need it. Kernel modules
> would be running real and/or/andnot bit operations on binary bitmap
> directly.
>
> Anyway, I'm glad to take a step in the way you prefer.
>
> > On the other hand, if you add your function(s) at drivers/base/node.c
> > and explain that O(nbits**2) 'is not a real case' in _this_ driver -
> > I think it will be acceptable. Maybe this is your choice...
> >
> > > Thanks
> > > Barry
> >
> > > Not sure if I have totally got your idea. But please see if the below
> > > is closer to what you prefer.
> > >
> > > I haven't really tested it. But this approach can somehow solve the
> > > problem you mentioned(malloc/free and printing is done 1000times for
> > > a 1MB buffer which is read 1K each time).
> > >
> > > Bitmap provides some API to alloc and return print_buf:
> >
> > I'm not too familar to the topology things, and in fact never exposed
> > anything thru the sysfs.
> >
> > From general knowledge, it's better to allocate memory for the string
> > on file creation to avoid situation when kernel has no memory to allocate
> > it when user tries to read.
> >
> > So from my perspective, the most straightforward solution would be:
> >
> > register(cpumask)
> > {
> > size_t max_size = cpumap_max_string_size(list, cpumask)
> > void *buf = kmalloc(max_size, ...);
> >
> > sysfs_create_file(..., buf)
> > }
> >
> > unregister()
> > {
> > kfree(buf);
> > }
> >
> > show()
> > {
> > snprintf(buf, max_size, "*pbl", cpumask);
> > }
> >
>
> Generally good idea. However, for sysfs ABI entries, it might not be
> that true.
>
> A sysfs entry might never be read for its whole life. As I explained
> before, a sysfs entry - especially for list, is randomly "cat" by users.
> Many of them won't be read forever. And after they are read once, they
> will probably never be read again. The operations to read ABI could be
> random and rare. Performance wouldn't be a concern.
>
> To avoid holding the memory which might never be used, it is better to
> allocate and free the memory during runtime. I mean to allocate in show()
> and free in show(), aka, to do it on demand.
>
> For example, for a server with 256CPU and each cpu has dozens of sysfs ABI
> entries, only a few of sysfs list entries might be randomly "cat" by users.
> Holding 256*entries memory doesn't look good.

Ok, makes sense.

> > This would require to add bitmap_max_string_size(list, bitmap, nbits),
> > but it's O(1), and I think, others will find it helpful.
>
> What about getting size and memory at the same time?

1. We already have kasprintf()
2. It breaks coding style.

Documentation/process/coding-style.rst:
Functions should be short and sweet, and do just one thing.

From practical point of view, there should be some balance between
granularity and ease-of-use. But in this case, bitmap_list cries for
a function that will help to estimate size of output buffer. And it's
easy to imagine a case where the estimated length of bitmap is needed
explicitly:

size_t max_size = bitmap_max_string_size(nbits);
char *buf = kmalloc(PAGE_ALIGN(max_size) * nr_cpus);

Thought, I don't insist. In your driver you can do:

size_t size = snprintf(NULL, 0, ...);
void *buf = kmalloc(size);

It will be fully correct, and you already have everything you need.

> ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long
> *maskp, int nmaskbits)
>
> ssize_t cpumap_get_print_buf(bool list, char **buf, const struct cpumask *mask);
>
> This API returns the size of printed buffer, and it also gets the
> printed result saved in *buf. Then drivers don't need to do three
> steps:
>
> 1. get cpumap buffer size which is your cpumap_max_string_size()
> 2. allocate memory for buffer according to size got in step 1
> 3. print bitmap(cpumap) to buffer by "pbl"
>
> It will only need to call bitmap_get_print_buf() and all three
> things are done inside bitmap_get_print_buf().
>
> How to use the size and memory allocated in cpumap_get_print_buf
> will be totally up to users.
>
> The other benefit for this is that if we get string size during initialization,
> and then we print in show() entries, the size got at the beginning might be not
> enough as system topology might have changed. Sysfs ABI reflects the status of
> system at this moment.
>
> >
> > Again, I'm not professional with sysfs, and fully admit that it might
> > be wrong.
>
> Never mind.
>
> Thanks
> Barry

2021-07-19 09:10:51

by Andy Shevchenko

[permalink] [raw]
Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation of cpumap ABI

On Fri, Jul 16, 2021 at 06:12:21PM -0700, Yury Norov wrote:
> On Sat, Jul 17, 2021 at 12:16:48AM +0000, Song Bao Hua (Barry Song) wrote:
> > > From: Yury Norov [mailto:[email protected]]
> > > Sent: Saturday, July 17, 2021 8:04 AM
> > > On Fri, Jul 16, 2021 at 08:49:58AM +0000, Song Bao Hua (Barry Song) wrote:

...

> > Generally good idea. However, for sysfs ABI entries, it might not be
> > that true.
> >
> > A sysfs entry might never be read for its whole life. As I explained
> > before, a sysfs entry - especially for list, is randomly "cat" by users.
> > Many of them won't be read forever. And after they are read once, they
> > will probably never be read again. The operations to read ABI could be
> > random and rare. Performance wouldn't be a concern.
> >
> > To avoid holding the memory which might never be used, it is better to
> > allocate and free the memory during runtime. I mean to allocate in show()
> > and free in show(), aka, to do it on demand.
> >
> > For example, for a server with 256CPU and each cpu has dozens of sysfs ABI
> > entries, only a few of sysfs list entries might be randomly "cat" by users.
> > Holding 256*entries memory doesn't look good.
>
> Ok, makes sense.
>
> > > This would require to add bitmap_max_string_size(list, bitmap, nbits),
> > > but it's O(1), and I think, others will find it helpful.
> >
> > What about getting size and memory at the same time?
>
> 1. We already have kasprintf()
> 2. It breaks coding style.
>
> Documentation/process/coding-style.rst:
> Functions should be short and sweet, and do just one thing.
>
> From practical point of view, there should be some balance between
> granularity and ease-of-use. But in this case, bitmap_list cries for
> a function that will help to estimate size of output buffer.

According to the vsnprintf() logic the estimated size is what it returns. If
user supplies too few bytes available, the comparison with the returned value
can tell caller that space wasn't big enough.

> And it's
> easy to imagine a case where the estimated length of bitmap is needed
> explicitly:
>
> size_t max_size = bitmap_max_string_size(nbits);
> char *buf = kmalloc(PAGE_ALIGN(max_size) * nr_cpus);
>
> Thought, I don't insist. In your driver you can do:
>
> size_t size = snprintf(NULL, 0, ...);
> void *buf = kmalloc(size);
>
> It will be fully correct, and you already have everything you need.
>
> > ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long
> > *maskp, int nmaskbits)
> >
> > ssize_t cpumap_get_print_buf(bool list, char **buf, const struct cpumask *mask);
> >
> > This API returns the size of printed buffer, and it also gets the
> > printed result saved in *buf. Then drivers don't need to do three
> > steps:
> >
> > 1. get cpumap buffer size which is your cpumap_max_string_size()
> > 2. allocate memory for buffer according to size got in step 1
> > 3. print bitmap(cpumap) to buffer by "pbl"
> >
> > It will only need to call bitmap_get_print_buf() and all three
> > things are done inside bitmap_get_print_buf().
> >
> > How to use the size and memory allocated in cpumap_get_print_buf
> > will be totally up to users.
> >
> > The other benefit for this is that if we get string size during initialization,
> > and then we print in show() entries, the size got at the beginning might be not
> > enough as system topology might have changed. Sysfs ABI reflects the status of
> > system at this moment.

--
With Best Regards,
Andy Shevchenko


Subject: RE: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation of cpumap ABI



> -----Original Message-----
> From: [email protected]
> [mailto:[email protected]]
> Sent: Monday, July 19, 2021 9:07 PM
> To: Yury Norov <[email protected]>
> Cc: Song Bao Hua (Barry Song) <[email protected]>;
> [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; tangchengchang <[email protected]>; Zengtao (B)
> <[email protected]>; yangyicong <[email protected]>;
> [email protected]; Linuxarm <[email protected]>; tiantao (H)
> <[email protected]>
> Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size
> limitation of cpumap ABI
>
> On Fri, Jul 16, 2021 at 06:12:21PM -0700, Yury Norov wrote:
> > On Sat, Jul 17, 2021 at 12:16:48AM +0000, Song Bao Hua (Barry Song) wrote:
> > > > From: Yury Norov [mailto:[email protected]]
> > > > Sent: Saturday, July 17, 2021 8:04 AM
> > > > On Fri, Jul 16, 2021 at 08:49:58AM +0000, Song Bao Hua (Barry Song) wrote:
>
> ...
>
> > > Generally good idea. However, for sysfs ABI entries, it might not be
> > > that true.
> > >
> > > A sysfs entry might never be read for its whole life. As I explained
> > > before, a sysfs entry - especially for list, is randomly "cat" by users.
> > > Many of them won't be read forever. And after they are read once, they
> > > will probably never be read again. The operations to read ABI could be
> > > random and rare. Performance wouldn't be a concern.
> > >
> > > To avoid holding the memory which might never be used, it is better to
> > > allocate and free the memory during runtime. I mean to allocate in show()
> > > and free in show(), aka, to do it on demand.
> > >
> > > For example, for a server with 256CPU and each cpu has dozens of sysfs ABI
> > > entries, only a few of sysfs list entries might be randomly "cat" by users.
> > > Holding 256*entries memory doesn't look good.
> >
> > Ok, makes sense.
> >
> > > > This would require to add bitmap_max_string_size(list, bitmap, nbits),
> > > > but it's O(1), and I think, others will find it helpful.
> > >
> > > What about getting size and memory at the same time?
> >
> > 1. We already have kasprintf()
> > 2. It breaks coding style.
> >
> > Documentation/process/coding-style.rst:
> > Functions should be short and sweet, and do just one thing.
> >
> > From practical point of view, there should be some balance between
> > granularity and ease-of-use. But in this case, bitmap_list cries for
> > a function that will help to estimate size of output buffer.
>
> According to the vsnprintf() logic the estimated size is what it returns. If
> user supplies too few bytes available, the comparison with the returned value
> can tell caller that space wasn't big enough.

As far as my understanding, for estimated size in bitmap_max_string_size()
Yury might mean something as below?

* For bitmask:
Each 32bit needs 9 bytes "11111111,", so the max size of mask would be:
9*nr_cpus / 32 ?

* For list:
Maximally cpu0-9 need 2bytes, like "1,"
Maximally cpu10-99 need 3bytes, like "50,"
Maximally cpu100-999 need 4bytes, like "101,"
Maximally cpu1000-9999 need 5 bytes..

So once we know the size of the bitmap, we can figure out the maximum
size of its string for mask and list?

Pls correct me if you don't mean this, yury.


>
> > And it's
> > easy to imagine a case where the estimated length of bitmap is needed
> > explicitly:
> >
> > size_t max_size = bitmap_max_string_size(nbits);
> > char *buf = kmalloc(PAGE_ALIGN(max_size) * nr_cpus);
> >
> > Thought, I don't insist. In your driver you can do:
> >
> > size_t size = snprintf(NULL, 0, ...);
> > void *buf = kmalloc(size);
> >
> > It will be fully correct, and you already have everything you need.
> >
> > > ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long
> > > *maskp, int nmaskbits)
> > >
> > > ssize_t cpumap_get_print_buf(bool list, char **buf, const struct cpumask
> *mask);
> > >
> > > This API returns the size of printed buffer, and it also gets the
> > > printed result saved in *buf. Then drivers don't need to do three
> > > steps:
> > >
> > > 1. get cpumap buffer size which is your cpumap_max_string_size()
> > > 2. allocate memory for buffer according to size got in step 1
> > > 3. print bitmap(cpumap) to buffer by "pbl"
> > >
> > > It will only need to call bitmap_get_print_buf() and all three
> > > things are done inside bitmap_get_print_buf().
> > >
> > > How to use the size and memory allocated in cpumap_get_print_buf
> > > will be totally up to users.
> > >
> > > The other benefit for this is that if we get string size during initialization,
> > > and then we print in show() entries, the size got at the beginning might
> be not
> > > enough as system topology might have changed. Sysfs ABI reflects the status
> of
> > > system at this moment.
>
> --
> With Best Regards,
> Andy Shevchenko
>

Thanks
Barry

2021-07-19 18:29:32

by Yury Norov

[permalink] [raw]
Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation of cpumap ABI

On Mon, Jul 19, 2021 at 11:10:45AM +0000, Song Bao Hua (Barry Song) wrote:
>
>
> > -----Original Message-----
> > From: [email protected]
> > [mailto:[email protected]]
> > Sent: Monday, July 19, 2021 9:07 PM
> > To: Yury Norov <[email protected]>
> > Cc: Song Bao Hua (Barry Song) <[email protected]>;
> > [email protected]; [email protected];
> > [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; tangchengchang <[email protected]>; Zengtao (B)
> > <[email protected]>; yangyicong <[email protected]>;
> > [email protected]; Linuxarm <[email protected]>; tiantao (H)
> > <[email protected]>
> > Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size
> > limitation of cpumap ABI
> >
> > On Fri, Jul 16, 2021 at 06:12:21PM -0700, Yury Norov wrote:
> > > On Sat, Jul 17, 2021 at 12:16:48AM +0000, Song Bao Hua (Barry Song) wrote:
> > > > > From: Yury Norov [mailto:[email protected]]
> > > > > Sent: Saturday, July 17, 2021 8:04 AM
> > > > > On Fri, Jul 16, 2021 at 08:49:58AM +0000, Song Bao Hua (Barry Song) wrote:
> >
> > ...
> >
> > > > Generally good idea. However, for sysfs ABI entries, it might not be
> > > > that true.
> > > >
> > > > A sysfs entry might never be read for its whole life. As I explained
> > > > before, a sysfs entry - especially for list, is randomly "cat" by users.
> > > > Many of them won't be read forever. And after they are read once, they
> > > > will probably never be read again. The operations to read ABI could be
> > > > random and rare. Performance wouldn't be a concern.
> > > >
> > > > To avoid holding the memory which might never be used, it is better to
> > > > allocate and free the memory during runtime. I mean to allocate in show()
> > > > and free in show(), aka, to do it on demand.
> > > >
> > > > For example, for a server with 256CPU and each cpu has dozens of sysfs ABI
> > > > entries, only a few of sysfs list entries might be randomly "cat" by users.
> > > > Holding 256*entries memory doesn't look good.
> > >
> > > Ok, makes sense.
> > >
> > > > > This would require to add bitmap_max_string_size(list, bitmap, nbits),
> > > > > but it's O(1), and I think, others will find it helpful.
> > > >
> > > > What about getting size and memory at the same time?
> > >
> > > 1. We already have kasprintf()
> > > 2. It breaks coding style.
> > >
> > > Documentation/process/coding-style.rst:
> > > Functions should be short and sweet, and do just one thing.
> > >
> > > From practical point of view, there should be some balance between
> > > granularity and ease-of-use. But in this case, bitmap_list cries for
> > > a function that will help to estimate size of output buffer.
> >
> > According to the vsnprintf() logic the estimated size is what it returns. If
> > user supplies too few bytes available, the comparison with the returned value
> > can tell caller that space wasn't big enough.

snprintf(NULL, 0, "pbl", ...) also works, but it's O(nbits), and user is not
guaranteed that allocated memory would be always sufficient. I mean max possible
length for given nbits, not a length of a specific string.

In case of lists, the length may grow. Consider:
0-8 -> 0-3,5-8 -> 0,2,4,6,8

If we want to allocate a storage for strings that may change, it would be
helpful to allocate memory for the lengthiest string in advance.

So, bitmap_max_string_len() may be a convenient O(1) alternative for
those who interested in printing the same bitmap in the same buffer.

> As far as my understanding, for estimated size in bitmap_max_string_size()
> Yury might mean something as below?
>
> * For bitmask:
> Each 32bit needs 9 bytes "11111111,", so the max size of mask would be:
> 9*nr_cpus / 32 ?

11111 -> "f1", but your formula gives 1.
I think it should be like this (not tested):
DIV_ROUND_UP(nbits, 4) + nbits < 32 ? 0 : nbits / 32 - 1

> * For list:
> Maximally cpu0-9 need 2bytes, like "1,"
> Maximally cpu10-99 need 3bytes, like "50,"
> Maximally cpu100-999 need 4bytes, like "101,"
> Maximally cpu1000-9999 need 5 bytes..
>
> So once we know the size of the bitmap, we can figure out the maximum
> size of its string for mask and list?
>
> Pls correct me if you don't mean this, yury.

Assuming that longest possible strings are of the form 0,2,4,6,... I
think it's correct except for the last comma, so substract 1.

If we decide to go on with this bitmap_max_strlen(), the list part
should be tested extensively.

> > > And it's
> > > easy to imagine a case where the estimated length of bitmap is needed
> > > explicitly:
> > >
> > > size_t max_size = bitmap_max_string_size(nbits);
> > > char *buf = kmalloc(PAGE_ALIGN(max_size) * nr_cpus);
> > >
> > > Thought, I don't insist. In your driver you can do:
> > >
> > > size_t size = snprintf(NULL, 0, ...);
> > > void *buf = kmalloc(size);
> > >
> > > It will be fully correct, and you already have everything you need.
> > >
> > > > ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long
> > > > *maskp, int nmaskbits)
> > > >
> > > > ssize_t cpumap_get_print_buf(bool list, char **buf, const struct cpumask
> > *mask);
> > > >
> > > > This API returns the size of printed buffer, and it also gets the
> > > > printed result saved in *buf. Then drivers don't need to do three
> > > > steps:
> > > >
> > > > 1. get cpumap buffer size which is your cpumap_max_string_size()
> > > > 2. allocate memory for buffer according to size got in step 1
> > > > 3. print bitmap(cpumap) to buffer by "pbl"
> > > >
> > > > It will only need to call bitmap_get_print_buf() and all three
> > > > things are done inside bitmap_get_print_buf().
> > > >
> > > > How to use the size and memory allocated in cpumap_get_print_buf
> > > > will be totally up to users.
> > > >
> > > > The other benefit for this is that if we get string size during initialization,
> > > > and then we print in show() entries, the size got at the beginning might
> > be not
> > > > enough as system topology might have changed. Sysfs ABI reflects the status
> > of
> > > > system at this moment.
> >
> > --
> > With Best Regards,
> > Andy Shevchenko
> >
>
> Thanks
> Barry

Subject: RE: [PATCH v7 2/4] topology: use bin_attribute to break the size limitation of cpumap ABI



> -----Original Message-----
> From: Yury Norov [mailto:[email protected]]
> Sent: Tuesday, July 20, 2021 5:10 AM
> To: Song Bao Hua (Barry Song) <[email protected]>
> Cc: [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; tangchengchang
> <[email protected]>; Zengtao (B) <[email protected]>;
> yangyicong <[email protected]>; [email protected]; Linuxarm
> <[email protected]>; tiantao (H) <[email protected]>
> Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size
> limitation of cpumap ABI
>
> On Mon, Jul 19, 2021 at 11:10:45AM +0000, Song Bao Hua (Barry Song) wrote:
> >
> >
> > > -----Original Message-----
> > > From: [email protected]
> > > [mailto:[email protected]]
> > > Sent: Monday, July 19, 2021 9:07 PM
> > > To: Yury Norov <[email protected]>
> > > Cc: Song Bao Hua (Barry Song) <[email protected]>;
> > > [email protected]; [email protected];
> > > [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]; tangchengchang <[email protected]>;
> Zengtao (B)
> > > <[email protected]>; yangyicong <[email protected]>;
> > > [email protected]; Linuxarm <[email protected]>; tiantao (H)
> > > <[email protected]>
> > > Subject: Re: [PATCH v7 2/4] topology: use bin_attribute to break the size
> > > limitation of cpumap ABI
> > >
> > > On Fri, Jul 16, 2021 at 06:12:21PM -0700, Yury Norov wrote:
> > > > On Sat, Jul 17, 2021 at 12:16:48AM +0000, Song Bao Hua (Barry Song) wrote:
> > > > > > From: Yury Norov [mailto:[email protected]]
> > > > > > Sent: Saturday, July 17, 2021 8:04 AM
> > > > > > On Fri, Jul 16, 2021 at 08:49:58AM +0000, Song Bao Hua (Barry Song)
> wrote:
> > >
> > > ...
> > >
> > > > > Generally good idea. However, for sysfs ABI entries, it might not be
> > > > > that true.
> > > > >
> > > > > A sysfs entry might never be read for its whole life. As I explained
> > > > > before, a sysfs entry - especially for list, is randomly "cat" by users.
> > > > > Many of them won't be read forever. And after they are read once, they
> > > > > will probably never be read again. The operations to read ABI could
> be
> > > > > random and rare. Performance wouldn't be a concern.
> > > > >
> > > > > To avoid holding the memory which might never be used, it is better
> to
> > > > > allocate and free the memory during runtime. I mean to allocate in show()
> > > > > and free in show(), aka, to do it on demand.
> > > > >
> > > > > For example, for a server with 256CPU and each cpu has dozens of sysfs
> ABI
> > > > > entries, only a few of sysfs list entries might be randomly "cat" by
> users.
> > > > > Holding 256*entries memory doesn't look good.
> > > >
> > > > Ok, makes sense.
> > > >
> > > > > > This would require to add bitmap_max_string_size(list, bitmap, nbits),
> > > > > > but it's O(1), and I think, others will find it helpful.
> > > > >
> > > > > What about getting size and memory at the same time?
> > > >
> > > > 1. We already have kasprintf()
> > > > 2. It breaks coding style.
> > > >
> > > > Documentation/process/coding-style.rst:
> > > > Functions should be short and sweet, and do just one thing.
> > > >
> > > > From practical point of view, there should be some balance between
> > > > granularity and ease-of-use. But in this case, bitmap_list cries for
> > > > a function that will help to estimate size of output buffer.
> > >
> > > According to the vsnprintf() logic the estimated size is what it returns.
> If
> > > user supplies too few bytes available, the comparison with the returned
> value
> > > can tell caller that space wasn't big enough.
>
> snprintf(NULL, 0, "pbl", ...) also works, but it's O(nbits), and user is not
> guaranteed that allocated memory would be always sufficient. I mean max possible
> length for given nbits, not a length of a specific string.
>
> In case of lists, the length may grow. Consider:
> 0-8 -> 0-3,5-8 -> 0,2,4,6,8
>
> If we want to allocate a storage for strings that may change, it would be
> helpful to allocate memory for the lengthiest string in advance.
>
> So, bitmap_max_string_len() may be a convenient O(1) alternative for
> those who interested in printing the same bitmap in the same buffer.
>
> > As far as my understanding, for estimated size in bitmap_max_string_size()
> > Yury might mean something as below?
> >
> > * For bitmask:
> > Each 32bit needs 9 bytes "11111111,", so the max size of mask would be:
> > 9*nr_cpus / 32 ?
>
> 11111 -> "f1", but your formula gives 1.
> I think it should be like this (not tested):
> DIV_ROUND_UP(nbits, 4) + nbits < 32 ? 0 : nbits / 32 - 1

I am not quite sure this is correct, in case nbits=64,
we will need:
ffffffff,ffffffff 9+8=17

your formula is ignoring the "," for each 32bits?

>
> > * For list:
> > Maximally cpu0-9 need 2bytes, like "1,"
> > Maximally cpu10-99 need 3bytes, like "50,"
> > Maximally cpu100-999 need 4bytes, like "101,"
> > Maximally cpu1000-9999 need 5 bytes..
> >
> > So once we know the size of the bitmap, we can figure out the maximum
> > size of its string for mask and list?
> >
> > Pls correct me if you don't mean this, yury.
>
> Assuming that longest possible strings are of the form 0,2,4,6,... I
> think it's correct except for the last comma, so substract 1.

I don't think 0,2,4,6,8 takes maximum length,
0-1,3-4,6-7 are longer.

It seems maximum bitmap pattern is(binary):
110110110110...

In this case, each 3bits takes 4 bytes.

For cpu 10-99, it would be: 10-11,13-14,16-17,19-20,

Each 3bits takes "6 bytes".

For cpu100-999, it would be: 100-101, 103-104, 106-107,....
Each 3bits takes "8 bytes"?

Anyway, It seems be quite tricky :-)

>
> If we decide to go on with this bitmap_max_strlen(), the list part
> should be tested extensively.
>

Btw, sysfs core code needs changes to support pre-allocated
memory in customized size and seq read on it. This needs a lot
of efforts I am looking at.

> > > > And it's
> > > > easy to imagine a case where the estimated length of bitmap is needed
> > > > explicitly:
> > > >
> > > > size_t max_size = bitmap_max_string_size(nbits);
> > > > char *buf = kmalloc(PAGE_ALIGN(max_size) * nr_cpus);
> > > >
> > > > Thought, I don't insist. In your driver you can do:
> > > >
> > > > size_t size = snprintf(NULL, 0, ...);
> > > > void *buf = kmalloc(size);
> > > >
> > > > It will be fully correct, and you already have everything you need.
> > > >
> > > > > ssize_t bitmap_get_print_buf(bool list, char **buf, const unsigned long
> > > > > *maskp, int nmaskbits)
> > > > >
> > > > > ssize_t cpumap_get_print_buf(bool list, char **buf, const struct cpumask
> > > *mask);
> > > > >
> > > > > This API returns the size of printed buffer, and it also gets the
> > > > > printed result saved in *buf. Then drivers don't need to do three
> > > > > steps:
> > > > >
> > > > > 1. get cpumap buffer size which is your cpumap_max_string_size()
> > > > > 2. allocate memory for buffer according to size got in step 1
> > > > > 3. print bitmap(cpumap) to buffer by "pbl"
> > > > >
> > > > > It will only need to call bitmap_get_print_buf() and all three
> > > > > things are done inside bitmap_get_print_buf().
> > > > >
> > > > > How to use the size and memory allocated in cpumap_get_print_buf
> > > > > will be totally up to users.
> > > > >
> > > > > The other benefit for this is that if we get string size during
> initialization,
> > > > > and then we print in show() entries, the size got at the beginning might
> > > be not
> > > > > enough as system topology might have changed. Sysfs ABI reflects the
> status
> > > of
> > > > > system at this moment.
> > >
> > > --
> > > With Best Regards,
> > > Andy Shevchenko
> > >
> >


Thanks
Barry