/proc/meminfo or show_free_areas does not show full system wide memory
usage status because memory stats do not track all memory allocations.
There seems to be huge hidden memory especially on embedded system. It
is because some HW IPs in the system use common DRAM memory instead of
internal memory. Device drivers directly request huge pages from the
page allocator with alloc_pages.
In Android system, most of those hidden memory seems to be vmalloc
pages, ion system heap memory, graphics memory, and memory for DRAM
based compressed swap storage. They may be shown in other node but it
seems to be useful if /proc/meminfo_extra shows all those extra memory
information. And show_mem also need to print the info in oom situation.
Fortunately vmalloc pages is already shown by commit 97105f0ab7b8
("mm: vmalloc: show number of vmalloc pages in /proc/meminfo"). Swap
memory using zsmalloc can be seen through vmstat by commit 91537fee0013
("mm: add NR_ZSMALLOC to vmstat") but not on /proc/meminfo.
Memory usage of specific driver can be various so that showing the usage
through upstream meminfo.c is not easy. To print the extra memory usage
of a driver, introduce following APIs. Each driver needs to count as
atomic_long_t.
int register_meminfo_extra(atomic_long_t *val, int shift,
const char *name);
int unregister_meminfo_extra(atomic_long_t *val);
Currently register ION system heap allocator and zsmalloc pages.
Additionally tested on local graphics driver.
i.e) cat /proc/meminfo_extra | tail -3
IonSystemHeap: 242620 kB
ZsPages: 203860 kB
GraphicDriver: 196576 kB
i.e.) show_mem on oom
<6>[ 420.856428] Mem-Info:
<6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB
<6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0
---
v2: move to /proc/meminfo_extra, and use rcu
v1: print info at /proc/meminfo
On v1 patch, there was not resolved discussion about the logic. There
seems to be agreement on showing memory usage, but there was a lack of
consensus on way of showing the information. Other opinion is using
/sys/ as separate file for each driver.
---
Jaewon Kim (3):
meminfo_extra: introduce meminfo extra
mm: zsmalloc: include zs page size in meminfo extra
android: ion: include system heap size in meminfo extra
drivers/staging/android/ion/ion.c | 2 +
drivers/staging/android/ion/ion.h | 1 +
drivers/staging/android/ion/ion_system_heap.c | 2 +
fs/proc/Makefile | 1 +
fs/proc/meminfo_extra.c | 123 ++++++++++++++++++++++++++
include/linux/mm.h | 4 +
mm/page_alloc.c | 1 +
mm/zsmalloc.c | 2 +
8 files changed, 136 insertions(+)
create mode 100644 fs/proc/meminfo_extra.c
--
2.13.7
Provide APIs to drivers so that they can show its memory usage on
/proc/meminfo_extra.
int register_meminfo_extra(atomic_long_t *val, int shift,
const char *name);
int unregister_meminfo_extra(atomic_long_t *val);
Signed-off-by: Jaewon Kim <[email protected]>
---
v2: move to /proc/meminfo_extra as a new file, meminfo_extra.c
use rcu to reduce lock overhead
v1: print info at /proc/meminfo
---
fs/proc/Makefile | 1 +
fs/proc/meminfo_extra.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/mm.h | 4 ++
mm/page_alloc.c | 1 +
4 files changed, 129 insertions(+)
create mode 100644 fs/proc/meminfo_extra.c
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bd08616ed8ba..83d2f55591c6 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -19,6 +19,7 @@ proc-y += devices.o
proc-y += interrupts.o
proc-y += loadavg.o
proc-y += meminfo.o
+proc-y += meminfo_extra.o
proc-y += stat.o
proc-y += uptime.o
proc-y += util.o
diff --git a/fs/proc/meminfo_extra.c b/fs/proc/meminfo_extra.c
new file mode 100644
index 000000000000..bd3f0d2b7fb7
--- /dev/null
+++ b/fs/proc/meminfo_extra.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
+{
+ seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
+ seq_write(m, " kB\n", 4);
+}
+
+static LIST_HEAD(meminfo_head);
+static DEFINE_SPINLOCK(meminfo_lock);
+
+#define NAME_SIZE 15
+#define NAME_BUF_SIZE (NAME_SIZE + 2) /* ':' and '\0' */
+
+struct meminfo_extra {
+ struct list_head list;
+ atomic_long_t *val;
+ int shift_for_page;
+ char name[NAME_BUF_SIZE];
+ char name_pad[NAME_BUF_SIZE];
+};
+
+int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
+{
+ struct meminfo_extra *meminfo, *memtemp;
+ int len;
+ int error = 0;
+
+ meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
+ if (!meminfo) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ meminfo->val = val;
+ meminfo->shift_for_page = shift;
+ strncpy(meminfo->name, name, NAME_SIZE);
+ len = strlen(meminfo->name);
+ meminfo->name[len] = ':';
+ strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
+ while (++len < NAME_BUF_SIZE - 1)
+ meminfo->name_pad[len] = ' ';
+
+ spin_lock(&meminfo_lock);
+ list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
+ if (memtemp->val == val) {
+ error = -EINVAL;
+ break;
+ }
+ }
+ if (!error)
+ list_add_tail_rcu(&meminfo->list, &meminfo_head);
+ spin_unlock(&meminfo_lock);
+ if (error)
+ kfree(meminfo);
+out:
+
+ return error;
+}
+EXPORT_SYMBOL(register_meminfo_extra);
+
+int unregister_meminfo_extra(atomic_long_t *val)
+{
+ struct meminfo_extra *memtemp;
+ int error = -EINVAL;
+
+ spin_lock(&meminfo_lock);
+ list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
+ if (memtemp->val == val) {
+ list_del_rcu(&memtemp->list);
+ error = 0;
+ break;
+ }
+ }
+ spin_unlock(&meminfo_lock);
+ if (!error) {
+ synchronize_rcu();
+ kfree(memtemp);
+ }
+
+ return error;
+}
+EXPORT_SYMBOL(unregister_meminfo_extra);
+
+static void __meminfo_extra(struct seq_file *m)
+{
+ struct meminfo_extra *memtemp;
+ unsigned long nr_page;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
+ nr_page = (unsigned long)atomic_long_read(memtemp->val);
+ nr_page = nr_page >> memtemp->shift_for_page;
+ if (m)
+ show_val_kb(m, memtemp->name_pad, nr_page);
+ else
+ pr_cont("%s%lukB ", memtemp->name, nr_page);
+ }
+ rcu_read_unlock();
+}
+
+void show_meminfo_extra(void)
+{
+ __meminfo_extra(NULL);
+}
+
+static int meminfo_extra_proc_show(struct seq_file *m, void *v)
+{
+ __meminfo_extra(m);
+
+ return 0;
+}
+
+static int __init proc_meminfo_extra_init(void)
+{
+ proc_create_single("meminfo_extra", 0, NULL, meminfo_extra_proc_show);
+ return 0;
+}
+fs_initcall(proc_meminfo_extra_init);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 52269e56c514..55317161ab57 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2898,6 +2898,10 @@ void __init setup_nr_node_ids(void);
static inline void setup_nr_node_ids(void) {}
#endif
+void show_meminfo_extra(void);
+int register_meminfo_extra(atomic_long_t *val, int shift, const char *name);
+int unregister_meminfo_extra(atomic_long_t *val);
+
extern int memcmp_pages(struct page *page1, struct page *page2);
static inline int pages_identical(struct page *page1, struct page *page2)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c4eb750a199..db1be9a39783 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5229,6 +5229,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
struct zone *zone;
pg_data_t *pgdat;
+ show_meminfo_extra();
for_each_populated_zone(zone) {
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
--
2.13.7
On Mon, Mar 23, 2020 at 05:05:01PM +0900, Jaewon Kim wrote:
> Provide APIs to drivers so that they can show its memory usage on
> /proc/meminfo_extra.
>
> int register_meminfo_extra(atomic_long_t *val, int shift,
> const char *name);
> int unregister_meminfo_extra(atomic_long_t *val);
Nit, isn't it nicer to have the subsystem name first:
meminfo_extra_register()
meminfo_extra_unregister()
?
>
> Signed-off-by: Jaewon Kim <[email protected]>
> ---
> v2: move to /proc/meminfo_extra as a new file, meminfo_extra.c
> use rcu to reduce lock overhead
> v1: print info at /proc/meminfo
> ---
> fs/proc/Makefile | 1 +
> fs/proc/meminfo_extra.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/mm.h | 4 ++
> mm/page_alloc.c | 1 +
> 4 files changed, 129 insertions(+)
> create mode 100644 fs/proc/meminfo_extra.c
>
> diff --git a/fs/proc/Makefile b/fs/proc/Makefile
> index bd08616ed8ba..83d2f55591c6 100644
> --- a/fs/proc/Makefile
> +++ b/fs/proc/Makefile
> @@ -19,6 +19,7 @@ proc-y += devices.o
> proc-y += interrupts.o
> proc-y += loadavg.o
> proc-y += meminfo.o
> +proc-y += meminfo_extra.o
> proc-y += stat.o
> proc-y += uptime.o
> proc-y += util.o
> diff --git a/fs/proc/meminfo_extra.c b/fs/proc/meminfo_extra.c
> new file mode 100644
> index 000000000000..bd3f0d2b7fb7
> --- /dev/null
> +++ b/fs/proc/meminfo_extra.c
> @@ -0,0 +1,123 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/mm.h>
> +#include <linux/proc_fs.h>
> +#include <linux/seq_file.h>
> +#include <linux/slab.h>
> +
> +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
> +{
> + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
> + seq_write(m, " kB\n", 4);
> +}
> +
> +static LIST_HEAD(meminfo_head);
> +static DEFINE_SPINLOCK(meminfo_lock);
> +
> +#define NAME_SIZE 15
> +#define NAME_BUF_SIZE (NAME_SIZE + 2) /* ':' and '\0' */
> +
> +struct meminfo_extra {
> + struct list_head list;
> + atomic_long_t *val;
> + int shift_for_page;
> + char name[NAME_BUF_SIZE];
> + char name_pad[NAME_BUF_SIZE];
> +};
> +
> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
> +{
> + struct meminfo_extra *meminfo, *memtemp;
> + int len;
> + int error = 0;
> +
> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
> + if (!meminfo) {
> + error = -ENOMEM;
> + goto out;
> + }
> +
> + meminfo->val = val;
> + meminfo->shift_for_page = shift;
> + strncpy(meminfo->name, name, NAME_SIZE);
> + len = strlen(meminfo->name);
> + meminfo->name[len] = ':';
> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
> + while (++len < NAME_BUF_SIZE - 1)
> + meminfo->name_pad[len] = ' ';
> +
> + spin_lock(&meminfo_lock);
> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> + if (memtemp->val == val) {
> + error = -EINVAL;
> + break;
> + }
> + }
> + if (!error)
> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
> + spin_unlock(&meminfo_lock);
If you have a lock, why are you needing rcu?
> + if (error)
> + kfree(meminfo);
> +out:
> +
> + return error;
> +}
> +EXPORT_SYMBOL(register_meminfo_extra);
EXPORT_SYMBOL_GPL()? I have to ask :)
thanks,
greg k-h
Hi Jaewon,
On 03/23/20 at 05:05pm, Jaewon Kim wrote:
> Provide APIs to drivers so that they can show its memory usage on
> /proc/meminfo_extra.
>
> int register_meminfo_extra(atomic_long_t *val, int shift,
> const char *name);
> int unregister_meminfo_extra(atomic_long_t *val);
>
> Signed-off-by: Jaewon Kim <[email protected]>
> ---
> v2: move to /proc/meminfo_extra as a new file, meminfo_extra.c
> use rcu to reduce lock overhead
> v1: print info at /proc/meminfo
> ---
> fs/proc/Makefile | 1 +
> fs/proc/meminfo_extra.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/mm.h | 4 ++
> mm/page_alloc.c | 1 +
> 4 files changed, 129 insertions(+)
> create mode 100644 fs/proc/meminfo_extra.c
>
> diff --git a/fs/proc/Makefile b/fs/proc/Makefile
> index bd08616ed8ba..83d2f55591c6 100644
> --- a/fs/proc/Makefile
> +++ b/fs/proc/Makefile
> @@ -19,6 +19,7 @@ proc-y += devices.o
> proc-y += interrupts.o
> proc-y += loadavg.o
> proc-y += meminfo.o
> +proc-y += meminfo_extra.o
> proc-y += stat.o
> proc-y += uptime.o
> proc-y += util.o
> diff --git a/fs/proc/meminfo_extra.c b/fs/proc/meminfo_extra.c
> new file mode 100644
> index 000000000000..bd3f0d2b7fb7
> --- /dev/null
> +++ b/fs/proc/meminfo_extra.c
> @@ -0,0 +1,123 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/mm.h>
> +#include <linux/proc_fs.h>
> +#include <linux/seq_file.h>
> +#include <linux/slab.h>
> +
> +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
> +{
> + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
> + seq_write(m, " kB\n", 4);
> +}
> +
> +static LIST_HEAD(meminfo_head);
> +static DEFINE_SPINLOCK(meminfo_lock);
> +
> +#define NAME_SIZE 15
> +#define NAME_BUF_SIZE (NAME_SIZE + 2) /* ':' and '\0' */
> +
> +struct meminfo_extra {
> + struct list_head list;
> + atomic_long_t *val;
> + int shift_for_page;
Can this be simplified to use a bytes value without an extra shift?
> + char name[NAME_BUF_SIZE];
> + char name_pad[NAME_BUF_SIZE];
> +};
> +
There should be document about below function here.
> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
> +{
> + struct meminfo_extra *meminfo, *memtemp;
> + int len;
> + int error = 0;
> +
> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
> + if (!meminfo) {
> + error = -ENOMEM;
> + goto out;
> + }
> +
> + meminfo->val = val;
> + meminfo->shift_for_page = shift;
> + strncpy(meminfo->name, name, NAME_SIZE);
> + len = strlen(meminfo->name);
> + meminfo->name[len] = ':';
> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
> + while (++len < NAME_BUF_SIZE - 1)
> + meminfo->name_pad[len] = ' ';
> +
> + spin_lock(&meminfo_lock);
> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> + if (memtemp->val == val) {
> + error = -EINVAL;
> + break;
> + }
> + }
> + if (!error)
> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
> + spin_unlock(&meminfo_lock);
> + if (error)
> + kfree(meminfo);
> +out:
> +
> + return error;
> +}
> +EXPORT_SYMBOL(register_meminfo_extra);
> +
> +int unregister_meminfo_extra(atomic_long_t *val)
> +{
> + struct meminfo_extra *memtemp;
> + int error = -EINVAL;
> +
> + spin_lock(&meminfo_lock);
> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> + if (memtemp->val == val) {
> + list_del_rcu(&memtemp->list);
> + error = 0;
> + break;
> + }
> + }
> + spin_unlock(&meminfo_lock);
> + if (!error) {
> + synchronize_rcu();
> + kfree(memtemp);
> + }
> +
> + return error;
> +}
> +EXPORT_SYMBOL(unregister_meminfo_extra);
> +
> +static void __meminfo_extra(struct seq_file *m)
> +{
> + struct meminfo_extra *memtemp;
> + unsigned long nr_page;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> + nr_page = (unsigned long)atomic_long_read(memtemp->val);
> + nr_page = nr_page >> memtemp->shift_for_page;
> + if (m)
> + show_val_kb(m, memtemp->name_pad, nr_page);
> + else
> + pr_cont("%s%lukB ", memtemp->name, nr_page);
nr_page != nr_kb?
> + }
> + rcu_read_unlock();
> +}
> +
> +void show_meminfo_extra(void)
> +{
> + __meminfo_extra(NULL);
> +}
> +
> +static int meminfo_extra_proc_show(struct seq_file *m, void *v)
> +{
> + __meminfo_extra(m);
> +
> + return 0;
> +}
> +
> +static int __init proc_meminfo_extra_init(void)
> +{
> + proc_create_single("meminfo_extra", 0, NULL, meminfo_extra_proc_show);
> + return 0;
> +}
> +fs_initcall(proc_meminfo_extra_init);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 52269e56c514..55317161ab57 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2898,6 +2898,10 @@ void __init setup_nr_node_ids(void);
> static inline void setup_nr_node_ids(void) {}
> #endif
>
> +void show_meminfo_extra(void);
> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name);
> +int unregister_meminfo_extra(atomic_long_t *val);
> +
> extern int memcmp_pages(struct page *page1, struct page *page2);
>
> static inline int pages_identical(struct page *page1, struct page *page2)
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3c4eb750a199..db1be9a39783 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5229,6 +5229,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
> struct zone *zone;
> pg_data_t *pgdat;
>
> + show_meminfo_extra();
> for_each_populated_zone(zone) {
> if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
> continue;
> --
> 2.13.7
>
>
> _______________________________________________
> kexec mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/kexec
>
Thanks
Dave
On 2020년 03월 23일 18:53, Greg KH wrote:
> On Mon, Mar 23, 2020 at 05:05:01PM +0900, Jaewon Kim wrote:
>> Provide APIs to drivers so that they can show its memory usage on
>> /proc/meminfo_extra.
>>
>> int register_meminfo_extra(atomic_long_t *val, int shift,
>> const char *name);
>> int unregister_meminfo_extra(atomic_long_t *val);
> Nit, isn't it nicer to have the subsystem name first:
> meminfo_extra_register()
> meminfo_extra_unregister()
> ?
OK. Name can be changed.
>
>
>> Signed-off-by: Jaewon Kim <[email protected]>
>> ---
>> v2: move to /proc/meminfo_extra as a new file, meminfo_extra.c
>> use rcu to reduce lock overhead
>> v1: print info at /proc/meminfo
>> ---
>> fs/proc/Makefile | 1 +
>> fs/proc/meminfo_extra.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++
>> include/linux/mm.h | 4 ++
>> mm/page_alloc.c | 1 +
>> 4 files changed, 129 insertions(+)
>> create mode 100644 fs/proc/meminfo_extra.c
>>
>> diff --git a/fs/proc/Makefile b/fs/proc/Makefile
>> index bd08616ed8ba..83d2f55591c6 100644
>> --- a/fs/proc/Makefile
>> +++ b/fs/proc/Makefile
>> @@ -19,6 +19,7 @@ proc-y += devices.o
>> proc-y += interrupts.o
>> proc-y += loadavg.o
>> proc-y += meminfo.o
>> +proc-y += meminfo_extra.o
>> proc-y += stat.o
>> proc-y += uptime.o
>> proc-y += util.o
>> diff --git a/fs/proc/meminfo_extra.c b/fs/proc/meminfo_extra.c
>> new file mode 100644
>> index 000000000000..bd3f0d2b7fb7
>> --- /dev/null
>> +++ b/fs/proc/meminfo_extra.c
>> @@ -0,0 +1,123 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +#include <linux/mm.h>
>> +#include <linux/proc_fs.h>
>> +#include <linux/seq_file.h>
>> +#include <linux/slab.h>
>> +
>> +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
>> +{
>> + seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
>> + seq_write(m, " kB\n", 4);
>> +}
>> +
>> +static LIST_HEAD(meminfo_head);
>> +static DEFINE_SPINLOCK(meminfo_lock);
>> +
>> +#define NAME_SIZE 15
>> +#define NAME_BUF_SIZE (NAME_SIZE + 2) /* ':' and '\0' */
>> +
>> +struct meminfo_extra {
>> + struct list_head list;
>> + atomic_long_t *val;
>> + int shift_for_page;
>> + char name[NAME_BUF_SIZE];
>> + char name_pad[NAME_BUF_SIZE];
>> +};
>> +
>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
>> +{
>> + struct meminfo_extra *meminfo, *memtemp;
>> + int len;
>> + int error = 0;
>> +
>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
>> + if (!meminfo) {
>> + error = -ENOMEM;
>> + goto out;
>> + }
>> +
>> + meminfo->val = val;
>> + meminfo->shift_for_page = shift;
>> + strncpy(meminfo->name, name, NAME_SIZE);
>> + len = strlen(meminfo->name);
>> + meminfo->name[len] = ':';
>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
>> + while (++len < NAME_BUF_SIZE - 1)
>> + meminfo->name_pad[len] = ' ';
>> +
>> + spin_lock(&meminfo_lock);
>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
>> + if (memtemp->val == val) {
>> + error = -EINVAL;
>> + break;
>> + }
>> + }
>> + if (!error)
>> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
>> + spin_unlock(&meminfo_lock);
> If you have a lock, why are you needing rcu?
I think _rcu should be removed out of list_for_each_entry_rcu.
But I'm confused about what you meant.
I used rcu_read_lock on __meminfo_extra,
and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
>
>
>
>> + if (error)
>> + kfree(meminfo);
>> +out:
>> +
>> + return error;
>> +}
>> +EXPORT_SYMBOL(register_meminfo_extra);
> EXPORT_SYMBOL_GPL()? I have to ask :)
I can use EXPORT_SYMBOL_GPL.
>
> thanks,
>
> greg k-h
>
>
Hello
Thank you for your comment.
By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
I'd like to hear your opinion on this /proc/meminfo_extra node.
Do you think this is meaningful or cannot co-exist with other future sysfs based API.
On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote:
> On 2020년 03월 23일 18:53, Greg KH wrote:
> >> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
> >> +{
> >> + struct meminfo_extra *meminfo, *memtemp;
> >> + int len;
> >> + int error = 0;
> >> +
> >> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
> >> + if (!meminfo) {
> >> + error = -ENOMEM;
> >> + goto out;
> >> + }
> >> +
> >> + meminfo->val = val;
> >> + meminfo->shift_for_page = shift;
> >> + strncpy(meminfo->name, name, NAME_SIZE);
> >> + len = strlen(meminfo->name);
> >> + meminfo->name[len] = ':';
> >> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
> >> + while (++len < NAME_BUF_SIZE - 1)
> >> + meminfo->name_pad[len] = ' ';
> >> +
> >> + spin_lock(&meminfo_lock);
> >> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> >> + if (memtemp->val == val) {
> >> + error = -EINVAL;
> >> + break;
> >> + }
> >> + }
> >> + if (!error)
> >> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
> >> + spin_unlock(&meminfo_lock);
> > If you have a lock, why are you needing rcu?
> I think _rcu should be removed out of list_for_each_entry_rcu.
> But I'm confused about what you meant.
> I used rcu_read_lock on __meminfo_extra,
> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
If that's the case, then that's fine, it just didn't seem like that was
needed. Or I might have been reading your rcu logic incorrectly...
> >> + if (error)
> >> + kfree(meminfo);
> >> +out:
> >> +
> >> + return error;
> >> +}
> >> +EXPORT_SYMBOL(register_meminfo_extra);
> > EXPORT_SYMBOL_GPL()? I have to ask :)
> I can use EXPORT_SYMBOL_GPL.
> >
> > thanks,
> >
> > greg k-h
> >
> >
>
> Hello
> Thank you for your comment.
>
> By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
> I'd like to hear your opinion on this /proc/meminfo_extra node.
I think it is the propagation of an old and obsolete interface that you
will have to support for the next 20+ years and yet not actually be
useful :)
> Do you think this is meaningful or cannot co-exist with other future
> sysfs based API.
What sysfs-based API?
I still don't know _why_ you want this. The ION stuff is not needed as
that code is about to be deleted, so who else wants this? What is the
use-case for it that is so desperately needed that parsing
yet-another-proc file is going to solve the problem?
thanks,
greg k-h
On 2020년 03월 24일 19:11, Greg KH wrote:
> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote:
>> On 2020년 03월 23일 18:53, Greg KH wrote:
>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
>>>> +{
>>>> + struct meminfo_extra *meminfo, *memtemp;
>>>> + int len;
>>>> + int error = 0;
>>>> +
>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
>>>> + if (!meminfo) {
>>>> + error = -ENOMEM;
>>>> + goto out;
>>>> + }
>>>> +
>>>> + meminfo->val = val;
>>>> + meminfo->shift_for_page = shift;
>>>> + strncpy(meminfo->name, name, NAME_SIZE);
>>>> + len = strlen(meminfo->name);
>>>> + meminfo->name[len] = ':';
>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
>>>> + while (++len < NAME_BUF_SIZE - 1)
>>>> + meminfo->name_pad[len] = ' ';
>>>> +
>>>> + spin_lock(&meminfo_lock);
>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
>>>> + if (memtemp->val == val) {
>>>> + error = -EINVAL;
>>>> + break;
>>>> + }
>>>> + }
>>>> + if (!error)
>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
>>>> + spin_unlock(&meminfo_lock);
>>> If you have a lock, why are you needing rcu?
>> I think _rcu should be removed out of list_for_each_entry_rcu.
>> But I'm confused about what you meant.
>> I used rcu_read_lock on __meminfo_extra,
>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
> If that's the case, then that's fine, it just didn't seem like that was
> needed. Or I might have been reading your rcu logic incorrectly...
>
>>>> + if (error)
>>>> + kfree(meminfo);
>>>> +out:
>>>> +
>>>> + return error;
>>>> +}
>>>> +EXPORT_SYMBOL(register_meminfo_extra);
>>> EXPORT_SYMBOL_GPL()? I have to ask :)
>> I can use EXPORT_SYMBOL_GPL.
>>> thanks,
>>>
>>> greg k-h
>>>
>>>
>> Hello
>> Thank you for your comment.
>>
>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
>> I'd like to hear your opinion on this /proc/meminfo_extra node.
> I think it is the propagation of an old and obsolete interface that you
> will have to support for the next 20+ years and yet not actually be
> useful :)
>
>> Do you think this is meaningful or cannot co-exist with other future
>> sysfs based API.
> What sysfs-based API?
Please refer to mail thread on v1 patch set - https://lkml.org/lkml/fancy/2020/3/10/2102
especially discussion with Leon Romanovsky on https://lkml.org/lkml/fancy/2020/3/16/140
>
> I still don't know _why_ you want this. The ION stuff is not needed as
> that code is about to be deleted, so who else wants this? What is the
> use-case for it that is so desperately needed that parsing
> yet-another-proc file is going to solve the problem?
In my Android device, there are graphic driver memory, zsmalloc memory except ION.
I don't know other cases in other platform.
Not desperately needed but I think we need one userspace knob to see overall hidden huge memory.
Additionally I'd like to see all those hidden memory in OutOfMemory log.
This is useful to get clue to find memory hogger.
i.e.) show_mem on oom
<6>[ 420.856428] Mem-Info:
<6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB
<6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0
Thank you
Jaewon Kim
>
> thanks,
>
> greg k-h
>
>
On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote:
>
>
> On 2020년 03월 24일 19:11, Greg KH wrote:
> > On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote:
> >> On 2020년 03월 23일 18:53, Greg KH wrote:
> >>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
> >>>> +{
> >>>> + struct meminfo_extra *meminfo, *memtemp;
> >>>> + int len;
> >>>> + int error = 0;
> >>>> +
> >>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
> >>>> + if (!meminfo) {
> >>>> + error = -ENOMEM;
> >>>> + goto out;
> >>>> + }
> >>>> +
> >>>> + meminfo->val = val;
> >>>> + meminfo->shift_for_page = shift;
> >>>> + strncpy(meminfo->name, name, NAME_SIZE);
> >>>> + len = strlen(meminfo->name);
> >>>> + meminfo->name[len] = ':';
> >>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
> >>>> + while (++len < NAME_BUF_SIZE - 1)
> >>>> + meminfo->name_pad[len] = ' ';
> >>>> +
> >>>> + spin_lock(&meminfo_lock);
> >>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> >>>> + if (memtemp->val == val) {
> >>>> + error = -EINVAL;
> >>>> + break;
> >>>> + }
> >>>> + }
> >>>> + if (!error)
> >>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
> >>>> + spin_unlock(&meminfo_lock);
> >>> If you have a lock, why are you needing rcu?
> >> I think _rcu should be removed out of list_for_each_entry_rcu.
> >> But I'm confused about what you meant.
> >> I used rcu_read_lock on __meminfo_extra,
> >> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
> > If that's the case, then that's fine, it just didn't seem like that was
> > needed. Or I might have been reading your rcu logic incorrectly...
> >
> >>>> + if (error)
> >>>> + kfree(meminfo);
> >>>> +out:
> >>>> +
> >>>> + return error;
> >>>> +}
> >>>> +EXPORT_SYMBOL(register_meminfo_extra);
> >>> EXPORT_SYMBOL_GPL()? I have to ask :)
> >> I can use EXPORT_SYMBOL_GPL.
> >>> thanks,
> >>>
> >>> greg k-h
> >>>
> >>>
> >> Hello
> >> Thank you for your comment.
> >>
> >> By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
> >> I'd like to hear your opinion on this /proc/meminfo_extra node.
> > I think it is the propagation of an old and obsolete interface that you
> > will have to support for the next 20+ years and yet not actually be
> > useful :)
> >
> >> Do you think this is meaningful or cannot co-exist with other future
> >> sysfs based API.
> > What sysfs-based API?
> Please refer to mail thread on v1 patch set - https://lkml.org/lkml/fancy/2020/3/10/2102
> especially discussion with Leon Romanovsky on https://lkml.org/lkml/fancy/2020/3/16/140
I really do not understand what you are referring to here, sorry. I do
not see any sysfs-based code in that thread.
And try to use lore.kernel.org, lkml.org doesn't always work and we have
no control over that :(
> > I still don't know _why_ you want this. The ION stuff is not needed as
> > that code is about to be deleted, so who else wants this? What is the
> > use-case for it that is so desperately needed that parsing
> > yet-another-proc file is going to solve the problem?
> In my Android device, there are graphic driver memory, zsmalloc memory except ION.
Ok, so what does Android have to do with this?
> I don't know other cases in other platform.
> Not desperately needed but I think we need one userspace knob to see overall hidden huge memory.
Why? Who wants that? What would userspace do with that? And what
exactly do you want to show?
Is this just a debugging thing? Then use debugfs for that, not proc.
Isn't that what the DRM developers are starting to do?
> Additionally I'd like to see all those hidden memory in OutOfMemory log.
How is anything hidden, can't you see it in the slab information?
> This is useful to get clue to find memory hogger.
> i.e.) show_mem on oom
> <6>[ 420.856428] Mem-Info:
> <6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB
> <6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0
So what does this show you? That someone is takign a ton of ION memory
for some unknown use? What can you do with that? What would you do
with that?
And memory is almost never assigned to a "driver", it is assigned to a
"device" that uses it. Drivers can handle multiple devices at the same
time, so why would you break this down by drivers? Are you assuming
that a driver only talks to one piece of hardware?
I think you need a much better use case for all of this other than
"wouldn't it be nice to see some numbers", as that isn't going to help
anyone out in the end.
thanks,
greg k-h
On 2020년 03월 24일 20:46, Greg KH wrote:
> On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote:
>>
>> On 2020년 03월 24일 19:11, Greg KH wrote:
>>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote:
>>>> On 2020년 03월 23일 18:53, Greg KH wrote:
>>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
>>>>>> +{
>>>>>> + struct meminfo_extra *meminfo, *memtemp;
>>>>>> + int len;
>>>>>> + int error = 0;
>>>>>> +
>>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
>>>>>> + if (!meminfo) {
>>>>>> + error = -ENOMEM;
>>>>>> + goto out;
>>>>>> + }
>>>>>> +
>>>>>> + meminfo->val = val;
>>>>>> + meminfo->shift_for_page = shift;
>>>>>> + strncpy(meminfo->name, name, NAME_SIZE);
>>>>>> + len = strlen(meminfo->name);
>>>>>> + meminfo->name[len] = ':';
>>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
>>>>>> + while (++len < NAME_BUF_SIZE - 1)
>>>>>> + meminfo->name_pad[len] = ' ';
>>>>>> +
>>>>>> + spin_lock(&meminfo_lock);
>>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
>>>>>> + if (memtemp->val == val) {
>>>>>> + error = -EINVAL;
>>>>>> + break;
>>>>>> + }
>>>>>> + }
>>>>>> + if (!error)
>>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
>>>>>> + spin_unlock(&meminfo_lock);
>>>>> If you have a lock, why are you needing rcu?
>>>> I think _rcu should be removed out of list_for_each_entry_rcu.
>>>> But I'm confused about what you meant.
>>>> I used rcu_read_lock on __meminfo_extra,
>>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
>>> If that's the case, then that's fine, it just didn't seem like that was
>>> needed. Or I might have been reading your rcu logic incorrectly...
>>>
>>>>>> + if (error)
>>>>>> + kfree(meminfo);
>>>>>> +out:
>>>>>> +
>>>>>> + return error;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(register_meminfo_extra);
>>>>> EXPORT_SYMBOL_GPL()? I have to ask :)
>>>> I can use EXPORT_SYMBOL_GPL.
>>>>> thanks,
>>>>>
>>>>> greg k-h
>>>>>
>>>>>
>>>> Hello
>>>> Thank you for your comment.
>>>>
>>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
>>>> I'd like to hear your opinion on this /proc/meminfo_extra node.
>>> I think it is the propagation of an old and obsolete interface that you
>>> will have to support for the next 20+ years and yet not actually be
>>> useful :)
>>>
>>>> Do you think this is meaningful or cannot co-exist with other future
>>>> sysfs based API.
>>> What sysfs-based API?
>> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102
>> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140
> I really do not understand what you are referring to here, sorry. I do
> not see any sysfs-based code in that thread.
Sorry. I also did not see actual code.
Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff?
>
> And try to use lore.kernel.org, lkml.org doesn't always work and we have
> no control over that :(
>
>>> I still don't know _why_ you want this. The ION stuff is not needed as
>>> that code is about to be deleted, so who else wants this? What is the
>>> use-case for it that is so desperately needed that parsing
>>> yet-another-proc file is going to solve the problem?
>> In my Android device, there are graphic driver memory, zsmalloc memory except ION.
> Ok, so what does Android have to do with this?
Some driver in Android platform may use my API to show its memory usage.
>
>> I don't know other cases in other platform.
>> Not desperately needed but I think we need one userspace knob to see overall hidden huge memory.
> Why? Who wants that? What would userspace do with that? And what
> exactly do you want to show?
>
> Is this just a debugging thing? Then use debugfs for that, not proc.
> Isn't that what the DRM developers are starting to do?
>
>> Additionally I'd like to see all those hidden memory in OutOfMemory log.
> How is anything hidden, can't you see it in the slab information?
>
Let me explain more.
0. slab
As I said in cover page, this is not for memory allocated by slab.
I'd like to know where huge memory has gone.
Those are directly allocated by alloc_pages instead of slab.
/proc/slabinfo does not show this information.
1. /proc/meminfo_extra
/proc/meminfo_extra could be debugging thing to see memory status at a certain time.
But it, I think, is also basic information rather than just for debugging.
It is similar with /proc/meminfo which is in procfs instead of debugfs.
2. oom log
oom log in show_mem is more than just debugging.
As existing oom log shows much memory information, I think we need the hidden memory info.
Without these information, we do NOT know oom reason because other traditional stats are not enough.
>> This is useful to get clue to find memory hogger.
>> i.e.) show_mem on oom
>> <6>[ 420.856428] Mem-Info:
>> <6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB
>> <6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0
> So what does this show you? That someone is takign a ton of ION memory
> for some unknown use? What can you do with that? What would you do
> with that?
We may not know exact memory owner. But we can narrow down.
Anyway I think this is meaningful instead of no clue.
>
> And memory is almost never assigned to a "driver", it is assigned to a
> "device" that uses it. Drivers can handle multiple devices at the same
> time, so why would you break this down by drivers? Are you assuming
> that a driver only talks to one piece of hardware?
Yes a driver may support several devices. I don't know if it same on an embedded device.
Anyway I think the idea works even for several devices, although the driver should
distinguish memory usage for each device and should register each memory stat.
>
> I think you need a much better use case for all of this other than
> "wouldn't it be nice to see some numbers", as that isn't going to help
> anyone out in the end.
Sorry. As of now, I do not know other better use case, but I still think
memory information should cover most of memory usage.
Huge memory consumed by driver or other core logic should be shown in OoM.
>
> thanks,
>
> greg k-h
>
>
On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote:
>
>
> On 2020년 03월 24일 20:46, Greg KH wrote:
> > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote:
> >>
> >> On 2020년 03월 24일 19:11, Greg KH wrote:
> >>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote:
> >>>> On 2020년 03월 23일 18:53, Greg KH wrote:
> >>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
> >>>>>> +{
> >>>>>> + struct meminfo_extra *meminfo, *memtemp;
> >>>>>> + int len;
> >>>>>> + int error = 0;
> >>>>>> +
> >>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
> >>>>>> + if (!meminfo) {
> >>>>>> + error = -ENOMEM;
> >>>>>> + goto out;
> >>>>>> + }
> >>>>>> +
> >>>>>> + meminfo->val = val;
> >>>>>> + meminfo->shift_for_page = shift;
> >>>>>> + strncpy(meminfo->name, name, NAME_SIZE);
> >>>>>> + len = strlen(meminfo->name);
> >>>>>> + meminfo->name[len] = ':';
> >>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
> >>>>>> + while (++len < NAME_BUF_SIZE - 1)
> >>>>>> + meminfo->name_pad[len] = ' ';
> >>>>>> +
> >>>>>> + spin_lock(&meminfo_lock);
> >>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> >>>>>> + if (memtemp->val == val) {
> >>>>>> + error = -EINVAL;
> >>>>>> + break;
> >>>>>> + }
> >>>>>> + }
> >>>>>> + if (!error)
> >>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
> >>>>>> + spin_unlock(&meminfo_lock);
> >>>>> If you have a lock, why are you needing rcu?
> >>>> I think _rcu should be removed out of list_for_each_entry_rcu.
> >>>> But I'm confused about what you meant.
> >>>> I used rcu_read_lock on __meminfo_extra,
> >>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
> >>> If that's the case, then that's fine, it just didn't seem like that was
> >>> needed. Or I might have been reading your rcu logic incorrectly...
> >>>
> >>>>>> + if (error)
> >>>>>> + kfree(meminfo);
> >>>>>> +out:
> >>>>>> +
> >>>>>> + return error;
> >>>>>> +}
> >>>>>> +EXPORT_SYMBOL(register_meminfo_extra);
> >>>>> EXPORT_SYMBOL_GPL()? I have to ask :)
> >>>> I can use EXPORT_SYMBOL_GPL.
> >>>>> thanks,
> >>>>>
> >>>>> greg k-h
> >>>>>
> >>>>>
> >>>> Hello
> >>>> Thank you for your comment.
> >>>>
> >>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
> >>>> I'd like to hear your opinion on this /proc/meminfo_extra node.
> >>> I think it is the propagation of an old and obsolete interface that you
> >>> will have to support for the next 20+ years and yet not actually be
> >>> useful :)
> >>>
> >>>> Do you think this is meaningful or cannot co-exist with other future
> >>>> sysfs based API.
> >>> What sysfs-based API?
> >> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102
> >> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140
> > I really do not understand what you are referring to here, sorry. I do
> > not see any sysfs-based code in that thread.
> Sorry. I also did not see actual code.
> Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff?
> >
> > And try to use lore.kernel.org, lkml.org doesn't always work and we have
> > no control over that :(
> >
> >>> I still don't know _why_ you want this. The ION stuff is not needed as
> >>> that code is about to be deleted, so who else wants this? What is the
> >>> use-case for it that is so desperately needed that parsing
> >>> yet-another-proc file is going to solve the problem?
> >> In my Android device, there are graphic driver memory, zsmalloc memory except ION.
> > Ok, so what does Android have to do with this?
> Some driver in Android platform may use my API to show its memory usage.
I do not understand what this means.
> >> I don't know other cases in other platform.
> >> Not desperately needed but I think we need one userspace knob to see overall hidden huge memory.
> > Why? Who wants that? What would userspace do with that? And what
> > exactly do you want to show?
> >
> > Is this just a debugging thing? Then use debugfs for that, not proc.
> > Isn't that what the DRM developers are starting to do?
> >
> >> Additionally I'd like to see all those hidden memory in OutOfMemory log.
> > How is anything hidden, can't you see it in the slab information?
> >
> Let me explain more.
>
> 0. slab
> As I said in cover page, this is not for memory allocated by slab.
Great, then have the subsystem that allocates such memory, be the thing
that exports the information. Drivers "on their own" do not grab any
memory without asking for it from other parts of the kernel.
Modify those "other parts", this isn't a driver-specific thing at all.
So, what "other parts" are involved here?
> I'd like to know where huge memory has gone.
> Those are directly allocated by alloc_pages instead of slab.
> /proc/slabinfo does not show this information.
Why isn't alloc_pages information exported anywhere? Work on that.
> 1. /proc/meminfo_extra
> /proc/meminfo_extra could be debugging thing to see memory status at a certain time.
If it is debugging, then use debugfs.
> But it, I think, is also basic information rather than just for debugging.
Who would use that information for anything except debugging?
> It is similar with /proc/meminfo which is in procfs instead of debugfs.
meminfo is older than debugfs and sysfs, can't change that today.
> 2. oom log
> oom log in show_mem is more than just debugging.
Why? Who sees this?
> As existing oom log shows much memory information, I think we need the hidden memory info.
> Without these information, we do NOT know oom reason because other traditional stats are not enough.
Why not? Kernel users of memory shouldn't be triggering OOM events.
> >> This is useful to get clue to find memory hogger.
> >> i.e.) show_mem on oom
> >> <6>[ 420.856428] Mem-Info:
> >> <6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB
> >> <6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0
> > So what does this show you? That someone is takign a ton of ION memory
> > for some unknown use? What can you do with that? What would you do
> > with that?
> We may not know exact memory owner. But we can narrow down.
> Anyway I think this is meaningful instead of no clue.
Again, work on the subsystems that actually allocate the memory, not
drivers. And if you want to mess with drivers, do it in a
device-specific way, not a driver-specific way.
> > And memory is almost never assigned to a "driver", it is assigned to a
> > "device" that uses it. Drivers can handle multiple devices at the same
> > time, so why would you break this down by drivers? Are you assuming
> > that a driver only talks to one piece of hardware?
> Yes a driver may support several devices. I don't know if it same on an embedded device.
Why wouldn't it be? Is this new interface somehow only acceptable for
systems with one-device-per-driver? If so, that's not going to work at
all.
> Anyway I think the idea works even for several devices, although the driver should
> distinguish memory usage for each device and should register each memory stat.
And how would that happen?
thanks,
greg k-h
On Mon, Mar 23, 2020 at 05:05:00PM +0900, Jaewon Kim wrote:
> /proc/meminfo or show_free_areas does not show full system wide memory
> usage status because memory stats do not track all memory allocations.
> There seems to be huge hidden memory especially on embedded system. It
> is because some HW IPs in the system use common DRAM memory instead of
> internal memory. Device drivers directly request huge pages from the
> page allocator with alloc_pages.
>
> In Android system, most of those hidden memory seems to be vmalloc
> pages, ion system heap memory, graphics memory, and memory for DRAM
> based compressed swap storage. They may be shown in other node but it
> seems to be useful if /proc/meminfo_extra shows all those extra memory
> information. And show_mem also need to print the info in oom situation.
>
> Fortunately vmalloc pages is already shown by commit 97105f0ab7b8
> ("mm: vmalloc: show number of vmalloc pages in /proc/meminfo"). Swap
> memory using zsmalloc can be seen through vmstat by commit 91537fee0013
> ("mm: add NR_ZSMALLOC to vmstat") but not on /proc/meminfo.
>
> Memory usage of specific driver can be various so that showing the usage
> through upstream meminfo.c is not easy. To print the extra memory usage
> of a driver, introduce following APIs. Each driver needs to count as
> atomic_long_t.
>
> int register_meminfo_extra(atomic_long_t *val, int shift,
> const char *name);
> int unregister_meminfo_extra(atomic_long_t *val);
>
> Currently register ION system heap allocator and zsmalloc pages.
> Additionally tested on local graphics driver.
>
> i.e) cat /proc/meminfo_extra | tail -3
> IonSystemHeap: 242620 kB
> ZsPages: 203860 kB
> GraphicDriver: 196576 kB
In that case definitely delete ':', spaces and KB.
They only slowdown generation and parsing in userspace.
Values should be printed /proc/vmstat does it, maybe with tab instead of
space.
foo 1234
bar 0
zot 111
On Tue, Mar 24, 2020 at 12:46:45PM +0100, Greg KH wrote:
> On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote:
> > I don't know other cases in other platform.
> > Not desperately needed but I think we need one userspace knob to see overall hidden huge memory.
>
> Why? Who wants that? What would userspace do with that? And what
> exactly do you want to show?
>
> Is this just a debugging thing? Then use debugfs for that, not proc.
Yes, please use debugfs. Android can ban it in production all at once.
> Isn't that what the DRM developers are starting to do?
>
> > Additionally I'd like to see all those hidden memory in OutOfMemory log.
>
> How is anything hidden, can't you see it in the slab information?
There real usecase for something like that is vmware baloon driver.
Two VM instances oversteal pages one from another resulting in guest OOM
which looks like one giant get_free_page() memory leak from inside VM.
I don't know how often this type of issue happens nowadays but countless
OOM support tickets were filed and closed over this nonsense.
On 2020년 03월 24일 22:19, Greg KH wrote:
> On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote:
>>
>> On 2020년 03월 24일 20:46, Greg KH wrote:
>>> On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote:
>>>> On 2020년 03월 24일 19:11, Greg KH wrote:
>>>>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote:
>>>>>> On 2020년 03월 23일 18:53, Greg KH wrote:
>>>>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
>>>>>>>> +{
>>>>>>>> + struct meminfo_extra *meminfo, *memtemp;
>>>>>>>> + int len;
>>>>>>>> + int error = 0;
>>>>>>>> +
>>>>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
>>>>>>>> + if (!meminfo) {
>>>>>>>> + error = -ENOMEM;
>>>>>>>> + goto out;
>>>>>>>> + }
>>>>>>>> +
>>>>>>>> + meminfo->val = val;
>>>>>>>> + meminfo->shift_for_page = shift;
>>>>>>>> + strncpy(meminfo->name, name, NAME_SIZE);
>>>>>>>> + len = strlen(meminfo->name);
>>>>>>>> + meminfo->name[len] = ':';
>>>>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
>>>>>>>> + while (++len < NAME_BUF_SIZE - 1)
>>>>>>>> + meminfo->name_pad[len] = ' ';
>>>>>>>> +
>>>>>>>> + spin_lock(&meminfo_lock);
>>>>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
>>>>>>>> + if (memtemp->val == val) {
>>>>>>>> + error = -EINVAL;
>>>>>>>> + break;
>>>>>>>> + }
>>>>>>>> + }
>>>>>>>> + if (!error)
>>>>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
>>>>>>>> + spin_unlock(&meminfo_lock);
>>>>>>> If you have a lock, why are you needing rcu?
>>>>>> I think _rcu should be removed out of list_for_each_entry_rcu.
>>>>>> But I'm confused about what you meant.
>>>>>> I used rcu_read_lock on __meminfo_extra,
>>>>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
>>>>> If that's the case, then that's fine, it just didn't seem like that was
>>>>> needed. Or I might have been reading your rcu logic incorrectly...
>>>>>
>>>>>>>> + if (error)
>>>>>>>> + kfree(meminfo);
>>>>>>>> +out:
>>>>>>>> +
>>>>>>>> + return error;
>>>>>>>> +}
>>>>>>>> +EXPORT_SYMBOL(register_meminfo_extra);
>>>>>>> EXPORT_SYMBOL_GPL()? I have to ask :)
>>>>>> I can use EXPORT_SYMBOL_GPL.
>>>>>>> thanks,
>>>>>>>
>>>>>>> greg k-h
>>>>>>>
>>>>>>>
>>>>>> Hello
>>>>>> Thank you for your comment.
>>>>>>
>>>>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
>>>>>> I'd like to hear your opinion on this /proc/meminfo_extra node.
>>>>> I think it is the propagation of an old and obsolete interface that you
>>>>> will have to support for the next 20+ years and yet not actually be
>>>>> useful :)
>>>>>
>>>>>> Do you think this is meaningful or cannot co-exist with other future
>>>>>> sysfs based API.
>>>>> What sysfs-based API?
>>>> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102
>>>> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140
>>> I really do not understand what you are referring to here, sorry. I do
>>> not see any sysfs-based code in that thread.
>> Sorry. I also did not see actual code.
>> Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff?
>>> And try to use lore.kernel.org, lkml.org doesn't always work and we have
>>> no control over that :(
>>>
>>>>> I still don't know _why_ you want this. The ION stuff is not needed as
>>>>> that code is about to be deleted, so who else wants this? What is the
>>>>> use-case for it that is so desperately needed that parsing
>>>>> yet-another-proc file is going to solve the problem?
>>>> In my Android device, there are graphic driver memory, zsmalloc memory except ION.
>>> Ok, so what does Android have to do with this?
>> Some driver in Android platform may use my API to show its memory usage.
> I do not understand what this means.
>
>>>> I don't know other cases in other platform.
>>>> Not desperately needed but I think we need one userspace knob to see overall hidden huge memory.
>>> Why? Who wants that? What would userspace do with that? And what
>>> exactly do you want to show?
>>>
>>> Is this just a debugging thing? Then use debugfs for that, not proc.
>>> Isn't that what the DRM developers are starting to do?
>>>
>>>> Additionally I'd like to see all those hidden memory in OutOfMemory log.
>>> How is anything hidden, can't you see it in the slab information?
>>>
>> Let me explain more.
>>
>> 0. slab
>> As I said in cover page, this is not for memory allocated by slab.
> Great, then have the subsystem that allocates such memory, be the thing
> that exports the information. Drivers "on their own" do not grab any
> memory without asking for it from other parts of the kernel.
>
> Modify those "other parts", this isn't a driver-specific thing at all.
>
> So, what "other parts" are involved here?
>
>> I'd like to know where huge memory has gone.
>> Those are directly allocated by alloc_pages instead of slab.
>> /proc/slabinfo does not show this information.
> Why isn't alloc_pages information exported anywhere? Work on that.
>
>> 1. /proc/meminfo_extra
>> /proc/meminfo_extra could be debugging thing to see memory status at a certain time.
> If it is debugging, then use debugfs.
>
>> But it, I think, is also basic information rather than just for debugging.
> Who would use that information for anything except debugging?
>
>> It is similar with /proc/meminfo which is in procfs instead of debugfs.
> meminfo is older than debugfs and sysfs, can't change that today.
>
>> 2. oom log
>> oom log in show_mem is more than just debugging.
> Why? Who sees this?
>
>> As existing oom log shows much memory information, I think we need the hidden memory info.
>> Without these information, we do NOT know oom reason because other traditional stats are not enough.
> Why not? Kernel users of memory shouldn't be triggering OOM events.
>
>
>>>> This is useful to get clue to find memory hogger.
>>>> i.e.) show_mem on oom
>>>> <6>[ 420.856428] Mem-Info:
>>>> <6>[ 420.856433] IonSystemHeap:32813kB ZsPages:44114kB GraphicDriver::13091kB
>>>> <6>[ 420.856450] active_anon:957205 inactive_anon:159383 isolated_anon:0
>>> So what does this show you? That someone is takign a ton of ION memory
>>> for some unknown use? What can you do with that? What would you do
>>> with that?
>> We may not know exact memory owner. But we can narrow down.
>> Anyway I think this is meaningful instead of no clue.
> Again, work on the subsystems that actually allocate the memory, not
> drivers. And if you want to mess with drivers, do it in a
> device-specific way, not a driver-specific way.
>
>>> And memory is almost never assigned to a "driver", it is assigned to a
>>> "device" that uses it. Drivers can handle multiple devices at the same
>>> time, so why would you break this down by drivers? Are you assuming
>>> that a driver only talks to one piece of hardware?
>> Yes a driver may support several devices. I don't know if it same on an embedded device.
> Why wouldn't it be? Is this new interface somehow only acceptable for
> systems with one-device-per-driver? If so, that's not going to work at
> all.
>
>> Anyway I think the idea works even for several devices, although the driver should
>> distinguish memory usage for each device and should register each memory stat.
> And how would that happen?
>
> thanks,
>
> greg k-h
>
>
Thank you for you guys' comment
Let me consider more
On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote:
>
>
> On 2020년 03월 24일 20:46, Greg KH wrote:
> > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote:
> >>
> >> On 2020년 03월 24일 19:11, Greg KH wrote:
> >>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote:
> >>>> On 2020년 03월 23일 18:53, Greg KH wrote:
> >>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
> >>>>>> +{
> >>>>>> + struct meminfo_extra *meminfo, *memtemp;
> >>>>>> + int len;
> >>>>>> + int error = 0;
> >>>>>> +
> >>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
> >>>>>> + if (!meminfo) {
> >>>>>> + error = -ENOMEM;
> >>>>>> + goto out;
> >>>>>> + }
> >>>>>> +
> >>>>>> + meminfo->val = val;
> >>>>>> + meminfo->shift_for_page = shift;
> >>>>>> + strncpy(meminfo->name, name, NAME_SIZE);
> >>>>>> + len = strlen(meminfo->name);
> >>>>>> + meminfo->name[len] = ':';
> >>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
> >>>>>> + while (++len < NAME_BUF_SIZE - 1)
> >>>>>> + meminfo->name_pad[len] = ' ';
> >>>>>> +
> >>>>>> + spin_lock(&meminfo_lock);
> >>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> >>>>>> + if (memtemp->val == val) {
> >>>>>> + error = -EINVAL;
> >>>>>> + break;
> >>>>>> + }
> >>>>>> + }
> >>>>>> + if (!error)
> >>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
> >>>>>> + spin_unlock(&meminfo_lock);
> >>>>> If you have a lock, why are you needing rcu?
> >>>> I think _rcu should be removed out of list_for_each_entry_rcu.
> >>>> But I'm confused about what you meant.
> >>>> I used rcu_read_lock on __meminfo_extra,
> >>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
> >>> If that's the case, then that's fine, it just didn't seem like that was
> >>> needed. Or I might have been reading your rcu logic incorrectly...
> >>>
> >>>>>> + if (error)
> >>>>>> + kfree(meminfo);
> >>>>>> +out:
> >>>>>> +
> >>>>>> + return error;
> >>>>>> +}
> >>>>>> +EXPORT_SYMBOL(register_meminfo_extra);
> >>>>> EXPORT_SYMBOL_GPL()? I have to ask :)
> >>>> I can use EXPORT_SYMBOL_GPL.
> >>>>> thanks,
> >>>>>
> >>>>> greg k-h
> >>>>>
> >>>>>
> >>>> Hello
> >>>> Thank you for your comment.
> >>>>
> >>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
> >>>> I'd like to hear your opinion on this /proc/meminfo_extra node.
> >>> I think it is the propagation of an old and obsolete interface that you
> >>> will have to support for the next 20+ years and yet not actually be
> >>> useful :)
> >>>
> >>>> Do you think this is meaningful or cannot co-exist with other future
> >>>> sysfs based API.
> >>> What sysfs-based API?
> >> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102
> >> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140
> > I really do not understand what you are referring to here, sorry. I do
> > not see any sysfs-based code in that thread.
> Sorry. I also did not see actual code.
> Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff?
Sorry for being late, I wasn't in "TO:", so missed the whole discussion.
Greg,
We need the exposed information for the memory optimizations (debug, not
production) of our high speed NICs. Our devices (mlx5) allocates a lot of
memory, so optimization there can help us to scale in SRIOV mode easier and
be less constraint by the memory.
I want to emphasize that I don't like idea of extending /proc/* interface
because it is going to be painful to grep on large machines with many
devices. And I don't like the idea that every driver will need to register
into this interface, because it will be abused almost immediately.
My proposal was to create new sysfs file by driver/core and put all
information automatically there, for example, it can be
/sys/devices/pci0000:00/0000:00:0c.0/meminfo
^^^^^^^
Thanks
On Sun, Mar 29, 2020 at 10:19:07AM +0300, Leon Romanovsky wrote:
> On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote:
> >
> >
> > On 2020년 03월 24일 20:46, Greg KH wrote:
> > > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote:
> > >>
> > >> On 2020년 03월 24일 19:11, Greg KH wrote:
> > >>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote:
> > >>>> On 2020년 03월 23일 18:53, Greg KH wrote:
> > >>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
> > >>>>>> +{
> > >>>>>> + struct meminfo_extra *meminfo, *memtemp;
> > >>>>>> + int len;
> > >>>>>> + int error = 0;
> > >>>>>> +
> > >>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
> > >>>>>> + if (!meminfo) {
> > >>>>>> + error = -ENOMEM;
> > >>>>>> + goto out;
> > >>>>>> + }
> > >>>>>> +
> > >>>>>> + meminfo->val = val;
> > >>>>>> + meminfo->shift_for_page = shift;
> > >>>>>> + strncpy(meminfo->name, name, NAME_SIZE);
> > >>>>>> + len = strlen(meminfo->name);
> > >>>>>> + meminfo->name[len] = ':';
> > >>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
> > >>>>>> + while (++len < NAME_BUF_SIZE - 1)
> > >>>>>> + meminfo->name_pad[len] = ' ';
> > >>>>>> +
> > >>>>>> + spin_lock(&meminfo_lock);
> > >>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> > >>>>>> + if (memtemp->val == val) {
> > >>>>>> + error = -EINVAL;
> > >>>>>> + break;
> > >>>>>> + }
> > >>>>>> + }
> > >>>>>> + if (!error)
> > >>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
> > >>>>>> + spin_unlock(&meminfo_lock);
> > >>>>> If you have a lock, why are you needing rcu?
> > >>>> I think _rcu should be removed out of list_for_each_entry_rcu.
> > >>>> But I'm confused about what you meant.
> > >>>> I used rcu_read_lock on __meminfo_extra,
> > >>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
> > >>> If that's the case, then that's fine, it just didn't seem like that was
> > >>> needed. Or I might have been reading your rcu logic incorrectly...
> > >>>
> > >>>>>> + if (error)
> > >>>>>> + kfree(meminfo);
> > >>>>>> +out:
> > >>>>>> +
> > >>>>>> + return error;
> > >>>>>> +}
> > >>>>>> +EXPORT_SYMBOL(register_meminfo_extra);
> > >>>>> EXPORT_SYMBOL_GPL()? I have to ask :)
> > >>>> I can use EXPORT_SYMBOL_GPL.
> > >>>>> thanks,
> > >>>>>
> > >>>>> greg k-h
> > >>>>>
> > >>>>>
> > >>>> Hello
> > >>>> Thank you for your comment.
> > >>>>
> > >>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
> > >>>> I'd like to hear your opinion on this /proc/meminfo_extra node.
> > >>> I think it is the propagation of an old and obsolete interface that you
> > >>> will have to support for the next 20+ years and yet not actually be
> > >>> useful :)
> > >>>
> > >>>> Do you think this is meaningful or cannot co-exist with other future
> > >>>> sysfs based API.
> > >>> What sysfs-based API?
> > >> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102
> > >> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140
> > > I really do not understand what you are referring to here, sorry. I do
> > > not see any sysfs-based code in that thread.
> > Sorry. I also did not see actual code.
> > Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff?
>
> Sorry for being late, I wasn't in "TO:", so missed the whole discussion.
>
> Greg,
>
> We need the exposed information for the memory optimizations (debug, not
> production) of our high speed NICs. Our devices (mlx5) allocates a lot of
> memory, so optimization there can help us to scale in SRIOV mode easier and
> be less constraint by the memory.
Great, then use debugfs and expose what ever you want in what ever way
you want, no restrictions there, you do not need any type of kernel-wide
/proc file for that today.
> I want to emphasize that I don't like idea of extending /proc/* interface
> because it is going to be painful to grep on large machines with many
> devices. And I don't like the idea that every driver will need to register
> into this interface, because it will be abused almost immediately.
I agree.
> My proposal was to create new sysfs file by driver/core and put all
> information automatically there, for example, it can be
> /sys/devices/pci0000:00/0000:00:0c.0/meminfo
> ^^^^^^^
Nope, again, use debugfs, as sysfs is only one-value-per-file.
thanks,
greg k-h
On Sun, Mar 29, 2020 at 09:23:04AM +0200, Greg KH wrote:
> On Sun, Mar 29, 2020 at 10:19:07AM +0300, Leon Romanovsky wrote:
> > On Tue, Mar 24, 2020 at 09:53:16PM +0900, Jaewon Kim wrote:
> > >
> > >
> > > On 2020년 03월 24일 20:46, Greg KH wrote:
> > > > On Tue, Mar 24, 2020 at 08:37:38PM +0900, Jaewon Kim wrote:
> > > >>
> > > >> On 2020년 03월 24일 19:11, Greg KH wrote:
> > > >>> On Tue, Mar 24, 2020 at 06:11:17PM +0900, Jaewon Kim wrote:
> > > >>>> On 2020년 03월 23일 18:53, Greg KH wrote:
> > > >>>>>> +int register_meminfo_extra(atomic_long_t *val, int shift, const char *name)
> > > >>>>>> +{
> > > >>>>>> + struct meminfo_extra *meminfo, *memtemp;
> > > >>>>>> + int len;
> > > >>>>>> + int error = 0;
> > > >>>>>> +
> > > >>>>>> + meminfo = kzalloc(sizeof(*meminfo), GFP_KERNEL);
> > > >>>>>> + if (!meminfo) {
> > > >>>>>> + error = -ENOMEM;
> > > >>>>>> + goto out;
> > > >>>>>> + }
> > > >>>>>> +
> > > >>>>>> + meminfo->val = val;
> > > >>>>>> + meminfo->shift_for_page = shift;
> > > >>>>>> + strncpy(meminfo->name, name, NAME_SIZE);
> > > >>>>>> + len = strlen(meminfo->name);
> > > >>>>>> + meminfo->name[len] = ':';
> > > >>>>>> + strncpy(meminfo->name_pad, meminfo->name, NAME_BUF_SIZE);
> > > >>>>>> + while (++len < NAME_BUF_SIZE - 1)
> > > >>>>>> + meminfo->name_pad[len] = ' ';
> > > >>>>>> +
> > > >>>>>> + spin_lock(&meminfo_lock);
> > > >>>>>> + list_for_each_entry_rcu(memtemp, &meminfo_head, list) {
> > > >>>>>> + if (memtemp->val == val) {
> > > >>>>>> + error = -EINVAL;
> > > >>>>>> + break;
> > > >>>>>> + }
> > > >>>>>> + }
> > > >>>>>> + if (!error)
> > > >>>>>> + list_add_tail_rcu(&meminfo->list, &meminfo_head);
> > > >>>>>> + spin_unlock(&meminfo_lock);
> > > >>>>> If you have a lock, why are you needing rcu?
> > > >>>> I think _rcu should be removed out of list_for_each_entry_rcu.
> > > >>>> But I'm confused about what you meant.
> > > >>>> I used rcu_read_lock on __meminfo_extra,
> > > >>>> and I think spin_lock is also needed for addition and deletion to handle multiple modifiers.
> > > >>> If that's the case, then that's fine, it just didn't seem like that was
> > > >>> needed. Or I might have been reading your rcu logic incorrectly...
> > > >>>
> > > >>>>>> + if (error)
> > > >>>>>> + kfree(meminfo);
> > > >>>>>> +out:
> > > >>>>>> +
> > > >>>>>> + return error;
> > > >>>>>> +}
> > > >>>>>> +EXPORT_SYMBOL(register_meminfo_extra);
> > > >>>>> EXPORT_SYMBOL_GPL()? I have to ask :)
> > > >>>> I can use EXPORT_SYMBOL_GPL.
> > > >>>>> thanks,
> > > >>>>>
> > > >>>>> greg k-h
> > > >>>>>
> > > >>>>>
> > > >>>> Hello
> > > >>>> Thank you for your comment.
> > > >>>>
> > > >>>> By the way there was not resolved discussion on v1 patch as I mentioned on cover page.
> > > >>>> I'd like to hear your opinion on this /proc/meminfo_extra node.
> > > >>> I think it is the propagation of an old and obsolete interface that you
> > > >>> will have to support for the next 20+ years and yet not actually be
> > > >>> useful :)
> > > >>>
> > > >>>> Do you think this is meaningful or cannot co-exist with other future
> > > >>>> sysfs based API.
> > > >>> What sysfs-based API?
> > > >> Please refer to mail thread on v1 patch set - https://protect2.fireeye.com/url?k=16e3accc-4b2f6548-16e22783-0cc47aa8f5ba-935fe828ac2f6656&u=https://lkml.org/lkml/fancy/2020/3/10/2102
> > > >> especially discussion with Leon Romanovsky on https://protect2.fireeye.com/url?k=74208ed9-29ec475d-74210596-0cc47aa8f5ba-0bd4ef48931fec95&u=https://lkml.org/lkml/fancy/2020/3/16/140
> > > > I really do not understand what you are referring to here, sorry. I do
> > > > not see any sysfs-based code in that thread.
> > > Sorry. I also did not see actual code.
> > > Hello Leon Romanovsky, could you elaborate your plan regarding sysfs stuff?
> >
> > Sorry for being late, I wasn't in "TO:", so missed the whole discussion.
> >
> > Greg,
> >
> > We need the exposed information for the memory optimizations (debug, not
> > production) of our high speed NICs. Our devices (mlx5) allocates a lot of
> > memory, so optimization there can help us to scale in SRIOV mode easier and
> > be less constraint by the memory.
>
> Great, then use debugfs and expose what ever you want in what ever way
> you want, no restrictions there, you do not need any type of kernel-wide
> /proc file for that today.
No argue here, just gave you an example why Jaewon's idea is worth to explore.
>
> > I want to emphasize that I don't like idea of extending /proc/* interface
> > because it is going to be painful to grep on large machines with many
> > devices. And I don't like the idea that every driver will need to register
> > into this interface, because it will be abused almost immediately.
>
> I agree.
>
> > My proposal was to create new sysfs file by driver/core and put all
> > information automatically there, for example, it can be
> > /sys/devices/pci0000:00/0000:00:0c.0/meminfo
> > ^^^^^^^
>
> Nope, again, use debugfs, as sysfs is only one-value-per-file.
Everything that is not /proc and one global file for whole kernel
is fine by me. Debugfs is more than enough for us.
Thanks
>
> thanks,
>
> greg k-h