2008-01-15 00:52:54

by KOSAKI Motohiro

[permalink] [raw]
Subject: [RFC][PATCH 0/5] mem notifications v4

Hi!

/dev/mem_notify is a low-memory notification device.
It can avoid swapping and OOM by cooperating with user processes.

You need not be annoyed by OOM any longer :)
Any comments are welcome!


related discussion:
--------------------------------------------------------------
LKML OOM notifications requirement discussion
http://www.gossamer-threads.com/lists/linux/kernel/832802?nohighlight=1#832802
OOM notifications patch [Marcelo Tosatti]
http://marc.info/?l=linux-kernel&m=119273914027743&w=2
mem notifications v3 [Marcelo Tosatti]
http://marc.info/?l=linux-mm&m=119852828327044&w=2
Thrashing notification patch [Daniel Spang]
http://marc.info/?l=linux-mm&m=119427416315676&w=2


Changelog
-------------------------------------------------
v3 -> v4 (by KOSAKI Motohiro)
o rebase to 2.6.24-rc6-mm1
o avoid wake up all.
o add judgement point to __free_one_page().
o add zone awareness.

v2 -> v3 (by Marcelo Tosatti)
o changes the notification point to happen whenever
the VM moves an anonymous page to the inactive list.
o implement notification rate limit.

v1(oom notify) -> v2 (by Marcelo Tosatti)
o name change
o notify timing change from just swap thrashing to
just before thrashing.
o also works with swapless device.




2008-01-15 01:00:20

by KOSAKI Motohiro

[permalink] [raw]
Subject: [RFC][PATCH 1/5] introduce poll_wait_exclusive() new API

There are two ways of adding an item to a wait queue:
1. add_wait_queue()
2. add_wait_queue_exclusive()
and add_wait_queue_exclusive() is a very useful API.

Unfortunately, poll_wait() has no exclusive counterpart (poll_wait_exclusive()).
This means there is no way to wake up only one of the processes that are polling:
wake_up() wakes up every process sleeping via poll_wait(), not just one.

This patch introduces a new API, poll_wait_exclusive(), which allows waking up only one process.

<example of usage>
unsigned int kosaki_poll(struct file *file,
struct poll_table_struct *wait)
{
poll_wait_exclusive(file, &kosaki_wait_queue, wait);
if (data_exist)
return POLLIN | POLLRDNORM;
return 0;
}


Signed-off-by: Marcelo Tosatti <[email protected]>
Signed-off-by: KOSAKI Motohiro <[email protected]>

---
fs/eventpoll.c | 7 +++++--
fs/select.c | 9 ++++++---
include/linux/poll.h | 11 +++++++++--
3 files changed, 20 insertions(+), 7 deletions(-)



Index: linux-2.6.24-rc6-memnotify/fs/eventpoll.c
===================================================================
--- linux-2.6.24-rc6-memnotify.orig/fs/eventpoll.c 2007-12-30 02:08:58.000000000 +0900
+++ linux-2.6.24-rc6-memnotify/fs/eventpoll.c 2007-12-30 07:10:46.000000000 +0900
@@ -676,7 +676,7 @@ out_unlock:
* target file wakeup lists.
*/
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
- poll_table *pt)
+ poll_table *pt, int exclusive)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
@@ -685,7 +685,10 @@ static void ep_ptable_queue_proc(struct
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
- add_wait_queue(whead, &pwq->wait);
+ if (exclusive)
+ add_wait_queue_exclusive(whead, &pwq->wait);
+ else
+ add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
Index: linux-2.6.24-rc6-memnotify/fs/select.c
===================================================================
--- linux-2.6.24-rc6-memnotify.orig/fs/select.c 2007-12-30 02:09:00.000000000 +0900
+++ linux-2.6.24-rc6-memnotify/fs/select.c 2007-12-30 02:34:05.000000000 +0900
@@ -48,7 +48,7 @@ struct poll_table_page {
* poll table.
*/
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p);
+ poll_table *p, int exclusive);

void poll_initwait(struct poll_wqueues *pwq)
{
@@ -117,7 +117,7 @@ static struct poll_table_entry *poll_get

/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p)
+ poll_table *p, int exclusive)
{
struct poll_table_entry *entry = poll_get_entry(p);
if (!entry)
@@ -126,7 +126,10 @@ static void __pollwait(struct file *filp
entry->filp = filp;
entry->wait_address = wait_address;
init_waitqueue_entry(&entry->wait, current);
- add_wait_queue(wait_address, &entry->wait);
+ if (exclusive)
+ add_wait_queue_exclusive(wait_address, &entry->wait);
+ else
+ add_wait_queue(wait_address, &entry->wait);
}

#define FDS_IN(fds, n) (fds->in + n)
Index: linux-2.6.24-rc6-memnotify/include/linux/poll.h
===================================================================
--- linux-2.6.24-rc6-memnotify.orig/include/linux/poll.h 2007-12-30 02:09:16.000000000 +0900
+++ linux-2.6.24-rc6-memnotify/include/linux/poll.h 2007-12-30 02:41:35.000000000 +0900
@@ -28,7 +28,8 @@ struct poll_table_struct;
/*
* structures and helpers for f_op->poll implementations
*/
-typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
+typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
+ struct poll_table_struct *, int);

typedef struct poll_table_struct {
poll_queue_proc qproc;
@@ -37,7 +38,13 @@ typedef struct poll_table_struct {
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && wait_address)
- p->qproc(filp, wait_address, p);
+ p->qproc(filp, wait_address, p, 0);
+}
+
+static inline void poll_wait_exclusive(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+ if (p && wait_address)
+ p->qproc(filp, wait_address, p, 1);
}

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)


2008-01-15 01:01:19

by KOSAKI Motohiro

[permalink] [raw]
Subject: [RFC][PATCH 2/5] introduce wake_up_locked_nr() new API


Introduce the new APIs wake_up_locked_nr() and wake_up_locked_all().
They are similar to wake_up_nr() and wake_up_all(), but they do not take the wait queue lock.

Signed-off-by: Marcelo Tosatti <[email protected]>
Signed-off-by: KOSAKI Motohiro <[email protected]>

---
include/linux/wait.h | 7 +++++--
kernel/sched.c | 5 +++--
2 files changed, 8 insertions(+), 4 deletions(-)

Index: linux-2.6.24-rc6-mm1-memnotify/include/linux/wait.h
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/include/linux/wait.h 2008-01-13 16:43:04.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/include/linux/wait.h 2008-01-13 16:52:21.000000000 +0900
@@ -142,7 +142,7 @@ static inline void __remove_wait_queue(w
}

void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key));
-extern void FASTCALL(__wake_up_locked(wait_queue_head_t *q, unsigned int mode));
+void FASTCALL(__wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr, void *key));
extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
void FASTCALL(__wake_up_bit(wait_queue_head_t *, void *, int));
int FASTCALL(__wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned));
@@ -155,7 +155,10 @@ wait_queue_head_t *FASTCALL(bit_waitqueu
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
-#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL)
+
+#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1, NULL)
+#define wake_up_locked_nr(x, nr) __wake_up_locked((x), TASK_NORMAL, nr, NULL)
+#define wake_up_locked_all(x) __wake_up_locked((x), TASK_NORMAL, 0, NULL)

#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
Index: linux-2.6.24-rc6-mm1-memnotify/kernel/sched.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/kernel/sched.c 2008-01-13 16:42:22.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/kernel/sched.c 2008-01-13 16:53:28.000000000 +0900
@@ -3837,9 +3837,10 @@ EXPORT_SYMBOL(__wake_up);
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
{
- __wake_up_common(q, mode, 1, 0, NULL);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
}

/**

2008-01-15 01:02:20

by KOSAKI Motohiro

[permalink] [raw]
Subject: [RFC][PATCH 3/5] add /dev/mem_notify device

This is the core of this patch series.
Add the /dev/mem_notify device to notify user processes of low memory.

<usage example>

fd = open("/dev/mem_notify", O_RDONLY);
if (fd < 0) {
exit(1);
}
pollfds.fd = fd;
pollfds.events = POLLIN;
pollfds.revents = 0;
err = poll(&pollfds, 1, -1); // wake up at low memory

...
</usage example>

Signed-off-by: Marcelo Tosatti <[email protected]>
Signed-off-by: KOSAKI Motohiro <[email protected]>

---
drivers/char/mem.c | 6 ++
include/linux/mem_notify.h | 41 ++++++++++++++++
include/linux/mmzone.h | 1
mm/Makefile | 2
mm/mem_notify.c | 109 +++++++++++++++++++++++++++++++++++++++++++++
mm/page_alloc.c | 1
6 files changed, 159 insertions(+), 1 deletion(-)

Index: linux-2.6.24-rc6-mm1-memnotify/drivers/char/mem.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/drivers/char/mem.c 2008-01-13 16:56:54.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/drivers/char/mem.c 2008-01-13 16:57:10.000000000 +0900
@@ -34,6 +34,8 @@
# include <linux/efi.h>
#endif

+extern struct file_operations mem_notify_fops;
+
/*
* Architectures vary in how they handle caching for addresses
* outside of main memory.
@@ -854,6 +856,9 @@ static int memory_open(struct inode * in
filp->f_op = &oldmem_fops;
break;
#endif
+ case 13:
+ filp->f_op = &mem_notify_fops;
+ break;
default:
return -ENXIO;
}
@@ -886,6 +891,7 @@ static const struct {
#ifdef CONFIG_CRASH_DUMP
{12,"oldmem", S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
#endif
+ {13,"mem_notify", S_IRUGO, &mem_notify_fops},
};

static struct class *mem_class;
Index: linux-2.6.24-rc6-mm1-memnotify/include/linux/mem_notify.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24-rc6-mm1-memnotify/include/linux/mem_notify.h 2008-01-13 16:57:10.000000000 +0900
@@ -0,0 +1,42 @@
+/*
+ * Notify applications of memory pressure via /dev/mem_notify
+ *
+ * Copyright (C) 2008 Marcelo Tosatti <[email protected]>,
+ * KOSAKI Motohiro <[email protected]>
+ *
+ * Released under the GPL, see the file COPYING for details.
+ */
+
+#ifndef _LINUX_MEM_NOTIFY_H
+#define _LINUX_MEM_NOTIFY_H
+
+#define MEM_NOTIFY_FREQ (HZ/5)
+
+extern atomic_long_t last_mem_notify;
+
+extern void __memory_pressure_notify(struct zone *zone, int pressure);
+
+
+static inline void memory_pressure_notify(struct zone *zone, int pressure)
+{
+ unsigned long target;
+ unsigned long pages_high, pages_free, pages_reserve;
+
+ if (pressure) {
+ target = atomic_long_read(&last_mem_notify) + MEM_NOTIFY_FREQ;
+ if (likely(time_before(jiffies, target)))
+ return;
+
+ pages_high = zone->pages_high;
+ pages_free = zone_page_state(zone, NR_FREE_PAGES);
+ pages_reserve = zone->lowmem_reserve[MAX_NR_ZONES-1];
+ if (unlikely(pages_free > (pages_high+pages_reserve)*2))
+ return;
+
+ } else if (likely(!zone->mem_notify_status))
+ return;
+
+ __memory_pressure_notify(zone, pressure);
+}
+
+#endif /* _LINUX_MEM_NOTIFY_H */
Index: linux-2.6.24-rc6-mm1-memnotify/include/linux/mmzone.h
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/include/linux/mmzone.h 2008-01-13 16:56:54.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/include/linux/mmzone.h 2008-01-13 16:57:10.000000000 +0900
@@ -283,6 +283,7 @@ struct zone {
*/
int prev_priority;

+ int mem_notify_status;

ZONE_PADDING(_pad2_)
/* Rarely used or read-mostly fields */
Index: linux-2.6.24-rc6-mm1-memnotify/mm/Makefile
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/mm/Makefile 2008-01-13 16:56:54.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/mm/Makefile 2008-01-13 16:57:10.000000000 +0900
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o
page_alloc.o page-writeback.o pdflush.o \
readahead.o swap.o truncate.o vmscan.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o $(mmu-y)
+ page_isolation.o mem_notify.o $(mmu-y)

obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
obj-$(CONFIG_BOUNCE) += bounce.o
Index: linux-2.6.24-rc6-mm1-memnotify/mm/mem_notify.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.24-rc6-mm1-memnotify/mm/mem_notify.c 2008-01-13 17:25:39.000000000 +0900
@@ -0,0 +1,109 @@
+/*
+ * Notify applications of memory pressure via /dev/mem_notify
+ *
+ * Copyright (C) 2008 Marcelo Tosatti <[email protected]>,
+ * KOSAKI Motohiro <[email protected]>
+ *
+ * Released under the GPL, see the file COPYING for details.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+
+#include <asm/atomic.h>
+
+#define PROC_WAKEUP_GUARD (10*HZ)
+
+struct mem_notify_file_info {
+ unsigned long last_proc_notify;
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
+static atomic_long_t nr_under_memory_pressure_zones = ATOMIC_LONG_INIT(0);
+static atomic_t nr_watcher_task = ATOMIC_INIT(0);
+
+atomic_long_t last_mem_notify = ATOMIC_LONG_INIT(INITIAL_JIFFIES);
+
+void __memory_pressure_notify(struct zone* zone, int pressure)
+{
+ int nr_wakeup;
+ int flags;
+
+ spin_lock_irqsave(&mem_wait.lock, flags);
+
+ if (pressure != zone->mem_notify_status) {
+ long val = pressure ? 1 : -1;
+ atomic_long_add(val, &nr_under_memory_pressure_zones);
+ zone->mem_notify_status = pressure;
+ }
+
+ if (pressure) {
+ nr_wakeup = max_t(int, atomic_read(&nr_watcher_task)>>4, 100);
+ atomic_long_set(&last_mem_notify, jiffies);
+ wake_up_locked_nr(&mem_wait, nr_wakeup);
+ }
+
+ spin_unlock_irqrestore(&mem_wait.lock, flags);
+}
+
+static int mem_notify_open(struct inode *inode, struct file *file)
+{
+ struct mem_notify_file_info *info;
+ int err = 0;
+
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ info->last_proc_notify = INITIAL_JIFFIES;
+ file->private_data = info;
+ atomic_inc(&nr_watcher_task);
+out:
+ return err;
+}
+
+static int mem_notify_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ atomic_dec(&nr_watcher_task);
+ return 0;
+}
+
+static unsigned int mem_notify_poll(struct file *file, poll_table *wait)
+{
+ struct mem_notify_file_info *info = file->private_data;
+ unsigned long now = jiffies;
+ unsigned long timeout;
+ unsigned int retval = 0;
+
+ poll_wait_exclusive(file, &mem_wait, wait);
+
+ timeout = info->last_proc_notify + PROC_WAKEUP_GUARD;
+ if (time_before(now, timeout))
+ goto out;
+
+ if (atomic_long_read(&nr_under_memory_pressure_zones) != 0) {
+ info->last_proc_notify = now;
+ retval = POLLIN;
+ }
+
+out:
+ return retval;
+}
+
+struct file_operations mem_notify_fops = {
+ .open = mem_notify_open,
+ .release = mem_notify_release,
+ .poll = mem_notify_poll,
+};
+EXPORT_SYMBOL(mem_notify_fops);
Index: linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/mm/page_alloc.c 2008-01-13 16:56:54.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c 2008-01-13 17:25:15.000000000 +0900
@@ -3456,6 +3456,7 @@ static void __meminit free_area_init_cor
zone->zone_pgdat = pgdat;

zone->prev_priority = DEF_PRIORITY;
+ zone->mem_notify_status = 0;

zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
Index: linux-2.6.24-rc6-mm1-memnotify/Documentation/devices.txt
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/Documentation/devices.txt 2008-01-13 16:42:57.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/Documentation/devices.txt 2008-01-13 17:07:05.000000000 +0900
@@ -96,6 +96,7 @@ Your cooperation is appreciated.
11 = /dev/kmsg Writes to this come out as printk's
12 = /dev/oldmem Used by crashdump kernels to access
the memory of the kernel that crashed.
+ 13 = /dev/mem_notify Low memory notification.

1 block RAM disk
0 = /dev/ram0 First RAM disk

2008-01-15 01:03:25

by KOSAKI Motohiro

[permalink] [raw]
Subject: [RFC][PATCH 4/5] memory_pressure_notify() caller

The notification happens whenever the VM moves an
anonymous page to the inactive list - this is a pretty good indication
that there are unused anonymous pages present which will very likely be
swapped out soon.

The zone is judged to be out of trouble in the following situations:
o memory pressure decreases and the VM stops moving anonymous pages to the inactive list.
o free pages rise above (pages_high+lowmem_reserve)*2.


Signed-off-by: Marcelo Tosatti <[email protected]>
Signed-off-by: KOSAKI Motohiro <[email protected]>

---
mm/vmscan.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)

Index: linux-2.6.24-rc6-mm1-memnotify/mm/vmscan.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/mm/vmscan.c 2008-01-13 16:59:28.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/mm/vmscan.c 2008-01-13 17:03:58.000000000 +0900
@@ -963,6 +963,7 @@ static int calc_reclaim_mapped(struct sc
long distress;
long swap_tendency;
long imbalance;
+ int reclaim_mapped = 0;
int prev_priority;

if (scan_global_lru(sc) && zone_is_near_oom(zone))
@@ -1089,10 +1090,14 @@ static void shrink_active_list(unsigned
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
+ bool inactivated_anon = 0;

if (sc->may_swap)
reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);

+ if (!reclaim_mapped)
+ memory_pressure_notify(zone, 0);
+
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
@@ -1116,6 +1121,13 @@ static void shrink_active_list(unsigned
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
page_referenced(page, 0, sc->mem_cgroup)) {
+ /* deal with the case where there is no
+ * swap but an anonymous page would be
+ * moved to the inactive list.
+ */
+ if (!total_swap_pages && reclaim_mapped &&
+ PageAnon(page))
+ inactivated_anon = 1;
list_add(&page->lru, &l_active);
continue;
}
@@ -1123,8 +1135,12 @@ static void shrink_active_list(unsigned
list_add(&page->lru, &l_active);
continue;
}
+ if (PageAnon(page))
+ inactivated_anon = 1;
list_add(&page->lru, &l_inactive);
}
+ if (inactivated_anon)
+ memory_pressure_notify(zone, 1);

pagevec_init(&pvec, 1);
pgmoved = 0;
@@ -1158,6 +1174,8 @@ static void shrink_active_list(unsigned
pagevec_strip(&pvec);
spin_lock_irq(&zone->lru_lock);
}
+ if (!reclaim_mapped)
+ memory_pressure_notify(zone, 0);

pgmoved = 0;
while (!list_empty(&l_active)) {
Index: linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/mm/page_alloc.c 2008-01-13 16:57:10.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c 2008-01-13 17:04:34.000000000 +0900
@@ -44,6 +44,7 @@
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/memcontrol.h>
+#include <linux/mem_notify.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -435,6 +436,8 @@ static inline void __free_one_page(struc
unsigned long page_idx;
int order_size = 1 << order;
int migratetype = get_pageblock_migratetype(page);
+ unsigned long prev_free;
+ unsigned long notify_threshold;

if (unlikely(PageCompound(page)))
destroy_compound_page(page, order);
@@ -444,6 +447,7 @@ static inline void __free_one_page(struc
VM_BUG_ON(page_idx & (order_size - 1));
VM_BUG_ON(bad_range(zone, page));

+ prev_free = zone_page_state(zone, NR_FREE_PAGES);
__mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
while (order < MAX_ORDER-1) {
unsigned long combined_idx;
@@ -465,6 +469,13 @@ static inline void __free_one_page(struc
list_add(&page->lru,
&zone->free_area[order].free_list[migratetype]);
zone->free_area[order].nr_free++;
+
+ notify_threshold = (zone->pages_high +
+ zone->lowmem_reserve[MAX_NR_ZONES-1]) * 2;
+
+ if (unlikely((prev_free <= notify_threshold) &&
+ (zone_page_state(zone, NR_FREE_PAGES) > notify_threshold)))
+ memory_pressure_notify(zone, 0);
}

static inline int free_pages_check(struct page *page)

2008-01-15 01:04:29

by KOSAKI Motohiro

[permalink] [raw]
Subject: [RFC][PATCH 5/5] /proc/zoneinfo enhancement

Show the new member of struct zone in /proc/zoneinfo.

Signed-off-by: Marcelo Tosatti <[email protected]>
Signed-off-by: KOSAKI Motohiro <[email protected]>

---
mm/vmstat.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

Index: linux-2.6.24-rc6-mm1-memnotify/mm/vmstat.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/mm/vmstat.c 2008-01-13 16:42:54.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/mm/vmstat.c 2008-01-13 17:07:43.000000000 +0900
@@ -795,9 +795,11 @@ static void zoneinfo_show_print(struct s
seq_printf(m,
"\n all_unreclaimable: %u"
"\n prev_priority: %i"
+ "\n mem_notify_status: %i"
"\n start_pfn: %lu",
- zone_is_all_unreclaimable(zone),
+ zone_is_all_unreclaimable(zone),
zone->prev_priority,
+ zone->mem_notify_status,
zone->zone_start_pfn);
seq_putc(m, '\n');
}

2008-01-15 01:10:36

by Randy Dunlap

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

On Tue, 15 Jan 2008 10:01:21 +0900 KOSAKI Motohiro wrote:

> the core of this patch series.
> add /dev/mem_notify device for notification low memory to user process.
>
> <usage examle>
>
> fd = open("/dev/mem_notify", O_RDONLY);
> if (fd < 0) {
> exit(1);
> }
> pollfds.fd = fd;
> pollfds.events = POLLIN;
> pollfds.revents = 0;
> err = poll(&pollfds, 1, -1); // wake up at low memory
>
> ...
> </usage example>
>
> Signed-off-by: Marcelo Tosatti <[email protected]>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
>
> ---
> drivers/char/mem.c | 6 ++
> include/linux/mem_notify.h | 41 ++++++++++++++++
> include/linux/mmzone.h | 1
> mm/Makefile | 2
> mm/mem_notify.c | 109 +++++++++++++++++++++++++++++++++++++++++++++
> mm/page_alloc.c | 1
> 6 files changed, 159 insertions(+), 1 deletion(-)
>

Hi,

1/ I don't see the file below listed in the diffstat above...

2/ Where is the userspace interface information for the syscall?

> Index: linux-2.6.24-rc6-mm1-memnotify/Documentation/devices.txt
> ===================================================================
> --- linux-2.6.24-rc6-mm1-memnotify.orig/Documentation/devices.txt 2008-01-13 16:42:57.000000000 +0900
> +++ linux-2.6.24-rc6-mm1-memnotify/Documentation/devices.txt 2008-01-13 17:07:05.000000000 +0900
> @@ -96,6 +96,7 @@ Your cooperation is appreciated.
> 11 = /dev/kmsg Writes to this come out as printk's
> 12 = /dev/oldmem Used by crashdump kernels to access
> the memory of the kernel that crashed.
> + 13 = /dev/mem_notify Low memory notification.
>
> 1 block RAM disk
> 0 = /dev/ram0 First RAM disk


---
~Randy

2008-01-15 01:22:20

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

Hi randy

> Hi,
>
> 1/ I don't see the file below listed in the diffstat above...

Agghh...
Sorry, that is a mistake.
I will repost soon.

thanks.


> 2/ Where is the userspace interface information for the syscall?

No.
userspace interface is only poll(2).


2008-01-15 01:26:48

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

Hi

> > 1/ I don't see the file below listed in the diffstat above...
>
> Agghh...
> sorry, it is mistake.
> I repost soon.
>
> thanks.

the below diffstat is correct.
thanks!

------------------------------
Documentation/devices.txt | 1
drivers/char/mem.c | 6 ++
include/linux/mem_notify.h | 42 +++++++++++++++++
include/linux/mmzone.h | 1
mm/Makefile | 2
mm/mem_notify.c | 109 +++++++++++++++++++++++++++++++++++++++++++++
mm/page_alloc.c | 1
7 files changed, 161 insertions(+), 1 deletion(-)

2008-01-15 02:03:31

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

On Tue, 15 Jan 2008 10:02:30 +0900
KOSAKI Motohiro <[email protected]> wrote:

> +
> + notify_threshold = (zone->pages_high +
> + zone->lowmem_reserve[MAX_NR_ZONES-1]) * 2;
> +
Why MAX_NR_ZONES-1 ?


> + if (unlikely((prev_free <= notify_threshold) &&
> + (zone_page_state(zone, NR_FREE_PAGES) > notify_threshold)))
> + memory_pressure_notify(zone, 0);
> }

How about this
==
if (unlikely(zone->mem_notify_status && ...)


Thanks,
-Kame

2008-01-15 02:07:44

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

On Tue, 15 Jan 2008 10:01:21 +0900
KOSAKI Motohiro <[email protected]> wrote:

> + if (pressure) {
> + nr_wakeup = max_t(int, atomic_read(&nr_watcher_task)>>4, 100);
> + atomic_long_set(&last_mem_notify, jiffies);
> + wake_up_locked_nr(&mem_wait, nr_wakeup);
> + }
What is this for ? and Why ?
Are there too many waiters ?

Thanks
-Kame

2008-01-15 02:22:22

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

Hi Kame

> > + if (pressure) {
> > + nr_wakeup = max_t(int, atomic_read(&nr_watcher_task)>>4, 100);
> > + atomic_long_set(&last_mem_notify, jiffies);
> > + wake_up_locked_nr(&mem_wait, nr_wakeup);
> > + }
> What is this for ? and Why ?
> Are there too many waiters ?

My intent is to avoid a thundering herd.
100 is a heuristic value.

Also, too many wakeups cause too much memory to be freed.
I don't want that.

Of course, if any problem occurs, I will change it.
Do you dislike it?

2008-01-15 02:38:37

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

Hi KAME,

> > + notify_threshold = (zone->pages_high +
> > + zone->lowmem_reserve[MAX_NR_ZONES-1]) * 2;
> > +
> Why MAX_NR_ZONES-1 ?

This is intended to use the largest lowmem_reserve value.

In the normal case,
shrink_active_list() isn't called when free_pages > pages_high,
but it can occasionally happen just after memory is freed.

I don't want a spurious notification when the system has enough free memory.

related discussion
http://marc.info/?l=linux-mm&m=119878630211348&w=2


> > + if (unlikely((prev_free <= notify_threshold) &&
> > + (zone_page_state(zone, NR_FREE_PAGES) > notify_threshold)))
> > + memory_pressure_notify(zone, 0);
> > }
>
> How about this
> ==
> if (unlikely(zone->mem_notify_status && ...)

Nice idea.
I will apply it in the next post.

thank you!



2008-01-15 02:56:58

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

On Tue, 15 Jan 2008 11:37:48 +0900
KOSAKI Motohiro <[email protected]> wrote:

> Hi KAME,
>
> > > + notify_threshold = (zone->pages_high +
> > > + zone->lowmem_reserve[MAX_NR_ZONES-1]) * 2;
> > > +
> > Why MAX_NR_ZONES-1 ?
>
> this is intent to max lowmem_reserve.
>
Ah, my point is.. how about this ?
==
if (page_zoneid(page) != ZONE_DMA)
notify_threshold = zone->pages_high +
zone->lowmem_reserve[page_zoneid(page) - 1] * 2;
==

Thanks,
-Kame

2008-01-15 02:57:25

by Rik van Riel

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

On Tue, 15 Jan 2008 11:20:56 +0900
KOSAKI Motohiro <[email protected]> wrote:

> Hi Kame
>
> > > + if (pressure) {
> > > + nr_wakeup = max_t(int, atomic_read(&nr_watcher_task)>>4, 100);
> > > + atomic_long_set(&last_mem_notify, jiffies);
> > > + wake_up_locked_nr(&mem_wait, nr_wakeup);
> > > + }
> > What is this for ? and Why ?
> > Are there too many waiters ?
>
> my intent is for avoid thundering herd.
> 100 is heuristic value.
>
> and too many wakeup cause too much memory freed.
> I don't want it.
>
> of course, if any problem happened, I will change.

I agree with you. Your code looks like it could be a reasonable
heuristic, but the only way to really find that out is to test
the code on live systems under varying workloads.

Maybe we need to wake up fewer tasks more often, maybe we are
better off waking up more tasks but fewer times. Either way,
at this time we simply do not know and can stick with your current
code.

--
All rights reversed.

2008-01-15 03:09:15

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

Hi Kame,

> > > > + notify_threshold = (zone->pages_high +
> > > > + zone->lowmem_reserve[MAX_NR_ZONES-1]) * 2;
> > > > +
> > > Why MAX_NR_ZONES-1 ?
> >
> > this is intent to max lowmem_reserve.
> >
> Ah, my point is.. how about this ?
> ==
> if (page_zoneid(page) != ZONE_DMA)
> notify_threshold = zone->pages_high +
> zone->lowmem_reserve[page_zoneid(page) - 1] * 2;

That is a very good point.

But judging by zone size may be better.
On some 64-bit systems, ZONE_DMA is 4GB,
and small-memory systems can't ignore it.

Fortunately, the zone size check can be done in free_area_init_core().


- kosaki


2008-01-15 10:48:14

by Alan

[permalink] [raw]
Subject: Re: [RFC][PATCH 5/5] /proc/zoneinfo enhancement

On Tue, 15 Jan 2008 10:03:23 +0900
KOSAKI Motohiro <[email protected]> wrote:

> show new member of zone struct by /proc/zoneinfo.
>
> Signed-off-by: Marcelo Tosatti <[email protected]>
> Signed-off-by: KOSAKI Motohiro <[email protected]>

Minor NAK - Please put new fields at the end - it makes it less likely to
break badly written tools.

2008-01-15 10:49:54

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 5/5] /proc/zoneinfo enhancement

Hi alan

> > show new member of zone struct by /proc/zoneinfo.
> >
> > Signed-off-by: Marcelo Tosatti <[email protected]>
> > Signed-off-by: KOSAKI Motohiro <[email protected]>
>
> Minor NAK - Please put new fields at the end - it makes it less likely to
> break badly written tools.

Oh, I see.
I will apply your suggestion in the next post.

Thanks!


- kosaki

2008-01-15 10:50:12

by Alan

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

On Tue, 15 Jan 2008 10:01:21 +0900
KOSAKI Motohiro <[email protected]> wrote:

> the core of this patch series.
> add /dev/mem_notify device for notification low memory to user process.

As you only wake one process how would you use this API from processes
which want to monitor and can free memory under load. Also what fairness
guarantees are there...

Alan

2008-01-15 10:59:32

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device


> > the core of this patch series.
> > add /dev/mem_notify device for notification low memory to user process.
>
> As you only wake one process how would you use this API from processes
> which want to monitor and can free memory under load. Also what fairness
> guarantees are there...

Sorry, I don't understand what you mean by fairness.
Could you tell me more?



2008-01-15 11:23:58

by Alan

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

On Tue, 15 Jan 2008 19:59:02 +0900
KOSAKI Motohiro <[email protected]> wrote:

>
> > > the core of this patch series.
> > > add /dev/mem_notify device for notification low memory to user process.
> >
> > As you only wake one process how would you use this API from processes
> > which want to monitor and can free memory under load. Also what fairness
> > guarantees are there...
>
> Sorry, I don't make sense what you mean fairness.
> Could you tell more?

If you have two processes each waiting on mem_notify is it not possible
that one of them will keep being the one woken up and the other will
remain stuck ?

It also appears there is no way to wait for memory shortages (processes
that can free memory easily) only for memory to start appearing.

2008-01-15 11:48:58

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

Hi Alan

Thank you for the kind explanation.

> > > > the core of this patch series.
> > > > add /dev/mem_notify device for notification low memory to user process.
> > >
> > > As you only wake one process how would you use this API from processes
> > > which want to monitor and can free memory under load. Also what fairness
> > > guarantees are there...
> >
> > Sorry, I don't make sense what you mean fairness.
> > Could you tell more?
>
> If you have two processes each waiting on mem_notify is it not possible
> that one of them will keep being the one woken up and the other will
> remain stuck ?

current wake up order is simply FIFO by poll(2) called.
because the VM cannot know how much amount each process can do in free.
the process rss and freeable memory is not proportional.

thus I adopt wake up one after another until restoration memory shortage.


> It also appears there is no way to wait for memory shortages (processes
> that can free memory easily) only for memory to start appearing.

poll() with never timeout don't fill your requirement?
to be honest, maybe I don't understand your afraid yet. sorry.


-kosaki

2008-01-15 12:03:36

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

Hi Alan,

On Tue, Jan 15, 2008 at 11:20:27AM +0000, Alan Cox wrote:
> On Tue, 15 Jan 2008 19:59:02 +0900
> KOSAKI Motohiro <[email protected]> wrote:
>
> >
> > > > the core of this patch series.
> > > > add /dev/mem_notify device for notification low memory to user process.
> > >
> > > As you only wake one process how would you use this API from processes
> > > which want to monitor and can free memory under load. Also what fairness
> > > guarantees are there...
> >
> > Sorry, I don't make sense what you mean fairness.
> > Could you tell more?
>
> If you have two processes each waiting on mem_notify is it not possible
> that one of them will keep being the one woken up and the other will
> remain stuck ?

Tasks are added to the end of waitqueue->task_list through
add_wait_queue_exclusive, and waken up from the start of the list. So
I don't think that can happen (its FIFO).

> It also appears there is no way to wait for memory shortages (processes
> that can free memory easily) only for memory to start appearing.

The notification is sent once the VM starts moving anonymous pages to
the inactive list (meaning there is memory shortage). So polling on the
device is all about waiting for memory shortage.

Or do you mean something else?

2008-01-15 13:46:12

by Alan

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

> current wake up order is simply FIFO by poll(2) called.
> because the VM cannot know how much amount each process can do in free.
> the process rss and freeable memory is not proportional.

Ok this makes sense.
>
> thus I adopt wake up one after another until restoration memory shortage.
>
>
> > It also appears there is no way to wait for memory shortages (processes
> > that can free memory easily) only for memory to start appearing.
>
> poll() with never timeout don't fill your requirement?
> to be honest, maybe I don't understand your afraid yet. sorry.

My misunderstanding. There is in fact no way to wait for memory to become
available. The poll() method you provide works nicely waiting for
shortages and responding to them by freeing memory.

It would be interesting to add FASYNC support to this. Some users have
asked for a signal when memory shortage occurs (as IBM AIX provides
this). FASYNC support would allow a SIGIO to be delivered from this
device when memory shortages occurred. Poll as you have implemented is of
course the easier way for a program to monitor memory and a better
interface.

Alan

2008-01-15 13:46:28

by Alan

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

> Tasks are added to the end of waitqueue->task_list through
> add_wait_queue_exclusive, and waken up from the start of the list. So
> I don't think that can happen (its FIFO).

Agreed

2008-01-15 22:16:36

by Pavel Machek

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

Hi!

> the core of this patch series.
> add /dev/mem_notify device for notification low memory to user process.
>
> <usage example>
>
> fd = open("/dev/mem_notify", O_RDONLY);
> if (fd < 0) {
> exit(1);
> }
> pollfds.fd = fd;
> pollfds.events = POLLIN;
> pollfds.revents = 0;
> err = poll(&pollfds, 1, -1); // wake up at low memory
>
> ...
> </usage example>

Nice, this is really needed for openmoko, zaurus, etc....

But this changelog needs to go into Documentation/...

...and /dev/mem_notify is really a bad name. /dev/memory_low?
/dev/oom?

Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2008-01-15 22:55:29

by Daniel Spång

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

Hi,

On 1/15/08, KOSAKI Motohiro <[email protected]> wrote:
> the notification point to happen whenever the VM moves an
> anonymous page to the inactive list - this is a pretty good indication
> that there are unused anonymous pages present which will be very likely
> swapped out soon.

> + /* deal with the case where there is no
> + * swap but an anonymous page would be
> + * moved to the inactive list.
> + */
> + if (!total_swap_pages && reclaim_mapped &&
> + PageAnon(page))
> + inactivated_anon = 1;

As you know I have had some concerns regarding a too early
notification in a swapless system.

I did a test with a populated page cache in a swapless system:

$ cat /bigfile > /dev/null # populate page cache
$ cat /proc/meminfo
MemTotal: 1037040 kB
MemFree: 113976 kB
Buffers: 1068 kB
Cached: 907552 kB
SwapCached: 0 kB
Active: 11116 kB
Inactive: 903968 kB
HighTotal: 130992 kB
HighFree: 252 kB
LowTotal: 906048 kB
LowFree: 113724 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Dirty: 36 kB
Writeback: 0 kB
AnonPages: 6484 kB
Mapped: 1216 kB
Slab: 4024 kB
SReclaimable: 864 kB
SUnreclaim: 3160 kB
PageTables: 444 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
CommitLimit: 518520 kB
Committed_AS: 18816 kB
VmallocTotal: 114680 kB
VmallocUsed: 904 kB
VmallocChunk: 113672 kB

Start to allocate memory, 10 MB every second, exit on notification.

$ cat /proc/meminfo # just after notification
MemTotal: 1037040 kB
MemFree: 123468 kB
Buffers: 876 kB
Cached: 897976 kB
SwapCached: 0 kB
Active: 12984 kB
Inactive: 892332 kB
HighTotal: 130992 kB
HighFree: 1064 kB
LowTotal: 906048 kB
LowFree: 122404 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Dirty: 0 kB
Writeback: 0 kB
AnonPages: 6484 kB
Mapped: 1220 kB
Slab: 4012 kB
SReclaimable: 864 kB
SUnreclaim: 3148 kB
PageTables: 448 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
CommitLimit: 518520 kB
Committed_AS: 18816 kB
VmallocTotal: 114680 kB
VmallocUsed: 904 kB
VmallocChunk: 113672 kB

The notification fires after only ~100 MB allocated, i.e., when page
reclaim is beginning to nag from page cache. Isn't this a bit early?
Repeating the test with swap enabled results in a notification after
~600 MB allocated, which is more reasonable and just before the system
starts to swap.

Cheers,
Daniel

2008-01-15 22:59:45

by Rik van Riel

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

On Tue, 15 Jan 2008 23:55:17 +0100
"Daniel Spång" <[email protected]> wrote:

> The notification fires after only ~100 MB allocated, i.e., when page
> reclaim is beginning to nag from page cache. Isn't this a bit early?
> Repeating the test with swap enabled results in a notification after
> ~600 MB allocated, which is more reasonable and just before the system
> starts to swap.

Your issue may have more to do with the fact that the
highmem zone is 128MB in size and some balancing issues
between __alloc_pages and try_to_free_pages.

--
All rights reversed.

2008-01-15 23:40:10

by Daniel Spång

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

On 1/15/08, Rik van Riel <[email protected]> wrote:
>
> On Tue, 15 Jan 2008 23:55:17 +0100
> "Daniel Spång" <[email protected]> wrote:
>
> > The notification fires after only ~100 MB allocated, i.e., when page
> > reclaim is beginning to nag from page cache. Isn't this a bit early?
> > Repeating the test with swap enabled results in a notification after
> > ~600 MB allocated, which is more reasonable and just before the system
> > starts to swap.
>
> Your issue may have more to do with the fact that the
> highmem zone is 128MB in size and some balancing issues
> between __alloc_pages and try_to_free_pages.

I don't think so. I ran the test again without highmem and noticed the
same behaviour:

$ cat /proc/meminfo
MemTotal: 895876 kB
MemFree: 111292 kB
Buffers: 924 kB
Cached: 768664 kB
SwapCached: 0 kB
Active: 9196 kB
Inactive: 767480 kB
HighTotal: 0 kB
HighFree: 0 kB
LowTotal: 895876 kB
LowFree: 111292 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Dirty: 32 kB
Writeback: 0 kB
AnonPages: 7108 kB
Mapped: 1224 kB
Slab: 4288 kB
SReclaimable: 1316 kB
SUnreclaim: 2972 kB
PageTables: 448 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
CommitLimit: 447936 kB
Committed_AS: 19676 kB
VmallocTotal: 122872 kB
VmallocUsed: 904 kB
VmallocChunk: 121864 kB

Start to allocate memory, 10 MB every second, exit on notification
which happened after 110 MB.

$ cat /proc/meminfo #after
MemTotal: 895876 kB
MemFree: 116748 kB
Buffers: 904 kB
Cached: 762944 kB
SwapCached: 0 kB
Active: 12864 kB
Inactive: 758064 kB
HighTotal: 0 kB
HighFree: 0 kB
LowTotal: 895876 kB
LowFree: 116748 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Dirty: 4 kB
Writeback: 0 kB
AnonPages: 7108 kB
Mapped: 1224 kB
Slab: 4284 kB
SReclaimable: 1316 kB
SUnreclaim: 2968 kB
PageTables: 448 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
CommitLimit: 447936 kB
Committed_AS: 19676 kB
VmallocTotal: 122872 kB
VmallocUsed: 904 kB
VmallocChunk: 121864 kB

2008-01-16 01:50:16

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

Hi Daniel

> > > The notification fires after only ~100 MB allocated, i.e., when page
> > > reclaim is beginning to nag from page cache. Isn't this a bit early?
> > > Repeating the test with swap enabled results in a notification after
> > > ~600 MB allocated, which is more reasonable and just before the system
> > > starts to swap.
> >
> > Your issue may have more to do with the fact that the
> > highmem zone is 128MB in size and some balancing issues
> > between __alloc_pages and try_to_free_pages.
>
> I don't think so. I ran the test again without highmem and noticed the
> same behaviour:

Thank you for good point out!
Could you please post your test program and reproduced method?

Unfortunately,
my simple test works fine on a swapless system ;-)

thanks.


2008-01-16 01:58:18

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

Hi Pavel

> > err = poll(&pollfds, 1, -1); // wake up at low memory
> >
> > ...
> > </usage example>
>
> Nice, this is really needed for openmoko, zaurus, etc....
>
> But this changelog needs to go into Documentation/...
>
> ...and /dev/mem_notify is really a bad name. /dev/memory_low?
> /dev/oom?

thank you for your kindful advise.

but..

to be honest, my english is very limited.
I can't make judgments name is good or not.

Marcelo, What do you think his idea?



2008-01-16 02:44:04

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

Hi Alan

> > > It also appears there is no way to wait for memory shortages (processes
> > > that can free memory easily) only for memory to start appearing.
> >
> > poll() with never timeout don't fill your requirement?
> > to be honest, maybe I don't understand your afraid yet. sorry.
>
> My misunderstanding. There is in fact no way to wait for memory to become
> available. The poll() method you provide works nicely waiting for
> shortages and responding to them by freeing memory.
>
> It would be interesting to add FASYNC support to this. Some users have
> asked for a signal when memory shortage occurs (as IBM AIX provides
> this). FASYNC support would allow a SIGIO to be delivered from this
> device when memory shortages occurred. Poll as you have implemented is of
> course the easier way for a program to monitor memory and a better
> interface.

OK.
I will try to implement it in mem_notify v5.


- kosaki

2008-01-16 04:11:22

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

On Wed, Jan 16, 2008 at 10:57:16AM +0900, KOSAKI Motohiro wrote:
> Hi Pavel
>
> > > err = poll(&pollfds, 1, -1); // wake up at low memory
> > >
> > > ...
> > > </usage example>
> >
> > Nice, this is really needed for openmoko, zaurus, etc....
> >
> > But this changelog needs to go into Documentation/...
> >
> > ...and /dev/mem_notify is really a bad name. /dev/memory_low?
> > /dev/oom?
>
> thank you for your kindful advise.
>
> but..
>
> to be honest, my english is very limited.
> I can't make judgments name is good or not.
>
> Marcelo, What do you think his idea?

"mem_notify" sounds alright, but I don't really care.

Notify:

To give notice to; inform: notified the citizens of the curfew by
posting signs.

2008-01-16 11:04:03

by Daniel Spång

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

On 1/16/08, KOSAKI Motohiro <[email protected]> wrote:
> Hi Daniel
>
> > > > The notification fires after only ~100 MB allocated, i.e., when page
> > > > reclaim is beginning to nag from page cache. Isn't this a bit early?
> > > > Repeating the test with swap enabled results in a notification after
> > > > ~600 MB allocated, which is more reasonable and just before the system
> > > > starts to swap.
> > >
> > > Your issue may have more to do with the fact that the
> > > highmem zone is 128MB in size and some balancing issues
> > > between __alloc_pages and try_to_free_pages.
> >
> > I don't think so. I ran the test again without highmem and noticed the
> > same behaviour:
>
> Thank you for good point out!
> Could you please post your test program and reproduced method?

Sure:

1. Fill almost all available memory with page cache in a system without swap.
2. Run attached alloc-test program.
3. Notification fires when page cache is reclaimed.

Example:

$ cat /bigfile > /dev/null
$ cat /proc/meminfo
MemTotal: 895876 kB
MemFree: 94272 kB
Buffers: 884 kB
Cached: 782868 kB
SwapCached: 0 kB
Active: 15356 kB
Inactive: 778000 kB
HighTotal: 0 kB
HighFree: 0 kB
LowTotal: 895876 kB
LowFree: 94272 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Dirty: 0 kB
Writeback: 0 kB
AnonPages: 9624 kB
Mapped: 1352 kB
Slab: 4220 kB
SReclaimable: 1168 kB
SUnreclaim: 3052 kB
PageTables: 528 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
CommitLimit: 447936 kB
Committed_AS: 28988 kB
VmallocTotal: 122872 kB
VmallocUsed: 904 kB
VmallocChunk: 121864 kB
$ ./test-alloc
---------
Got notification, allocated 90 MB
$ cat /proc/meminfo
MemTotal: 895876 kB
MemFree: 101960 kB
Buffers: 888 kB
Cached: 775200 kB
SwapCached: 0 kB
Active: 15356 kB
Inactive: 770336 kB
HighTotal: 0 kB
HighFree: 0 kB
LowTotal: 895876 kB
LowFree: 101960 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Dirty: 28 kB
Writeback: 0 kB
AnonPages: 9624 kB
Mapped: 1352 kB
Slab: 4224 kB
SReclaimable: 1168 kB
SUnreclaim: 3056 kB
PageTables: 532 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
CommitLimit: 447936 kB
Committed_AS: 28988 kB
VmallocTotal: 122872 kB
VmallocUsed: 904 kB
VmallocChunk: 121864 kB


Attachments:
(No filename) (2.50 kB)
alloc-test.c (1.70 kB)
Download all attachments

2008-01-16 11:42:35

by Pavel Machek

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

On Wed 2008-01-16 02:13:32, Marcelo Tosatti wrote:
> On Wed, Jan 16, 2008 at 10:57:16AM +0900, KOSAKI Motohiro wrote:
> > Hi Pavel
> >
> > > > err = poll(&pollfds, 1, -1); // wake up at low memory
> > > >
> > > > ...
> > > > </usage example>
> > >
> > > Nice, this is really needed for openmoko, zaurus, etc....
> > >
> > > But this changelog needs to go into Documentation/...
> > >
> > > ...and /dev/mem_notify is really a bad name. /dev/memory_low?
> > > /dev/oom?
> >
> > thank you for your kindful advise.
> >
> > but..
> >
> > to be honest, my english is very limited.
> > I can't make judgments name is good or not.
> >
> > Marcelo, What do you think his idea?
>
> "mem_notify" sounds alright, but I don't really care.
>
> Notify:
>
> To give notice to; inform: notified the citizens of the curfew by
> posting signs.

I'd read mem_notify as "tell me when new memory is unplugged" or
something. /dev/oom_notify? Plus, /dev/ names usually do not have "_"
in them.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2008-01-16 11:51:36

by Daniel Spång

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

On 1/16/08, Pavel Machek <[email protected]> wrote:
> On Wed 2008-01-16 02:13:32, Marcelo Tosatti wrote:
> > On Wed, Jan 16, 2008 at 10:57:16AM +0900, KOSAKI Motohiro wrote:
> > > Hi Pavel
> > >
> > > > > err = poll(&pollfds, 1, -1); // wake up at low memory
> > > > >
> > > > > ...
> > > > > </usage example>
> > > >
> > > > Nice, this is really needed for openmoko, zaurus, etc....
> > > >
> > > > But this changelog needs to go into Documentation/...
> > > >
> > > > ...and /dev/mem_notify is really a bad name. /dev/memory_low?
> > > > /dev/oom?
> > >
> > > thank you for your kindful advise.
> > >
> > > but..
> > >
> > > to be honest, my english is very limited.
> > > I can't make judgments name is good or not.
> > >
> > > Marcelo, What do you think his idea?
> >
> > "mem_notify" sounds alright, but I don't really care.
> >
> > Notify:
> >
> > To give notice to; inform: notified the citizens of the curfew by
> > posting signs.
>
> I'd read mem_notify as "tell me when new memory is unplugged" or
> something. /dev/oom_notify? Plus, /dev/ names usually do not have "_"
> in them.

I don't think we should use oom in the name, since the notification is
sent long before oom.

2008-01-17 03:05:59

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 3/5] add /dev/mem_notify device

Hi

> > I'd read mem_notify as "tell me when new memory is unplugged" or
> > something. /dev/oom_notify? Plus, /dev/ names usually do not have "_"
> > in them.
>
> I don't think we should use oom in the name, since the notification is
> sent long before oom.

OK, I won't change the name.
Of course, I will change it soon if anyone proposes a better name.

thanks

- kosaki

2008-01-17 03:27:19

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

Hi Daniel

> > Thank you for good point out!
> > Could you please post your test program and reproduced method?
>
> Sure:
>
> 1. Fill almost all available memory with page cache in a system without swap.
> 2. Run attached alloc-test program.
> 3. Notification fires when page cache is reclaimed.

Unfortunately, I can't reproduce it.

my machine
CPU: Pentium4 2.8GHz with HT
memory: 512M


1. I doubt ZONE_DMA, please shipment ignore zone_dma patch(below).
2. Could you please send your .config and /etc/sysctl.conf?
I hope more reproduce challenge.

thanks.

- kosaki




Signed-off-by: KOSAKI Motohiro <[email protected]>

---
include/linux/mem_notify.h | 3 +++
mm/page_alloc.c | 6 +++++-
2 files changed, 8 insertions(+), 1 deletion(-)

Index: linux-2.6.24-rc6-mm1-memnotify/include/linux/mem_notify.h
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/include/linux/mem_notify.h 2008-01-16 21:31:09.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/include/linux/mem_notify.h 2008-01-16 21:34:24.000000000 +0900
@@ -22,6 +22,9 @@ static inline void memory_pressure_notif
unsigned long target;
unsigned long pages_high, pages_free, pages_reserve;

+ if (unlikely(zone->mem_notify_status == -1))
+ return;
+
if (pressure) {
target = atomic_long_read(&last_mem_notify) + MEM_NOTIFY_FREQ;
if (likely(time_before(jiffies, target)))
Index: linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/mm/page_alloc.c 2008-01-13 19:50:27.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c 2008-01-16 21:41:58.000000000 +0900
@@ -3467,7 +3467,11 @@ static void __meminit free_area_init_cor
zone->zone_pgdat = pgdat;

zone->prev_priority = DEF_PRIORITY;
- zone->mem_notify_status = 0;
+
+ if (zone->present_pages < (pgdat->node_present_pages / 10))
+ zone->mem_notify_status = -1;
+ else
+ zone->mem_notify_status = 0;

zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);


2008-01-18 10:24:37

by Daniel Spång

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

On 1/17/08, KOSAKI Motohiro <[email protected]> wrote:
> Hi Daniel
>
> > > Thank you for good point out!
> > > Could you please post your test program and reproduced method?
> >
> > Sure:
> >
> > 1. Fill almost all available memory with page cache in a system without swap.
> > 2. Run attached alloc-test program.
> > 3. Notification fires when page cache is reclaimed.
>
> Unfortunately, I can't reproduce it.
>
> my machine
> CPU: Pentium4 2.8GHz with HT
> memory: 512M
>
>
> 1. I doubt ZONE_DMA, please shipment ignore zone_dma patch(below).
> 2. Could you please send your .config and /etc/sysctl.conf?
> I hope more reproduce challenge.
>
> thanks.
>
> - kosaki
>
>
>
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
>
> ---
> include/linux/mem_notify.h | 3 +++
> mm/page_alloc.c | 6 +++++-
> 2 files changed, 8 insertions(+), 1 deletion(-)
>
> Index: linux-2.6.24-rc6-mm1-memnotify/include/linux/mem_notify.h
> ===================================================================
> --- linux-2.6.24-rc6-mm1-memnotify.orig/include/linux/mem_notify.h
> 2008-01-16 21:31:09.000000000 +0900
> +++ linux-2.6.24-rc6-mm1-memnotify/include/linux/mem_notify.h
> 2008-01-16 21:34:24.000000000 +0900
> @@ -22,6 +22,9 @@ static inline void memory_pressure_notif
> unsigned long target;
> unsigned long pages_high, pages_free, pages_reserve;
>
> + if (unlikely(zone->mem_notify_status == -1))
> + return;
> +
> if (pressure) {
> target = atomic_long_read(&last_mem_notify) + MEM_NOTIFY_FREQ;
> if (likely(time_before(jiffies, target)))
> Index: linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.24-rc6-mm1-memnotify.orig/mm/page_alloc.c 2008-01-13
> 19:50:27.000000000 +0900
> +++ linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c 2008-01-16
> 21:41:58.000000000 +0900
> @@ -3467,7 +3467,11 @@ static void __meminit free_area_init_cor
> zone->zone_pgdat = pgdat;
>
> zone->prev_priority = DEF_PRIORITY;
> - zone->mem_notify_status = 0;
> +
> + if (zone->present_pages < (pgdat->node_present_pages / 10))
> + zone->mem_notify_status = -1;
> + else
> + zone->mem_notify_status = 0;
>
> zone_pcp_init(zone);
> INIT_LIST_HEAD(&zone->active_list);

Your patch above solves the problem I had with early notification.

Cheers,
Daniel

2008-01-18 10:31:29

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/5] memory_pressure_notify() caller

Hi!

> > 1. I doubt ZONE_DMA, please shipment ignore zone_dma patch(below).
>
> Your patch above solves the problem I had with early notification.

really!?
I am really happy!!

Thank you.


- kosaki