Hi!
/dev/mem_notify is a low memory notification device.
It can avoid swapping and OOM by cooperating with the user process.
You need not be annoyed by OOM any longer :)
Any comments are welcome!
related discussion:
--------------------------------------------------------------
LKML OOM notifications requirement discussion
http://www.gossamer-threads.com/lists/linux/kernel/832802?nohighlight=1#832802
OOM notifications patch [Marcelo Tosatti]
http://marc.info/?l=linux-kernel&m=119273914027743&w=2
mem notifications v3 [Marcelo Tosatti]
http://marc.info/?l=linux-mm&m=119852828327044&w=2
Thrashing notification patch [Daniel Spang]
http://marc.info/?l=linux-mm&m=119427416315676&w=2
Changelog
-------------------------------------------------
v3 -> v4 (by KOSAKI Motohiro)
o rebase to 2.6.24-rc6-mm1
o avoid wake up all.
o add judgement point to __free_one_page().
o add zone awareness.
v2 -> v3 (by Marcelo Tosatti)
o changes the notification point to happen whenever
the VM moves an anonymous page to the inactive list.
o implement notification rate limit.
v1(oom notify) -> v2 (by Marcelo Tosatti)
o name change
o notify timing change from just swap thrashing to
just before thrashing.
o also works with swapless device.
--
There are 2 ways of adding an item to a wait_queue:
1. add_wait_queue()
2. add_wait_queue_exclusive()
and add_wait_queue_exclusive() is a very useful API.
Unfortunately, no poll_wait_exclusive() counterpart to poll_wait() exists.
This means there is no way to wake up only 1 of the processes that polled:
wake_up() wakes up all processes sleeping via poll_wait(), not just 1.
This patch introduces a new poll_wait_exclusive() API to allow waking up only 1 process.
<example of usage>
unsigned int kosaki_poll(struct file *file,
struct poll_table_struct *wait)
{
poll_wait_exclusive(file, &kosaki_wait_queue, wait);
if (data_exist)
return POLLIN | POLLRDNORM;
return 0;
}
Signed-off-by: Marcelo Tosatti <marcelo@kvack.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
fs/eventpoll.c | 7 +++++--
fs/select.c | 9 ++++++---
include/linux/poll.h | 11 +++++++++--
3 files changed, 20 insertions(+), 7 deletions(-)
Index: linux-2.6.24-rc6-memnotify/fs/eventpoll.c
===================================================================
--- linux-2.6.24-rc6-memnotify.orig/fs/eventpoll.c 2007-12-30 02:08:58.000000000 +0900
+++ linux-2.6.24-rc6-memnotify/fs/eventpoll.c 2007-12-30 07:10:46.000000000 +0900
@@ -676,7 +676,7 @@ out_unlock:
* target file wakeup lists.
*/
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
- poll_table *pt)
+ poll_table *pt, int exclusive)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
@@ -685,7 +685,10 @@ static void ep_ptable_queue_proc(struct
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
- add_wait_queue(whead, &pwq->wait);
+ if (exclusive)
+ add_wait_queue_exclusive(whead, &pwq->wait);
+ else
+ add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
Index: ...introduce new API wake_up_locked_nr() and wake_up_locked_all(). it it similar as wake_up_nr() and wake_up_all(), but it doesn't lock. Signed-off-by: Marcelo Tosatti <marcelo@kvack.org> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> --- include/linux/wait.h | 7 +++++-- kernel/sched.c | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) Index: linux-2.6.24-rc6-mm1-memnotify/include/linux/wait.h =================================================================== --- linux-2.6.24-rc6-mm1-memnotify.orig/include/linux/wait.h 2008-01-13 16:43:04.000000000 +0900 +++ linux-2.6.24-rc6-mm1-memnotify/include/linux/wait.h 2008-01-13 16:52:21.000000000 +0900 @@ -142,7 +142,7 @@ static inline void __remove_wait_queue(w } void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key)); -extern void FASTCALL(__wake_up_locked(wait_queue_head_t *q, unsigned int mode)); +void FASTCALL(__wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr, void *key)); extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); void FASTCALL(__wake_up_bit(wait_queue_head_t *, void *, int)); int FASTCALL(__wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned)); @@ -155,7 +155,10 @@ wait_queue_head_t *FASTCALL(bit_waitqueu #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) -#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL) + +#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1, NULL) +#define wake_up_locked_nr(x, nr) __wake_up_locked((x), TASK_NORMAL, nr, NULL) +#define wake_up_locked_all(x) __wake_up_locked((x), TASK_NORMAL, 0, NULL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) Index: ...
This is the core of this patch series.
Add the /dev/mem_notify device for notifying user processes of low memory.
<usage example>
fd = open("/dev/mem_notify", O_RDONLY);
if (fd < 0) {
exit(1);
}
pollfds.fd = fd;
pollfds.events = POLLIN;
pollfds.revents = 0;
err = poll(&pollfds, 1, -1); // wake up at low memory
...
</usage example>
Signed-off-by: Marcelo Tosatti <marcelo@kvack.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
drivers/char/mem.c | 6 ++
include/linux/mem_notify.h | 41 ++++++++++++++++
include/linux/mmzone.h | 1
mm/Makefile | 2
mm/mem_notify.c | 109 +++++++++++++++++++++++++++++++++++++++++++++
mm/page_alloc.c | 1
6 files changed, 159 insertions(+), 1 deletion(-)
Index: linux-2.6.24-rc6-mm1-memnotify/drivers/char/mem.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/drivers/char/mem.c 2008-01-13 16:56:54.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/drivers/char/mem.c 2008-01-13 16:57:10.000000000 +0900
@@ -34,6 +34,8 @@
# include <linux/efi.h>
#endif
+extern struct file_operations mem_notify_fops;
+
/*
* Architectures vary in how they handle caching for addresses
* outside of main memory.
@@ -854,6 +856,9 @@ static int memory_open(struct inode * in
filp->f_op = &oldmem_fops;
break;
#endif
+ case 13:
+ filp->f_op = &mem_notify_fops;
+ break;
default:
return -ENXIO;
}
@@ -886,6 +891,7 @@ static const struct {
#ifdef CONFIG_CRASH_DUMP
{12,"oldmem", S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
#endif
+ {13,"mem_notify", S_IRUGO, &mem_notify_fops},
};
static struct class *mem_class;
Index: linux-2.6.24-rc6-mm1-memnotify/include/linux/mem_notify.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ ...Hi, 1/ I don't see the file below listed in the diffstat above... --- ~Randy --
Agghh... sorry, it is a mistake. I will repost soon. No, the userspace interface is only poll(2). --
the below diffstat is correct. thanks! ------------------------------ Documentation/devices.txt | 1 drivers/char/mem.c | 6 ++ include/linux/mem_notify.h | 42 +++++++++++++++++ include/linux/mmzone.h | 1 mm/Makefile | 2 mm/mem_notify.c | 109 +++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 1 7 files changed, 161 insertions(+), 1 deletion(-) --
On Tue, 15 Jan 2008 10:01:21 +0900 What is this for ? and Why ? Are there too many waiters ? Thanks -Kame --
My intent is to avoid a thundering herd. 100 is a heuristic value. Also, too many wakeups would cause too much memory to be freed, which I don't want. Of course, if any problem arises, I will change it. Do you dislike it? --
On Tue, 15 Jan 2008 11:20:56 +0900 I agree with you. Your code looks like it could be a reasonable heuristic, but the only way to really find that out is to test the code on live systems under varying workloads. Maybe we need to wake up fewer tasks more often, maybe we are better off waking up more tasks but fewer times. Either way, at this time we simply do not know and can stick with your current code. -- All rights reversed. --
On Tue, 15 Jan 2008 10:01:21 +0900 As you only wake one process how would you use this API from processes which want to monitor and can free memory under load. Also what fairness guarantees are there... Alan --
Sorry, I don't understand what you mean by fairness. Could you tell me more? --
On Tue, 15 Jan 2008 19:59:02 +0900 If you have two processes each waiting on mem_notify is it not possible that one of them will keep being the one woken up and the other will remain stuck ? It also appears there is no way to wait for memory shortages (processes that can free memory easily) only for memory to start appearing. --
Hi Alan, the current wake-up order is simply FIFO, in the order poll(2) was called, because the VM cannot know how much memory each process is able to free; a process's RSS and its freeable memory are not proportional. Doesn't poll() with no timeout fulfill your requirement? To be honest, maybe I don't understand your concern yet. Sorry. -kosaki --
My misunderstanding. There is in fact no way to wait for memory to become available. The poll() method you provide works nicely waiting for shortages and responding to them by freeing memory. It would be interesting to add FASYNC support to this. Some users have asked for a signal when memory shortage occurs (as IBM AIX provides this). FASYNC support would allow a SIGIO to be delivered from this device when memory shortages occurred. Poll as you have implemented is of course the easier way for a program to monitor memory and a better interface. Alan --
OK. I will challenge implement at mem_notify v5. - kosaki --
Hi Alan, Tasks are added to the end of waitqueue->task_list through add_wait_queue_exclusive, and waken up from the start of the list. So The notification is sent once the VM starts moving anonymous pages to the inactive list (meaning there is memory shortage). So polling on the device is all about waiting for memory shortage. Or do you mean something else? --
Nice, this is really needed for openmoko, zaurus, etc.... But this changelog needs to go into Documentation/... ...and /dev/mem_notify is really a bad name. /dev/memory_low? /dev/oom? Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
Thank you for your kind advice. But... to be honest, my English is very limited. I can't judge whether the name is good or not. Marcelo, what do you think of his idea? --
"mem_notify" sounds alright, but I don't really care. Notify: To give notice to; inform: notified the citizens of the curfew by posting signs. --
OK, I won't change the name. Of course, I will change it soon if anyone proposes a better name. thanks - kosaki --
The notification point is whenever the VM moves an
anonymous page to the inactive list - this is a pretty good indication
that there are unused anonymous pages present which will very likely be
swapped out soon.
Also, the system is judged to be out of trouble in the following situations:
o memory pressure decreases and the VM stops moving anonymous pages to the inactive list.
o free pages rise above (pages_high+lowmem_reserve)*2.
Signed-off-by: Marcelo Tosatti <marcelo@kvack.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
mm/vmscan.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
Index: linux-2.6.24-rc6-mm1-memnotify/mm/vmscan.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/mm/vmscan.c 2008-01-13 16:59:28.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/mm/vmscan.c 2008-01-13 17:03:58.000000000 +0900
@@ -963,6 +963,7 @@ static int calc_reclaim_mapped(struct sc
long distress;
long swap_tendency;
long imbalance;
+ int reclaim_mapped = 0;
int prev_priority;
if (scan_global_lru(sc) && zone_is_near_oom(zone))
@@ -1089,10 +1090,14 @@ static void shrink_active_list(unsigned
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
+ bool inactivated_anon = 0;
if (sc->may_swap)
reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
+ if (!reclaim_mapped)
+ memory_pressure_notify(zone, 0);
+
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
@@ -1116,6 +1121,13 @@ static void shrink_active_list(unsigned
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
page_referenced(page, 0, sc->mem_cgroup)) {
+ /* deal with the case where there is no
+ * swap but an anonymous page would be
+ * moved to the inactive list.
+ */
+ if (!total_swap_pages && reclaim_mapped &&
+ PageAnon(page))
+ inactivated_anon = ...On Tue, 15 Jan 2008 10:02:30 +0900 How about this == if (unlikely(zone->mem_notify_status && ...) Thanks, -Kame --
On Tue, 15 Jan 2008 11:37:48 +0900
Ah, my point is.. how about this ?
==
if (page_zoneid(page) != ZONE_DMA)
notify_threshold = zone->pages_high +
zone->lowmem_reserve[page_zoneid(page) - 1] * 2;
==
Thanks,
-Kame
--
Your point is a very good one, but judging by zone size may be better. On some 64-bit systems, ZONE_DMA is 4GB; a small memory system can't ignore it. Fortunately, the zone size can be checked in free_area_init_core(). - kosaki --
Hi, As you know I have had some concerns regarding a too early notification in a swapless system. I did a test with a populated page cache in a swapless system: $ cat /bigfile > /dev/null # populate page cache $ cat /proc/meminfo MemTotal: 1037040 kB MemFree: 113976 kB Buffers: 1068 kB Cached: 907552 kB SwapCached: 0 kB Active: 11116 kB Inactive: 903968 kB HighTotal: 130992 kB HighFree: 252 kB LowTotal: 906048 kB LowFree: 113724 kB SwapTotal: 0 kB SwapFree: 0 kB Dirty: 36 kB Writeback: 0 kB AnonPages: 6484 kB Mapped: 1216 kB Slab: 4024 kB SReclaimable: 864 kB SUnreclaim: 3160 kB PageTables: 444 kB NFS_Unstable: 0 kB Bounce: 0 kB CommitLimit: 518520 kB Committed_AS: 18816 kB VmallocTotal: 114680 kB VmallocUsed: 904 kB VmallocChunk: 113672 kB Start to allocate memory, 10 MB every second, exit on notification. $ cat /proc/meminfo # just after notification MemTotal: 1037040 kB MemFree: 123468 kB Buffers: 876 kB Cached: 897976 kB SwapCached: 0 kB Active: 12984 kB Inactive: 892332 kB HighTotal: 130992 kB HighFree: 1064 kB LowTotal: 906048 kB LowFree: 122404 kB SwapTotal: 0 kB SwapFree: 0 kB Dirty: 0 kB Writeback: 0 kB AnonPages: 6484 kB Mapped: 1220 kB Slab: 4012 kB SReclaimable: 864 kB SUnreclaim: 3148 kB PageTables: 448 kB NFS_Unstable: 0 kB Bounce: 0 kB CommitLimit: 518520 kB Committed_AS: 18816 kB VmallocTotal: 114680 kB VmallocUsed: 904 kB VmallocChunk: 113672 kB The notification fires after only ~100 MB allocated, i.e., when page reclaim is beginning to nag from page cache. Isn't this a bit early? Repeating the test with ...
On Tue, 15 Jan 2008 23:55:17 +0100 Your issue may have more to do with the fact that the highmem zone is 128MB in size and some balancing issues between __alloc_pages and try_to_free_pages. -- All rights reversed. --
I don't think so. I ran the test again without highmem and noticed the same behaviour: $ cat /proc/meminfo MemTotal: 895876 kB MemFree: 111292 kB Buffers: 924 kB Cached: 768664 kB SwapCached: 0 kB Active: 9196 kB Inactive: 767480 kB HighTotal: 0 kB HighFree: 0 kB LowTotal: 895876 kB LowFree: 111292 kB SwapTotal: 0 kB SwapFree: 0 kB Dirty: 32 kB Writeback: 0 kB AnonPages: 7108 kB Mapped: 1224 kB Slab: 4288 kB SReclaimable: 1316 kB SUnreclaim: 2972 kB PageTables: 448 kB NFS_Unstable: 0 kB Bounce: 0 kB CommitLimit: 447936 kB Committed_AS: 19676 kB VmallocTotal: 122872 kB VmallocUsed: 904 kB VmallocChunk: 121864 kB Start to allocate memory, 10 MB every second, exit on notification which happened after 110 MB. $ cat /proc/meminfo #after MemTotal: 895876 kB MemFree: 116748 kB Buffers: 904 kB Cached: 762944 kB SwapCached: 0 kB Active: 12864 kB Inactive: 758064 kB HighTotal: 0 kB HighFree: 0 kB LowTotal: 895876 kB LowFree: 116748 kB SwapTotal: 0 kB SwapFree: 0 kB Dirty: 4 kB Writeback: 0 kB AnonPages: 7108 kB Mapped: 1224 kB Slab: 4284 kB SReclaimable: 1316 kB SUnreclaim: 2968 kB PageTables: 448 kB NFS_Unstable: 0 kB Bounce: 0 kB CommitLimit: 447936 kB Committed_AS: 19676 kB VmallocTotal: 122872 kB VmallocUsed: 904 kB VmallocChunk: 121864 kB --
Thank you for pointing this out! Could you please post your test program and the steps to reproduce? Unfortunately, my simple test works fine on a swapless system ;-) thanks. --
Sure: 1. Fill almost all available memory with page cache in a system without swap. 2. Run attached alloc-test program. 3. Notification fires when page cache is reclaimed. Example: $ cat /bigfile > /dev/null $ cat /proc/meminfo MemTotal: 895876 kB MemFree: 94272 kB Buffers: 884 kB Cached: 782868 kB SwapCached: 0 kB Active: 15356 kB Inactive: 778000 kB HighTotal: 0 kB HighFree: 0 kB LowTotal: 895876 kB LowFree: 94272 kB SwapTotal: 0 kB SwapFree: 0 kB Dirty: 0 kB Writeback: 0 kB AnonPages: 9624 kB Mapped: 1352 kB Slab: 4220 kB SReclaimable: 1168 kB SUnreclaim: 3052 kB PageTables: 528 kB NFS_Unstable: 0 kB Bounce: 0 kB CommitLimit: 447936 kB Committed_AS: 28988 kB VmallocTotal: 122872 kB VmallocUsed: 904 kB VmallocChunk: 121864 kB $ ./test-alloc --------- Got notification, allocated 90 MB $ cat /proc/meminfo MemTotal: 895876 kB MemFree: 101960 kB Buffers: 888 kB Cached: 775200 kB SwapCached: 0 kB Active: 15356 kB Inactive: 770336 kB HighTotal: 0 kB HighFree: 0 kB LowTotal: 895876 kB LowFree: 101960 kB SwapTotal: 0 kB SwapFree: 0 kB Dirty: 28 kB Writeback: 0 kB AnonPages: 9624 kB Mapped: 1352 kB Slab: 4224 kB SReclaimable: 1168 kB SUnreclaim: 3056 kB PageTables: 532 kB NFS_Unstable: 0 kB Bounce: 0 kB CommitLimit: 447936 kB Committed_AS: 28988 kB VmallocTotal: 122872 kB VmallocUsed: 904 kB VmallocChunk: 121864 kB
Unfortunately, I can't reproduce it.
My machine:
CPU: Pentium4 2.8GHz with HT
memory: 512M
1. I suspect ZONE_DMA; please try the "ignore zone_dma" patch (below).
2. Could you please send your .config and /etc/sysctl.conf?
I hope to make further attempts to reproduce it.
thanks.
- kosaki
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
include/linux/mem_notify.h | 3 +++
mm/page_alloc.c | 6 +++++-
2 files changed, 8 insertions(+), 1 deletion(-)
Index: linux-2.6.24-rc6-mm1-memnotify/include/linux/mem_notify.h
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/include/linux/mem_notify.h
2008-01-16 21:31:09.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/include/linux/mem_notify.h
2008-01-16 21:34:24.000000000 +0900
@@ -22,6 +22,9 @@ static inline void memory_pressure_notif
unsigned long target;
unsigned long pages_high, pages_free, pages_reserve;
+ if (unlikely(zone->mem_notify_status == -1))
+ return;
+
if (pressure) {
target = atomic_long_read(&last_mem_notify) + MEM_NOTIFY_FREQ;
if (likely(time_before(jiffies, target)))
Index: linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c
===================================================================
--- linux-2.6.24-rc6-mm1-memnotify.orig/mm/page_alloc.c 2008-01-13
19:50:27.000000000 +0900
+++ linux-2.6.24-rc6-mm1-memnotify/mm/page_alloc.c 2008-01-16
21:41:58.000000000 +0900
@@ -3467,7 +3467,11 @@ static void __meminit free_area_init_cor
zone->zone_pgdat = pgdat;
zone->prev_priority = DEF_PRIORITY;
- zone->mem_notify_status = 0;
+
+ if (zone->present_pages < (pgdat->node_present_pages / 10))
+ zone->mem_notify_status = -1;
+ else
+ zone->mem_notify_status = 0;
zone_pcp_init(zone);
...really!? I am really happy!! Thanks you. - kosaki --
On Tue, 15 Jan 2008 10:03:23 +0900 Minor NAK - Please put new fields at the end - it makes it less likely to break badly written tools. --
Oh I see. I will apply your suggestion in the next post. Thanks! - kosaki --
