Agghh!
I attached old version at last mail, please drop it.
right patch is here.
------------------------------------------------------------------------
add throttle to shrink_zone() for performance improvement and prevent incorrect oom.
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
include/linux/mmzone.h | 2 +
include/linux/sched.h | 1
mm/Kconfig | 10 ++++++++
mm/page_alloc.c | 4 +++
mm/vmscan.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++-
5 files changed, 72 insertions(+), 1 deletion(-)
Index: b/include/linux/mmzone.h
===================================================================
--- a/include/linux/mmzone.h 2008-05-03 00:39:44.000000000 +0900
+++ b/include/linux/mmzone.h 2008-05-03 00:44:12.000000000 +0900
@@ -328,6 +328,8 @@ struct zone {
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
+ atomic_t nr_reclaimers;
+ wait_queue_head_t reclaim_throttle_waitq;
/*
* rarely used fields:
*/
Index: b/mm/page_alloc.c
===================================================================
--- a/mm/page_alloc.c 2008-05-03 00:39:44.000000000 +0900
+++ b/mm/page_alloc.c 2008-05-03 00:44:12.000000000 +0900
@@ -3502,6 +3502,10 @@ static void __paginginit free_area_init_
zone->nr_scan_inactive = 0;
zap_zone_vm_stats(zone);
zone->flags = 0;
+
+ zone->nr_reclaimers = ATOMIC_INIT(0);
+ init_waitqueue_head(&zone->reclaim_throttle_waitq);
+
if (!size)
continue;
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c 2008-05-03 00:43:48.000000000 +0900
+++ b/mm/vmscan.c 2008-05-04 22:47:49.000000000 +0900
@@ -74,6 +74,11 @@ struct scan_control {
int order;
+ /* Can shrink be cutted off if other task freeded enough page. */
+ int may_cut_off;
+
+ unsigned long was_freed;
+
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
@@ -120,6 +125,7 @@ struct scan_control {
int vm_swappiness = 60;
long vm_total_pages; /* The total number of pages which the VM controls */
+#define MAX_RECLAIM_TASKS CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
@@ -1187,7 +1193,46 @@ static int shrink_zone(int priority, str
unsigned long nr_inactive;
unsigned long nr_to_scan;
unsigned long nr_reclaimed = 0;
+ int ret = 0;
+ int throttle_on = 0;
+ unsigned long freed;
+ unsigned long threshold;
+
+ /* avoid recursing wait_evnet */
+ if (current->flags & PF_RECLAIMING)
+ goto shrinking;
+
+ throttle_on = 1;
+ current->flags |= PF_RECLAIMING;
+ wait_event(zone->reclaim_throttle_waitq,
+ atomic_add_unless(&zone->nr_reclaimers, 1, MAX_RECLAIM_TASKS));
+
+ /* in some situation (e.g. hibernation), shrink processing shouldn't be
+ cut off even though large memory freeded. */
+ if (!sc->may_cut_off)
+ goto shrinking;
+
+ /* kswapd is no related for user latency experience. */
+ if (current->flags & PF_KSWAPD)
+ goto shrinking;
+
+ /* x4 ratio mean we want rarely check.
+ because frequently check decrease performance. */
+ threshold = ((1 << sc->order) + zone->pages_high) * 4;
+ freed = get_vm_event(PGFREE);
+
+ /* reclaim still necessary? */
+ if (scan_global_lru(sc) &&
+ freed - sc->was_freed >= threshold) {
+ if (zone_watermark_ok(zone, sc->order, zone->pages_high,
+ gfp_zone(sc->gfp_mask), 0)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ sc->was_freed = freed;
+ }
+shrinking:
if (scan_global_lru(sc)) {
/*
* Add one to nr_to_scan just to make sure that the kernel
@@ -1239,9 +1284,16 @@ static int shrink_zone(int priority, str
}
}
+out:
+ if (throttle_on) {
+ current->flags &= ~PF_RECLAIMING;
+ atomic_dec(&zone->nr_reclaimers);
+ wake_up(&zone->reclaim_throttle_waitq);
+ }
+
sc->nr_reclaimed += nr_reclaimed;
throttle_vm_writeout(sc->gfp_mask);
- return 0;
+ return ret;
}
/*
@@ -1438,6 +1490,8 @@ unsigned long try_to_free_pages(struct z
.order = order,
.mem_cgroup = NULL,
.isolate_pages = isolate_pages_global,
+ .may_cut_off = 1,
+ .was_freed = get_vm_event(PGFREE),
};
return do_try_to_free_pages(zonelist, &sc);
Index: b/mm/Kconfig
===================================================================
--- a/mm/Kconfig 2008-05-03 00:39:44.000000000 +0900
+++ b/mm/Kconfig 2008-05-03 00:44:12.000000000 +0900
@@ -205,3 +205,13 @@ config NR_QUICK
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
+
+config NR_MAX_RECLAIM_TASKS_PER_ZONE
+ int "maximum number of reclaiming tasks at the same time"
+ default 3
+ help
+ This value determines the number of threads which can do page reclaim
+ in a zone simultaneously. If this is too big, performance under heavy memory
+ pressure will decrease.
+ If unsure, use default.
+
Index: b/include/linux/sched.h
===================================================================
--- a/include/linux/sched.h 2008-05-03 00:39:44.000000000 +0900
+++ b/include/linux/sched.h 2008-05-03 00:44:12.000000000 +0900
@@ -1484,6 +1484,7 @@ static inline void put_task_struct(struc
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
+#define PF_RECLAIMING 0x80000000 /* The task have page reclaim throttling ticket */
/*
* Only the _current_ task can read/write to tsk->flags, but other
--
| David Miller | Re: [patch 7/8] fdmap v2 - implement sys_socket2 |
| Sean | Re: [AppArmor 39/45] AppArmor: Profile loading and manipulation,pathname matching |
| Andi Kleen | Re: missing madvise functionality |
| Alan Cox | [PATCH 03/57] ali: watchdog locking and style |
git: | |
| Guido Ostkamp | [PATCH] Fix Solaris Workshop Compiler issues |
| David Lang | Re: mingw, windows, crlf/lf, and git |
| Johannes Schindelin | Re: [kernel.org users] [RFD] On deprecating "git-foo" for builtins |
| Johannes Schindelin | Re: [PATCH] Fix off by one error in prep_exclude. |
| Marco Peereboom | Re: Real men don't attack straw men |
| patrick keshishian | SMTP flood + spamdb |
| Marcos Laufer | dmesg IBM x3650 OpenBSD 4.3 |
| Nick Holland | Re: The Atheros story in much fewer words |
| Hans de Goede | Re: cat /proc/net/tcp takes 0.5 seconds on x86_64 |
| Stephen Hemminger | [RFC] TCP illinois max rtt aging |
| Tilman Schmidt | Re: 2.6.25-rc8: FTP transfer errors |
| Evgeniy Polyakov | Re: Network/block layer race. |
| high memory | 16 hours ago | Linux kernel |
| semaphore access speed | 18 hours ago | Applications and Utilities |
| the kernel how to power off the machine | 19 hours ago | Linux kernel |
| Easter Eggs in windows XP | 22 hours ago | Windows |
| Shared swap partition | 23 hours ago | Linux general |
| Root password | 23 hours ago | Linux general |
| Where/when DNOTIFY is used? | 1 day ago | Linux kernel |
| How to convert Linux Kernel built-in module into a loadable module | 1 day ago | Linux kernel |
| Linux 2.6.24 and I/O schedulers | 1 day ago | Linux kernel |
| USB Driver -- Interrupt Polling -- A Little Help Please | 1 day ago | Linux general |
