Hi,
This post is for showing what I'm trying now.
This patch set is for memory resource controller.
4 purposes here.
- improve performance of memcg.
- remove lock_page_cgroup()
- making page_cgroup->flags to be atomic_ops.
- support mem+swap controller.
But this is still under test and the series is not well organised.
and base tree is old. (2.6.27-rc1-mm1) I'll rebase this set to newer mmotm tree.
Maybe this set has some troubles/objections but I think the direction is not bad.
Patch description. (patch ordering is bad. I'll fix in the next post.)
[1/9] ... private_counter ...replace res_counter with my own counter.
This is for supporting mem+swap controller.
(And I think memcg has a bit different characteristics from other
users of res_counter....)
[2/9] ... change-order-uncharge ...
This patch is for making it easy to handle swap-cache.
[3/9] ... atomic_flags
This patch changes operations for page_cgroup->flags to be atomic_ops.
[4/9] ... delayed freeing.
delaying to free page_cgroup at uncharge.
[5/9] ... RCU freeing of page_cgroup
free page_cgroup by RCU.
[6/9] ... lockless page cgroup.
remove lock_page_cgroup() and use RCU semantics.
[7/9] ... add prefetch
add prefetch() macro
[8/9] ... mem+swap controller base.
introduce mem+swap controller. A bit big patch....but have tons of TODO.
and have troubles. (it seems it's difficult to cause OOM killer.)
[9/9] ... mem+swap controller control files.
add mem+swap controller's control files.
I'd like to push patch [2,3,4,5,6,7] first.
Thanks,
-Kame
--
Replace res_counter with new mem_counter to do complex counting.
This patch is for mem+swap controller.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
mm/memcontrol.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 139 insertions(+), 21 deletions(-)
Index: linux-2.6.27-rc1-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.27-rc1-mm1.orig/mm/memcontrol.c
+++ linux-2.6.27-rc1-mm1/mm/memcontrol.c
@@ -116,12 +116,20 @@ struct mem_cgroup_lru_info {
* no reclaim occurs from a cgroup at it's low water mark, this is
* a feature that will be implemented much later in the future.
*/
+struct mem_counter {
+ unsigned long pages_limit;
+ unsigned long pages;
+ unsigned long failcnt;
+ unsigned long max_usage;
+ spinlock_t lock;
+};
+
struct mem_cgroup {
struct cgroup_subsys_state css;
/*
* the counter to account for memory usage
*/
- struct res_counter res;
+ struct mem_counter res;
/*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
@@ -181,6 +189,16 @@ enum charge_type {
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
};
+/* Private File ID for memcg */
+enum {
+ MEMCG_FILE_TYPE_PAGE_LIMIT,
+ MEMCG_FILE_TYPE_PAGE_USAGE,
+ MEMCG_FILE_TYPE_FAILCNT,
+ MEMCG_FILE_TYPE_MAX_USAGE,
+};
+
+
+
/*
* Always modified under lru lock. Then, not necessary to preempt_disable()
*/
@@ -279,6 +297,74 @@ static void unlock_page_cgroup(struct pa
bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
+/*
+ * counter for memory resource accounting.
+ *
+ */
+static void mem_counter_init(struct mem_cgroup *memcg)
+{
+ spin_lock_init(&memcg->res.lock);
+ memcg->res.pages = 0;
+ memcg->res.pages_limit = ~0UL;
+ memcg->res.failcnt = 0;
+}
+
+static int mem_counter_charge(struct mem_cgroup *memcg, long num)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&memcg->res.lock, ...This patch changes placement of mem_cgroup_uncharge_cache_page().
After this patch, mem_cgroup_uncharge_cache_page() is called only after
page->mapping is cleared. This will make uncharge() handling easier in future.
(And error check code is added.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
mm/filemap.c | 2 +-
mm/memcontrol.c | 1 +
mm/migrate.c | 13 ++++++++++---
3 files changed, 12 insertions(+), 4 deletions(-)
Index: linux-2.6.27-rc1-mm1/mm/filemap.c
===================================================================
--- linux-2.6.27-rc1-mm1.orig/mm/filemap.c
+++ linux-2.6.27-rc1-mm1/mm/filemap.c
@@ -116,12 +116,12 @@ void __remove_from_page_cache(struct pag
{
struct address_space *mapping = page->mapping;
- mem_cgroup_uncharge_cache_page(page);
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
BUG_ON(page_mapped(page));
+ mem_cgroup_uncharge_cache_page(page);
/*
* Some filesystems seem to re-dirty the page even after
Index: linux-2.6.27-rc1-mm1/mm/migrate.c
===================================================================
--- linux-2.6.27-rc1-mm1.orig/mm/migrate.c
+++ linux-2.6.27-rc1-mm1/mm/migrate.c
@@ -330,8 +330,6 @@ static int migrate_page_move_mapping(str
__inc_zone_page_state(newpage, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
- if (!PageSwapCache(newpage))
- mem_cgroup_uncharge_cache_page(page);
return 0;
}
@@ -378,7 +376,16 @@ static void migrate_page_copy(struct pag
#endif
ClearPagePrivate(page);
set_page_private(page, 0);
- page->mapping = NULL;
+
+ /* PageAnon() checks page->mapping's bit */
+ if (PageAnon(page)) {
+ /* This page is uncharged in try_to_unmap() */
+ page->mapping = NULL;
+ } else {
+ /* This page was removed from radix-tree.*/
+ page->mapping = NULL;
+ mem_cgroup_uncharge_cache_page(page);
+ }
/*
* If any waiters have ...Maybe coding style of this patch is not good. (I should use enum..etc.)
This will be rewritten.
This patch adds function to modify page_cgroup->flags with using
set_bit/clear_bit/test_bit.
set/clear/test_bit is a usual way to manipulate flags and will reduce
ugly if statements. "atomic" set_bit may increase overhead but will
allow us looser control of these flags. (flag modification without locks!)
Of course, we don't have to use "atomic" ops where we can be convinced there
is no race.
This is a base patch for adding new flags.
(FLAG names are a bit modified....they are too long for 80 columns.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 82 +++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 55 insertions(+), 27 deletions(-)
Index: linux-2.6.27-rc1-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.27-rc1-mm1.orig/mm/memcontrol.c
+++ linux-2.6.27-rc1-mm1/mm/memcontrol.c
@@ -166,12 +166,35 @@ struct page_cgroup {
struct list_head lru; /* per cgroup LRU list */
struct page *page;
struct mem_cgroup *mem_cgroup;
- int flags;
+ unsigned long flags;
};
-#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
-#define PAGE_CGROUP_FLAG_FILE (0x4) /* page is file system backed */
-#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8) /* page is unevictableable */
+
+/* These 2 flags are unchanged during being used. */
+#define PAGE_CG_FLAG_CACHE (0) /* charged as cache */
+#define PAGE_CG_FLAG_FILE (1) /* page is file system backed */
+#define PAGE_CG_FLAG_ACTIVE (2) /* page is active in this cgroup */
+#define PAGE_CG_FLAG_UNEVICTABLE (3) /* page is unevictableable */
+
+static inline void page_cgroup_set_bit(struct page_cgroup *pc, int flag)
+{
+ set_bit(flag, &pc->flags);
+}
+
+static inline void __page_cgroup_set_bit(struct ...Making freeing page_cgroup() at mem_cgroup_uncharge() to use lazy manner.
In mem_cgroup_uncharge_common(), we don't free page_cgroup
and just link it to per-cpu free queue.
And remove it later by checking threshold.
This patch is a base patch for freeing page_cgroup by RCU patch.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 103 insertions(+), 17 deletions(-)
Index: linux-2.6.27-rc1-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.27-rc1-mm1.orig/mm/memcontrol.c
+++ linux-2.6.27-rc1-mm1/mm/memcontrol.c
@@ -167,6 +167,7 @@ struct page_cgroup {
struct page *page;
struct mem_cgroup *mem_cgroup;
unsigned long flags;
+ struct page_cgroup *next; /* used for Lazy LRU */
};
/* These 2 flags are unchanged during being used. */
@@ -174,6 +175,21 @@ struct page_cgroup {
#define PAGE_CG_FLAG_FILE (1) /* page is file system backed */
#define PAGE_CG_FLAG_ACTIVE (2) /* page is active in this cgroup */
#define PAGE_CG_FLAG_UNEVICTABLE (3) /* page is unevictableable */
+#define PAGE_CG_FLAG_OBSOLETE (4) /* this page_cgroup is invalid (unused) */
+
+#define MEMCG_LRU_THRESH (16)
+
+/*
+ * per-cpu slot for freeing page_cgroup in lazy way.
+ */
+
+struct mem_cgroup_lazy_lru {
+ int count;
+ struct page_cgroup *next;
+};
+
+DEFINE_PER_CPU(struct mem_cgroup_lazy_lru, memcg_lazy_lru);
+
static inline void page_cgroup_set_bit(struct page_cgroup *pc, int flag)
{
@@ -495,10 +511,12 @@ void mem_cgroup_move_lists(struct page *
pc = page_get_page_cgroup(page);
if (pc) {
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_move_lists(pc, lru);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
+ if (!page_cgroup_test_bit(pc, PAGE_CG_FLAG_OBSOLETE)) {
+ mz = page_cgroup_zoneinfo(pc);
+ spin_lock_irqsave(&mz->lru_lock, ...Making freeing of page_cgroup to be rcu routine.
This patch avoids directly freeing the per-cpu page_cgroup free queue and
passes the queue to an RCU routine instead.
This patch is a base patch for removing lock_page_cgroup().
By this, page_cgroup object is valid while rcu_read_lock() is taken.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 39 +++++++++++++++++++++++++++++++++------
1 file changed, 33 insertions(+), 6 deletions(-)
Index: linux-2.6.27-rc1-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.27-rc1-mm1.orig/mm/memcontrol.c
+++ linux-2.6.27-rc1-mm1/mm/memcontrol.c
@@ -638,21 +638,25 @@ unsigned long mem_cgroup_isolate_pages(u
return nr_taken;
}
-void __mem_cgroup_drop_lru(void)
+struct memcg_rcu_work {
+ struct rcu_head head;
+ struct page_cgroup *list;
+};
+
+
+void __mem_cgroup_drop_lru(struct rcu_head *head)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc, *next;
struct mem_cgroup_per_zone *mz, *page_mz;
- struct mem_cgroup_lazy_lru *mll;
unsigned long flags;
+ struct memcg_rcu_work *work;
- mll = &get_cpu_var(memcg_lazy_lru);
- next = mll->next;
- mll->next = NULL;
- mll->count = 0;
- put_cpu_var(memcg_lazy_lru);
+ work = container_of(head, struct memcg_rcu_work, head);
+ next = work->list;
mz = NULL;
+ kfree(work);
local_irq_save(flags);
while (next) {
@@ -678,6 +682,27 @@ void __mem_cgroup_drop_lru(void)
return;
}
+static int mem_cgroup_drop_lru_rcu(void)
+{
+ struct mem_cgroup_lazy_lru *mll;
+ struct memcg_rcu_work *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return 1;
+
+ INIT_RCU_HEAD(&work->head);
+
+ mll = &get_cpu_var(memcg_lazy_lru);
+ work->list = mll->next;
+ mll->next = NULL;
+ mll->count = 0;
+ put_cpu_var(memcg_lazy_lru);
+ call_rcu(&work->head, __mem_cgroup_drop_lru);
+
+ return 0;
+}
+
static void mem_cgroup_drop_lru(struct page_cgroup *pc)
{
int count;
@@ -690,14 ...Experimental !!
This patch just removes lock_page_cgroup().
By RCU, it seems unnecessary....
Why it's safe without lock_page_cgroup().
Anon pages:
* pages are charged/uncharged only when first-mapped/last-unmapped.
page_mapcount() handles that.
(at uncharge, pte_lock() is always held in racy case.)
Swap pages:
About SwapCache, there will be race.
mem_cgroup_charge() is moved under lock_page().
File pages: (not Shmem)
* pages are charged/uncharged only when it's added/removed to radix-tree.
In this case, PageLock() is always held.
Install Page:
Is it worth to charge driver's map page ? which is (maybe) not on LRU.
Is it a target resource of memcg ? I think no.
I removed charge/uncharge from install_page().
freeing page_cgroup is done under RCU.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/mm_types.h | 2 -
mm/memcontrol.c | 86 ++++++-----------------------------------------
mm/memory.c | 17 +++------
3 files changed, 19 insertions(+), 86 deletions(-)
Index: linux-2.6.27-rc1-mm1/include/linux/mm_types.h
===================================================================
--- linux-2.6.27-rc1-mm1.orig/include/linux/mm_types.h
+++ linux-2.6.27-rc1-mm1/include/linux/mm_types.h
@@ -93,7 +93,7 @@ struct page {
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
- unsigned long page_cgroup;
+ struct page_cgroup *page_cgroup;
#endif
#ifdef CONFIG_KMEMCHECK
Index: linux-2.6.27-rc1-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.27-rc1-mm1.orig/mm/memcontrol.c
+++ linux-2.6.27-rc1-mm1/mm/memcontrol.c
@@ -145,20 +145,6 @@ struct mem_cgroup {
static struct mem_cgroup init_mem_cgroup;
/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock. We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments ...mem_cgroup_charge() common has to take lock but the place of lock can be calculated in early stage. This patch tries to prefetch lock line. (Have some good effect on my host.) Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> --- mm/memcontrol.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) Index: linux-2.6.27-rc1-mm1/mm/memcontrol.c =================================================================== --- linux-2.6.27-rc1-mm1.orig/mm/memcontrol.c +++ linux-2.6.27-rc1-mm1/mm/memcontrol.c @@ -707,11 +707,14 @@ static int mem_cgroup_charge_common(stru struct page_cgroup *pc; unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; unsigned long flags; + int nid, zid; pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); if (unlikely(pc == NULL)) goto err; + nid = page_to_nid(page); + zid = page_zonenum(page); /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the @@ -753,6 +756,8 @@ static int mem_cgroup_charge_common(stru goto out; } } + mz = mem_cgroup_zoneinfo(mem, nid, zid); + prefetchw(mz); pc->mem_cgroup = mem; pc->page = page; @@ -773,7 +778,6 @@ static int mem_cgroup_charge_common(stru VM_BUG_ON(page->page_cgroup); page_assign_page_cgroup(page, pc); - mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_add_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); --
Very experimental...
mem+swap controller prototype.
This patch adds CONFIG_CGROUP_MEM_RES_CTLR_SWAP as memory resource
controller's swap extension.
When enabling this, memory resource controller will have 2 limits.
- memory.limit_in_bytes .... limit for pages
- memory.memsw_limit_in_bytes .... limit for pages + swaps.
Following is (easy) accounting state transition after this patch.
pages swaps pages_total memsw_total
+1 - +1 +1 new page allocation.
-1 +1 -1 - swap out.
+1 -1 0 - swap in (*).
- -1 - -1 swap_free.
At swap-out, swp_entry will be charged against the cgroup of the page.
At swap-in, the page will be charged when it's mapped.
(Maybe accounting at read_swap() will be beautiful but we can avoid some of
error handling to delay accounting until mem_cgroup_charge().)
The charge against swap_entry will be uncharged when swap_entry is freed.
The parameter res.swaps just includes swaps not-on swap cache.
So, this doesn't show real usage of swp_entry just shows swp_entry on disk.
This patch doesn't include codes for control files.
TODO:
- clean up. and add comments.
- support vm_swap_full() under cgroup.
- find easier-to-understand protocol....
- check force_empty....(maybe buggy)
- support page migration.
- test!!
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/swap.h | 32 +++-
init/Kconfig | 12 +
kernel/power/swsusp.c | 2
mm/memcontrol.c | 387 +++++++++++++++++++++++++++++++++++++++++++++-----
mm/shmem.c | 2
mm/swap_state.c | 9 -
mm/swapfile.c | 54 ++++++
7 files changed, 453 insertions(+), 45 deletions(-)
Index: linux-2.6.27-rc1-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.27-rc1-mm1.orig/mm/memcontrol.c
+++ ...Hi. I think you are making updated ones, I send comments so far. IMHO, it would be better to "show" real usage of swp_entry. Otherwise, "sum of swap usage of all groups" != "swap usage of system shown by meminfo"(but it means adding another counter, hmm...). Instead of showing the usage of disk_swap, how about showing Is it needed? In my swap controller, swap entries are limited per cgroup. So, to make swap_cgroup_charge() fail less frequently, vm_swap_full() should be calculated per cgroup so that vm can free swap entries in advance. And, - move charge along with task move - hierarchy support Of course, more basic features and stabilization should be done first. I agree with this patch as a whole, but I'm worrying about race between swapout and swapin about the same entry(I should consider more...). Thanks, Daisuke Nishimura. --
On Fri, 22 Aug 2008 19:29:43 +0900 On memory resource. The sum of - mapped anonymous pages - pages of file cache. Yes, I feel the amount of disk_swap is not very useful in my test. swapout/swapin race is guarded by the fact that I always handle swap-cache. add_to_swap_cache/delete_from_swap_cache is under lock_page(). and do_swap_page()'s charging is moved under lock_page(). I saw race with force_empty ;(. I hope it's fixed in the latest version. Thanks, -Kame --
Add control files to mem+swap controller.
This patch adds following 2 files.
- memory.memsw_limit_in_bytes ..... limit for mem+swap usage.
- memory.swap_usage_in_bytes ..... usage for swap_entry.
Following rules must be kept.
memory.memsw_limit_in_bytes >= memory.limit_in_bytes.
If not, -EINVAL will return.
TODO:
- add Documentation.
- add function/file to force swap-in for reducing swap usage.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 93 insertions(+), 7 deletions(-)
Index: linux-2.6.27-rc1-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.27-rc1-mm1.orig/mm/memcontrol.c
+++ linux-2.6.27-rc1-mm1/mm/memcontrol.c
@@ -268,10 +268,11 @@ enum {
MEMCG_FILE_TYPE_PAGE_USAGE,
MEMCG_FILE_TYPE_FAILCNT,
MEMCG_FILE_TYPE_MAX_USAGE,
+ MEMCG_FILE_TYPE_MEMSW_LIMIT,
+ MEMCG_FILE_TYPE_SWAP_USAGE,
};
-
/*
* Always modified under lru lock. Then, not necessary to preempt_disable()
* "flags" passed to this function is a copy of pc->flags but flags checked
@@ -415,11 +416,11 @@ mem_counter_recharge_swapout(struct mem_
}
static inline void
-mem_counter_uncharge_swap(struct mem_cgroup *memcg, long num)
+mem_counter_uncharge_swap(struct mem_cgroup *memcg)
{
unsigned long flags;
spin_lock_irqsave(&memcg->res.lock, flags);
- memcg->res.swaps -= num;
+ memcg->res.swaps -= 1;
spin_unlock_irqrestore(&memcg->res.lock, flags);
}
@@ -430,7 +431,9 @@ static int mem_counter_set_pages_limit(s
int ret = -EBUSY;
spin_lock_irqsave(&memcg->res.lock, flags);
- if (memcg->res.pages < lim) {
+ if (lim > memcg->res.memsw_limit)
+ ret = -EINVAL;
+ else if (memcg->res.pages < lim) {
memcg->res.pages_limit = lim;
ret = 0;
}
@@ -568,6 +571,25 @@ void mem_cgroup_move_lists(struct page *
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+
+static int ...Thank you for working on mem+swap controller. I'll review and comment later. Thanks, --
On Tue, 19 Aug 2008 18:11:50 +0900 My next version (writing now) will be much cleaner and clearer than this ;) So, please review only if you have free time. Thanks, --
I took a quick look at the patches, patch 1 seemed not so clear, why can't we enhance or fix resource counters? I'll review/test the patches tonight. -- Balbir --
On Wed, 20 Aug 2008 09:00:50 +0530 patch 1 is for patch 8. (patch order is too bad.) please ignore this version. this is just a preview.(Sorry) I'm now writing easier-to-read one, thanks, --
Hi, this is a patch set for lockless page_cgroup.
dropped patches related to mem+swap controller for easy review.
(I'm rewriting it, too.)
Changes from current -mm is.
- page_cgroup->flags operations is set to be atomic.
- lock_page_cgroup() is removed.
- page->page_cgroup is changed from unsigned long to struct page_cgroup*
- page_cgroup is freed by RCU.
- For avoiding race, charge/uncharge against mm/memory.c::insert_page() is
omitted. This is usually used for mapping device's page. (I think...)
In my quick test, performance is improved a little. But the benefit of this
patch is to allow access to page_cgroup without lock. I think this is good
for Yamamoto's Dirty page tracking for memcg.
For I/O tracking people, I added a header file for allowing access to
page_cgroup from out of memcontrol.c
The base kernel is recent mmtom. Any comments are welcome.
This is still under test. I have to do long-run test before removing "RFC".
patch [1-4] is core logic.
[1/7] page_cgroup_atomic_flags.patch
[2/7] delayed_batch_freeing_of_page_cgroup.patch
[3/7] freeing page_cgroup by rcu.patch
[4/7] lockless page_cgroup.patch
[5/7] add prefetch patch
[6/7] make-mapping-null-before-calling-uncharge.patch
[7/7] adding page_cgroup.h header file.patch
Thanks,
-Kame
--
This patch makes page_cgroup->flags to be atomic_ops and define
functions (and macros) to access it.
This patch itself makes memcg slow but this patch's final purpose is
to remove lock_page_cgroup() and allowing fast/easy access to page_cgroup.
Before trying to modify memory resource controller, this atomic operation
on flags is necessary.
Changelog (preview) -> (v1):
- patch ordering is changed.
- Added macro for defining functions for Test/Set/Clear bit.
- made the names of flags shorter.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 108 +++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 77 insertions(+), 31 deletions(-)
Index: mmtom-2.6.27-rc3+/mm/memcontrol.c
===================================================================
--- mmtom-2.6.27-rc3+.orig/mm/memcontrol.c
+++ mmtom-2.6.27-rc3+/mm/memcontrol.c
@@ -158,12 +158,57 @@ struct page_cgroup {
struct list_head lru; /* per cgroup LRU list */
struct page *page;
struct mem_cgroup *mem_cgroup;
- int flags;
+ unsigned long flags;
};
-#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
-#define PAGE_CGROUP_FLAG_FILE (0x4) /* page is file system backed */
-#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8) /* page is unevictableable */
+
+enum {
+ /* flags for mem_cgroup */
+ Pcg_CACHE, /* charged as cache */
+ /* flags for LRU placement */
+ Pcg_ACTIVE, /* page is active in this cgroup */
+ Pcg_FILE, /* page is file system backed */
+ Pcg_UNEVICTABLE, /* page is unevictableable */
+};
+
+#define TESTPCGFLAG(uname, lname) \
+static inline int Pcg##uname(struct page_cgroup *pc) \
+ { return test_bit(Pcg_##lname, &pc->flags); }
+
+#define SETPCGFLAG(uname, lname) \
+static inline void SetPcg##uname(struct page_cgroup *pc)\
+ { set_bit(Pcg_##lname, &pc->flags); }
+
+#define CLEARPCGFLAG(uname, lname) \
+static inline void ...Freeing page_cgroup at mem_cgroup_uncharge() in lazy way.
In mem_cgroup_uncharge_common(), we don't free page_cgroup
and just link it to per-cpu free queue.
And remove it later by checking threshold.
This patch is a base patch for freeing page_cgroup by RCU patch.
This patch depends on page_cgroup_atomic_flags.patch.
Changelog: (preview) -> (v1)
- Clean up.
- renamed functions
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 103 insertions(+), 12 deletions(-)
Index: mmtom-2.6.27-rc3+/mm/memcontrol.c
===================================================================
--- mmtom-2.6.27-rc3+.orig/mm/memcontrol.c
+++ mmtom-2.6.27-rc3+/mm/memcontrol.c
@@ -159,11 +159,13 @@ struct page_cgroup {
struct page *page;
struct mem_cgroup *mem_cgroup;
unsigned long flags;
+ struct page_cgroup *next;
};
enum {
/* flags for mem_cgroup */
Pcg_CACHE, /* charged as cache */
+ Pcg_OBSOLETE, /* this page cgroup is invalid (unused) */
/* flags for LRU placement */
Pcg_ACTIVE, /* page is active in this cgroup */
Pcg_FILE, /* page is file system backed */
@@ -194,6 +196,10 @@ static inline void __ClearPcg##uname(str
TESTPCGFLAG(Cache, CACHE)
__SETPCGFLAG(Cache, CACHE)
+/* No "Clear" routine for OBSOLETE flag */
+TESTPCGFLAG(Obsolete, OBSOLETE);
+SETPCGFLAG(Obsolete, OBSOLETE);
+
/* LRU management flags (from global-lru definition) */
TESTPCGFLAG(File, FILE)
SETPCGFLAG(File, FILE)
@@ -220,6 +226,18 @@ static enum zone_type page_cgroup_zid(st
return page_zonenum(pc->page);
}
+/*
+ * per-cpu slot for freeing page_cgroup in lazy manner.
+ * All page_cgroup linked to this list is OBSOLETE.
+ */
+struct mem_cgroup_sink_list {
+ int count;
+ struct page_cgroup *next;
+};
+DEFINE_PER_CPU(struct mem_cgroup_sink_list, memcg_sink_list);
+#define MEMCG_LRU_THRESH (16)
+
+
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = ...By delayed_batch_freeing_of_page_cgroup.patch, page_cgroup can be
freed lazily. After this patch, page_cgroup is freed by RCU and
page_cgroup is RCU safe. This is necessary for lockless page_cgroup patch
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 44 ++++++++++++++++++++++++++++++++++++--------
1 file changed, 36 insertions(+), 8 deletions(-)
Index: mmtom-2.6.27-rc3+/mm/memcontrol.c
===================================================================
--- mmtom-2.6.27-rc3+.orig/mm/memcontrol.c
+++ mmtom-2.6.27-rc3+/mm/memcontrol.c
@@ -577,19 +577,23 @@ unsigned long mem_cgroup_isolate_pages(u
* Free obsolete page_cgroups which is linked to per-cpu drop list.
*/
-static void __free_obsolete_page_cgroup(void)
+struct page_cgroup_rcu_work {
+ struct rcu_head head;
+ struct page_cgroup *list;
+};
+
+static void __free_obsolete_page_cgroup_cb(struct rcu_head *head)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc, *next;
struct mem_cgroup_per_zone *mz, *page_mz;
- struct mem_cgroup_sink_list *mcsl;
+ struct page_cgroup_rcu_work *work;
unsigned long flags;
- mcsl = &get_cpu_var(memcg_sink_list);
- next = mcsl->next;
- mcsl->next = NULL;
- mcsl->count = 0;
- put_cpu_var(memcg_sink_list);
+
+ work = container_of(head, struct page_cgroup_rcu_work, head);
+ next = work->list;
+ kfree(work);
mz = NULL;
@@ -616,6 +620,26 @@ static void __free_obsolete_page_cgroup(
local_irq_restore(flags);
}
+static int __free_obsolete_page_cgroup(void)
+{
+ struct page_cgroup_rcu_work *work;
+ struct mem_cgroup_sink_list *mcsl;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
+ INIT_RCU_HEAD(&work->head);
+
+ mcsl = &get_cpu_var(memcg_sink_list);
+ work->list = mcsl->next;
+ mcsl->next = NULL;
+ mcsl->count = 0;
+ put_cpu_var(memcg_sink_list);
+
+ call_rcu(&work->head, __free_obsolete_page_cgroup_cb);
+ return 0;
+}
+
static void ...This patch removes lock_page_cgroup(). Now, page_cgroup is guarded by RCU.
To remove lock_page_cgroup(), we have to confirm there is no race.
Anon pages:
* pages are charged/uncharged only when first-mapped/last-unmapped.
page_mapcount() handles that.
(And... pte_lock() is always held in any racy case.)
Swap pages:
There will be race because charge is done before lock_page().
This patch moves mem_cgroup_charge() under lock_page().
File pages: (not Shmem)
* pages are charged/uncharged only when it's added/removed to radix-tree.
In this case, PageLock() is always held.
Install Page:
Is it worth to charge this special map page ? which is (maybe) not on LRU.
I think no.
I removed charge/uncharge from install_page().
Page Migration:
We precharge it and map it back under lock_page(). This should be treated
as special case.
freeing page_cgroup is done under RCU.
After this patch, page_cgroup can be accessed via
**
rcu_read_lock();
pc = page_get_page_cgroup(page);
if (pc && !PcgObsolete(pc)) {
......
}
rcu_read_unlock();
**
This is now under test. Don't apply if you're not brave.
Changelog: (preview) -> (v1)
- Added comments.
- Fixed page migration.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/mm_types.h | 2
mm/memcontrol.c | 119 +++++++++++++++++------------------------------
mm/memory.c | 16 +-----
3 files changed, 51 insertions(+), 86 deletions(-)
Index: mmtom-2.6.27-rc3+/mm/memcontrol.c
===================================================================
--- mmtom-2.6.27-rc3+.orig/mm/memcontrol.c
+++ mmtom-2.6.27-rc3+/mm/memcontrol.c
@@ -137,20 +137,6 @@ struct mem_cgroup {
static struct mem_cgroup init_mem_cgroup;
/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock. We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin). But since
- * ...Address of "mz" can be calculated in early stage. prefetch it (we always do spin_lock later.) Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Index: mmtom-2.6.27-rc3+/mm/memcontrol.c =================================================================== --- mmtom-2.6.27-rc3+.orig/mm/memcontrol.c +++ mmtom-2.6.27-rc3+/mm/memcontrol.c @@ -694,6 +694,8 @@ static int mem_cgroup_charge_common(stru } } + mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page)); + prefetchw(mz); pc->mem_cgroup = mem; pc->page = page; pc->flags = 0; @@ -716,7 +718,6 @@ static int mem_cgroup_charge_common(stru page_assign_page_cgroup(page, pc); - mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_add_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); --
This patch tries to make page->mapping to be NULL before
mem_cgroup_uncharge_cache_page() is called.
"page->mapping == NULL" is a good check for "whether the page is still
radix-tree or not".
This patch also adds VM_BUG_ON() to mem_cgroup_uncharge_cache_page();
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/filemap.c | 2 +-
mm/memcontrol.c | 1 +
mm/migrate.c | 11 +++++++++--
3 files changed, 11 insertions(+), 3 deletions(-)
Index: mmtom-2.6.27-rc3+/mm/filemap.c
===================================================================
--- mmtom-2.6.27-rc3+.orig/mm/filemap.c
+++ mmtom-2.6.27-rc3+/mm/filemap.c
@@ -116,12 +116,12 @@ void __remove_from_page_cache(struct pag
{
struct address_space *mapping = page->mapping;
- mem_cgroup_uncharge_cache_page(page);
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
BUG_ON(page_mapped(page));
+ mem_cgroup_uncharge_cache_page(page);
/*
* Some filesystems seem to re-dirty the page even after
Index: mmtom-2.6.27-rc3+/mm/memcontrol.c
===================================================================
--- mmtom-2.6.27-rc3+.orig/mm/memcontrol.c
+++ mmtom-2.6.27-rc3+/mm/memcontrol.c
@@ -854,6 +854,7 @@ void mem_cgroup_uncharge_page(struct pag
void mem_cgroup_uncharge_cache_page(struct page *page)
{
VM_BUG_ON(page_mapped(page));
+ VM_BUG_ON(page->mapping);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}
Index: mmtom-2.6.27-rc3+/mm/migrate.c
===================================================================
--- mmtom-2.6.27-rc3+.orig/mm/migrate.c
+++ mmtom-2.6.27-rc3+/mm/migrate.c
@@ -330,8 +330,6 @@ static int migrate_page_move_mapping(str
__inc_zone_page_state(newpage, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
- if (!PageSwapCache(newpage))
- mem_cgroup_uncharge_cache_page(page);
return 0;
}
@@ -379,6 +377,15 @@ ...You forget to remove this line :) Thanks, --
On Fri, 22 Aug 2008 13:57:43 +0900 Ouch, thanks. --
Experimental...I wonder whether this is enough for potential users.
==
page_cgroup is a struct for accounting each page under the memory resource
controller. Currently, it's only used in mm/memcontrol.c, but there
are possible users of this struct (now).
(*) Because page_cgroup is an extended/on-demand mem_map by nature,
there are people who want to use this for recording information.
If there are no users, this patch is not necessary.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/page_cgroup.h | 100 ++++++++++++++++++++++++++++++++++++++++++++
mm/memcontrol.c | 82 ------------------------------------
2 files changed, 101 insertions(+), 81 deletions(-)
Index: mmtom-2.6.27-rc3+/include/linux/page_cgroup.h
===================================================================
--- /dev/null
+++ mmtom-2.6.27-rc3+/include/linux/page_cgroup.h
@@ -0,0 +1,100 @@
+#ifndef __LINUX_PAGE_CGROUP_H
+#define __LINUX_PAGE_CGROUP_H
+
+/*
+ * A page_cgroup page is associated with every page descriptor. The
+ * page_cgroup helps us identify information about the cgroup.
+ *
+ * This is pointed from struct page by page->page_cgroup pointer.
+ * This pointer is safe under RCU. If a page_cgroup is marked as
+ * Obsolete, don't access it.
+ *
+ * Typical way to access page_cgroup is following.
+ *
+ * rcu_read_lock();
+ * pc = page_get_page_cgroup(page);
+ * if (pc && !PcgObsolete(pc)) {
+ * ......
+ * }
+ * rcu_read_unlock();
+ *
+ */
+struct page_cgroup {
+ struct list_head lru; /* per zone/memcg LRU list */
+ struct page *page; /* the page this accounts for */
+ struct mem_cgroup *mem_cgroup; /* belongs to this mem_cgroup */
+ unsigned long flags;
+ struct page_cgroup *next;
+};
+
+enum {
+ /* flags for mem_cgroup */
+ Pcg_CACHE, /* charged as cache */
+ Pcg_OBSOLETE, /* this page cgroup is invalid (unused) */
+ /* flags for LRU placement */
+ Pcg_ACTIVE, /* page is active in this cgroup */
+ Pcg_FILE, /* page is ...On Wed, 20 Aug 2008 18:53:06 +0900 Known problem: force_empty is broken...so rmdir will get stuck in a nightmare. It's because of patch 2/7. This will be fixed in the next version. Thanks, -Kame --
On Wed, 20 Aug 2008 19:41:08 +0900
This is a quick fix but I think I can find some better solution..
==
Because removal from the LRU is delayed, mz->lru will never become empty until
someone kicks a drain. This patch rotates the LRU during force_empty so that
page_cgroups will be freed.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 40 +++++++++++++++++++++++++---------------
1 file changed, 25 insertions(+), 15 deletions(-)
Index: mmtom-2.6.27-rc3+/mm/memcontrol.c
===================================================================
--- mmtom-2.6.27-rc3+.orig/mm/memcontrol.c
+++ mmtom-2.6.27-rc3+/mm/memcontrol.c
@@ -893,34 +893,45 @@ static void mem_cgroup_force_empty_list(
struct mem_cgroup_per_zone *mz,
enum lru_list lru)
{
- struct page_cgroup *pc;
+ struct page_cgroup *pc, *tmp;
struct page *page;
int count = FORCE_UNCHARGE_BATCH;
unsigned long flags;
struct list_head *list;
+ int drain, rotate;
list = &mz->lists[lru];
spin_lock_irqsave(&mz->lru_lock, flags);
+ rotate = 0;
while (!list_empty(list)) {
pc = list_entry(list->prev, struct page_cgroup, lru);
- page = pc->page;
- get_page(page);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
- /*
- * Check if this page is on LRU. !LRU page can be found
- * if it's under page migration.
- */
- if (PageLRU(page)) {
- __mem_cgroup_uncharge_common(page,
- MEM_CGROUP_CHARGE_TYPE_FORCE);
- put_page(page);
+ drain = PcgObsolete(pc);
+ if (drain) {
+ /* Skip this */
+ list_move(&pc->lru);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+ rotate++;
+ if (rotate > MEMCG_LRU_THRESH/2)
+ mem_cgroup_all_force_drain();
+ cond_resched();
+ } else {
+ page = pc->page;
+ get_page(page);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+ /*
+ * Check if this page is on LRU. !LRU page can be found
+ * if it's under page migration.
+ */
+ if (PageLRU(page)) ...On Wed, 20 Aug 2008 20:00:06 +0900 I'd like to rewrite force_empty to move all usage to the "default" cgroup. There are some reasons. 1. The current force_empty creates a live page which has no page_cgroup. This is bad for routines which want to access the page_cgroup from the page. And this behavior will become a source of race conditions in the future. 2. We can see the amount of out-of-control usage in the default cgroup. But to do this, I'll have to avoid "hitting limit" in the default cgroup. I'm now considering making it impossible to set a limit on the default cgroup. (I will show this as a patch in the next version of the series.) Does anyone have an idea ? Thanks, -Kame --
Hi, Kamezawa-San, The definition of default-cgroup would be root cgroup right? I would like to implement hierarchies correctly in order to define the default-cgroup (it could be a parent of the child cgroup for example). -- Balbir --
On Thu, 21 Aug 2008 09:06:53 +0530 Ah yes, "root" cgroup, now. I need a trash-can cgroup somewhere for force_empty. Accounted-in-trash-can is better than accounted by no one. Once we change the behavior, we can have other choices of improvements. 1. move account information to the parent cgroup. 2. move account information to a user-defined trash-can cgroup. As a first step, I'd like to start from the "root" cgroup. We can improve the behavior in a step-by-step manner as we've done. Thanks, -Kame --
I don't have a strong objection about setting default cgroup unlimited and moving usages to default cgroup. But I think this is related to hierarchy support as Balbir-san says. And, setting default cgroup unlimited would not be so strange if hierarchy is supported. Thanks, Daisuke Nishimura. --
On Wed, 20 Aug 2008 20:00:06 +0900
This is a new routine for force_empty. It assumes init_mem_cgroup has no limit.
(lockless page_cgroup is also applied.)
I think this routine is generic enough to be enhanced for hierarchy in the future.
I think the move_account() routine can be used for other purposes
(for example, move_task.)
==
/*
 * mem_cgroup_move_account - move the accounting of one page between cgroups.
 * @page: page whose node/zone select the per-zone LRU structures
 * @pc:   page_cgroup to take off @from's per-zone list and put on @to's
 * @from: mem_cgroup currently charged for the page
 * @to:   mem_cgroup that takes over the charge; must have no_limit set
 *
 * Must be called with IRQs disabled (checked by VM_BUG_ON).
 * NOTE(review): nothing in here takes the source zone's lru_lock;
 * presumably the caller already holds it -- confirm against the caller.
 *
 * Returns 0 if the account was moved.  Returns 1 if charging @to failed or
 * if the destination zone's lru_lock could not be taken immediately; in
 * both cases @pc is left untouched and no charge remains on @to.
 */
int mem_cgroup_move_account(struct page *page, struct page_cgroup *pc,
		struct mem_cgroup *from, struct mem_cgroup *to)
{
	struct mem_cgroup_per_zone *src_mz, *dst_mz;
	int node, zone;

	VM_BUG_ON(to->no_limit == 0);
	VM_BUG_ON(!irqs_disabled());

	node = page_to_nid(page);
	zone = page_zonenum(page);
	src_mz = mem_cgroup_zoneinfo(from, node, zone);
	dst_mz = mem_cgroup_zoneinfo(to, node, zone);

	/* Now, we assume no_limit...no failure here. */
	if (res_counter_charge(&to->res, PAGE_SIZE))
		return 1;

	/* Only try the destination lock; on contention undo the charge. */
	if (!spin_trylock(&dst_mz->lru_lock)) {
		res_counter_uncharge(&to->res, PAGE_SIZE);
		return 1;
	}

	/* Drop @pc from the source: list entry, css reference, charge. */
	__mem_cgroup_remove_list(src_mz, pc);
	css_put(&from->css);
	res_counter_uncharge(&from->res, PAGE_SIZE);

	/* Hand @pc over to the destination, which was charged above. */
	pc->mem_cgroup = to;
	css_get(&to->css);
	__mem_cgroup_add_list(dst_mz, pc);

	spin_unlock(&dst_mz->lru_lock);
	return 0;
}
/*
* This routine moves all account to root cgroup.
*/
static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
enum lru_list lru)
{
struct page_cgroup *pc;
unsigned long flags;
struct list_head *list;
int drain = 0;
list = &mz->lists[lru];
spin_lock_irqsave(&mz->lru_lock, flags);
while (!list_empty(list)) {
pc = list_entry(list->prev, struct ...Thanks, Kame. It is a good news that the page tracking framework is open. Thanks, Hirokazu Takahashi. --
