This is v6. Thank you all for kindly reviews. Major changes from v5 is a) changed ID allocation logic. Maybe much clearer than v6. b) fixed typos and bugs. Patch brief view: 1. changes css ID allocation in kernel/cgroup.c 2. use ID-array in memcg. 3. record ID to page_cgroup rather than pointer. 4. make update_file_mapped to be RCU aware routine instead of spinlock. 5. make update_file_mapped as general-purpose function. Thanks, -Kame --
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Now, css'id is allocated after ->create() is called. But to make use of ID
in ->create(), it should be available before ->create().
In another thinking, considering the ID is tightly coupled with "css",
it should be allocated when "css" is allocated.
This patch moves alloc_css_id() to css allocation routine. Now, only 2 subsys,
memory and blkio are useing ID. (To support complicated hierarchy walk.)
ID will be used in mem cgroup's ->create(), later.
Note:
If someone changes rules of css allocation, ID allocation should be moved too.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
block/blk-cgroup.c | 9 ++++++++
include/linux/cgroup.h | 16 ++++++++-------
kernel/cgroup.c | 50 ++++++++++++++-----------------------------------
mm/memcontrol.c | 5 ++++
4 files changed, 38 insertions(+), 42 deletions(-)
Index: mmotm-0811/kernel/cgroup.c
===================================================================
--- mmotm-0811.orig/kernel/cgroup.c
+++ mmotm-0811/kernel/cgroup.c
@@ -289,9 +289,6 @@ struct cg_cgroup_link {
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;
-static int cgroup_init_idr(struct cgroup_subsys *ss,
- struct cgroup_subsys_state *css);
-
/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
* due to cgroup_iter_start() */
@@ -770,9 +767,6 @@ static struct backing_dev_info cgroup_ba
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
-static int alloc_css_id(struct cgroup_subsys *ss,
- struct cgroup *parent, struct cgroup *child);
-
static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -3257,7 +3251,8 @@ static void init_cgroup_css(struct cgrou
css->cgroup = cgrp;
atomic_set(&css->refcnt, 1);
css->flags = 0;
- css->id = NULL;
+ if ...What rules? could you please elaborate? Seems cleaner, may be we need to update cgroups.txt? -- Three Cheers, Balbir --
On Wed, 25 Aug 2010 19:45:00 +0530 See Paul Menage's mail. He said "allocating css object under kernel/cgroup.c Hmm. will look into. Thanks, --
OK, the patch looks good to me otherwise Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com> -- Three Cheers, Balbir --
On Wed, 25 Aug 2010 17:06:40 +0900 I think we need some signs from Paul and Li, but anyway Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Thanks, --
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Now, memory cgroup has an ID per cgroup and make use of it at
- hierarchy walk,
- swap recording.
This patch is for making more use of it. The final purpose is
to replace page_cgroup->mem_cgroup's pointer to an unsigned short.
This patch caches a pointer of memcg in an array. By this, we
don't have to call css_lookup() which requires radix-hash walk.
This saves some amount of memory footprint at lookup memcg via id.
Changelog: 20100825
- applied comments.
Changelog: 20100811
- adjusted onto mmotm-2010-08-11
- fixed RCU related parts.
- use attach_id() callback.
Changelog: 20100804
- fixed description in init/Kconfig
Changelog: 20100730
- fixed rcu_read_unlock() placement.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
init/Kconfig | 10 +++++++
mm/memcontrol.c | 75 +++++++++++++++++++++++++++++++++++++++++---------------
2 files changed, 65 insertions(+), 20 deletions(-)
Index: mmotm-0811/mm/memcontrol.c
===================================================================
--- mmotm-0811.orig/mm/memcontrol.c
+++ mmotm-0811/mm/memcontrol.c
@@ -195,6 +195,7 @@ static void mem_cgroup_oom_notify(struct
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
+ int valid; /* for checking validness under RCU access.*/
/*
* the counter to account for memory usage
*/
@@ -294,6 +295,29 @@ static bool move_file(void)
&mc.to->move_charge_at_immigrate);
}
+/* 0 is unused */
+static atomic_t mem_cgroup_num;
+#define NR_MEMCG_GROUPS (CONFIG_MEM_CGROUP_MAX_GROUPS + 1)
+static struct mem_cgroup *mem_cgroups[NR_MEMCG_GROUPS] __read_mostly;
+
+/* Must be called under rcu_read_lock */
+static struct mem_cgroup *id_to_memcg(unsigned short id)
+{
+ struct mem_cgroup *mem;
+ /* see mem_cgroup_free() */
+ mem = rcu_dereference_check(mem_cgroups[id], rcu_read_lock_held());
+ if (likely(mem && mem->valid))
+ return mem;
+ return NULL;
+}
+
+static ...Do we really need to add this new member ? Can't we safely access "mem(=rcu_dereference(mem_cgroup[id]))" under rcu_read_lock() ? I think mem_cgroup_num should be increased at mem_cgroup_alloc(), because it is decreased at __mem_cgroup_free(). Otherwise, it can be decreased while it has not been increased, if mem_cgroup_create() fails after mem_cgroup_alloc(). Thanks, Daisuke Nishimura. --
On Mon, 30 Aug 2010 17:03:24 +0900 Hmm. thank you for checking, I'll fix. Thanks, -Kame --
Looks good, quick thought - should we expost memcg->id to user space through a config file? I don't see any reason at this point, unless we do it for all controllers. -- Three Cheers, Balbir --
On Tue, 31 Aug 2010 12:44:14 +0530 I wonder....showing whether 2 interfaces as "path name" and "id" to users is a good thing or not. Yes, it's convenient to developpers as me and you, others, but I don't think it's useful to users, novice people. I'd like to avoid showing show memcg->id to users as much as possible. I don't want to say but memcg is enough complicated. Thanks, -Kame --
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Now, addresses of memory cgroup can be calculated by their ID without complex.
This patch relplaces pc->mem_cgroup from a pointer to a unsigned short.
On 64bit architecture, this offers us more 6bytes room per page_cgroup.
Use 2bytes for blkio-cgroup's page tracking. More 4bytes will be used for
some light-weight concurrent access.
We may able to move this id onto flags field but ...go step by step.
Changelog: 20100824
- fixed comments, and typo.
Changelog: 20100811
- using new rcu APIs, as rcu_dereference_check() etc.
Changelog: 20100804
- added comments to page_cgroup.h
Changelog: 20100730
- fixed some garbage added by debug code in early stage
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/page_cgroup.h | 6 ++++
mm/memcontrol.c | 53 ++++++++++++++++++++++++++++----------------
mm/page_cgroup.c | 2 -
3 files changed, 40 insertions(+), 21 deletions(-)
Index: mmotm-0811/include/linux/page_cgroup.h
===================================================================
--- mmotm-0811.orig/include/linux/page_cgroup.h
+++ mmotm-0811/include/linux/page_cgroup.h
@@ -9,10 +9,14 @@
* page_cgroup helps us identify information about the cgroup
* All page cgroups are allocated at boot or memory hotplug event,
* then the page cgroup for pfn always exists.
+ *
+ * TODO: It seems ID for cgroup can be packed into "flags". But there will
+ * be race between assigning ID <-> set/clear flags. Please be careful.
*/
struct page_cgroup {
unsigned long flags;
- struct mem_cgroup *mem_cgroup;
+ unsigned short mem_cgroup; /* ID of assigned memory cgroup */
+ unsigned short blk_cgroup; /* Not Used..but will be. */
struct page *page;
struct list_head lru; /* per cgroup LRU list */
};
Index: mmotm-0811/mm/page_cgroup.c
===================================================================
--- mmotm-0811.orig/mm/page_cgroup.c
+++ ...On Wed, 25 Aug 2010 17:09:00 +0900 Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> --
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
At accounting file events per memory cgroup, we need to find memory cgroup
via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup().
But, considering the context which page-cgroup for files are accessed,
we can use alternative light-weight mutual execusion in the most case.
At handling file-caches, the only race we have to take care of is "moving"
account, IOW, overwriting page_cgroup->mem_cgroup. Because file status
update is done while the page-cache is in stable state, we don't have to
take care of race with charge/uncharge.
Unlike charge/uncharge, "move" happens not so frequently. It happens only when
rmdir() and task-moving (with a special settings.)
This patch adds a race-checker for file-cache-status accounting v.s. account
moving. The new per-cpu-per-memcg counter MEM_CGROUP_ON_MOVE is added.
The routine for account move
1. Increment it before start moving
2. Call synchronize_rcu()
3. Decrement it after the end of moving.
By this, file-status-counting routine can check it needs to call
lock_page_cgroup(). In most case, I doesn't need to call it.
Changelog: 20100825
- added a comment about mc.lock
- fixed bad lock.
Changelog: 20100804
- added a comment for possible optimization hint.
Changelog: 20100730
- some cleanup.
Changelog: 20100729
- replaced __this_cpu_xxx() with this_cpu_xxx
(because we don't call spinlock)
- added VM_BUG_ON().
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/memcontrol.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 69 insertions(+), 12 deletions(-)
Index: mmotm-0811/mm/memcontrol.c
===================================================================
--- mmotm-0811.orig/mm/memcontrol.c
+++ mmotm-0811/mm/memcontrol.c
@@ -90,6 +90,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
...On Wed, 25 Aug 2010 17:10:50 +0900 It doesn't cause any problem, but I think it would be better to change this to "id_to_memcg(..., false)". It's just under rcu_read_lock(), not under page_cgroup lock anymore. Otherwise, it looks good to me. Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Thanks, Daisuke Nishimura. --
On Tue, 31 Aug 2010 12:51:18 +0900 Thanks! -Kame --
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Preparing for adding new status arounf file caches.(dirty, writeback,etc..)
Using a unified macro and more generic names.
All counters will have the same rule for updating.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/memcontrol.h | 24 ++++++++++++++++++++++
include/linux/page_cgroup.h | 19 ++++++++++++------
mm/memcontrol.c | 46 ++++++++++++++++++--------------------------
3 files changed, 56 insertions(+), 33 deletions(-)
Index: mmotm-0811/include/linux/memcontrol.h
===================================================================
--- mmotm-0811.orig/include/linux/memcontrol.h
+++ mmotm-0811/include/linux/memcontrol.h
@@ -25,6 +25,30 @@ struct page_cgroup;
struct page;
struct mm_struct;
+/*
+ * Per-cpu Statistics for memory cgroup.
+ */
+enum mem_cgroup_stat_index {
+ /*
+ * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+ */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
+ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
+ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
+ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+ MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
+ MEM_CGROUP_ON_MOVE, /* A check for locking move account/status */
+ /* When you add new member for file-stat, please update page_cgroup.h */
+ MEM_CGROUP_FSTAT_BASE,
+ MEM_CGROUP_FSTAT_FILE_MAPPED = MEM_CGROUP_FSTAT_BASE,
+ MEM_CGROUP_FSTAT_END,
+ MEM_CGROUP_STAT_NSTATS = MEM_CGROUP_FSTAT_END,
+};
+
+#define MEMCG_FSTAT_IDX(idx) ((idx) - MEM_CGROUP_FSTAT_BASE)
+#define NR_FILE_FLAGS_MEMCG ((MEM_CGROUP_FSTAT_END - MEM_CGROUP_FSTAT_BASE))
+
extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
Index: ...On Wed, 25 Aug 2010 17:11:40 +0900 Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> It might be better to update this comment too. /* Update file-stat data for mem_cgroup */ or something ? Thanks, Daisuke Nishimura. --
On Tue, 31 Aug 2010 13:33:29 +0900 Nice catch. I'll fix this. -Kame --
