[PATCH 0/5] bio-cgroup: Introduction

Previous thread: [PATCH 4/5] watchdog: move omap_wdt.h to include/linux/watchdog by Felipe Balbi on Friday, September 19, 2008 - 3:32 am. (29 messages)

Next thread: Re: How how latent should non-preemptive scheduling be? by Sitsofe Wheeler on Friday, September 19, 2008 - 4:54 am. (16 messages)
From: Ryo Tsuruta
Date: Friday, September 19, 2008 - 4:01 am

Hi everyone,

Here are new releases of bio-cgroup.
Changes from the previous version are as follows:

- Accurate dirty-page tracking
  Support migrating pages between bio-cgroups with minimum overhead,
  but I think such a situation is quite rare.

- Fix a bug of swapcache page handling
  Sometimes a "bad page state" error occurred because the memory
  controller has temporarily changed the swapcache page handling.

The following is the list of patches:

  [PATCH 0/5] bio-cgroup: Introduction
  [PATCH 1/5] bio-cgroup: Split the cgroup memory subsystem into two parts
  [PATCH 2/5] bio-cgroup: Remove a lot of "#ifdef"s
  [PATCH 3/5] bio-cgroup: Implement the bio-cgroup
  [PATCH 4/5] bio-cgroup: Add a cgroup support to dm-ioband
  [PATCH 5/5] bio-cgroup: Dirty page tracking

You have to apply the patch dm-ioband v1.5.0 before applying this
series of patches. The dm-ioband patch can be found at:
http://people.valinux.co.jp/~ryov/dm-ioband/

And you have to select the following config options when compiling kernel:
  CONFIG_CGROUPS=y
  CONFIG_CGROUP_BIO=y
I also recommend selecting the options for the cgroup memory subsystem,
because this makes it possible to give some I/O bandwidth and some
memory to a certain cgroup in order to control delayed write requests;
the processes in the cgroup will then be able to dirty pages only
inside the cgroup, even when the given bandwidth is narrow.
  CONFIG_RESOURCE_COUNTERS=y
  CONFIG_CGROUP_MEM_RES_CTLR=y

Please see the following site for more information:
http://people.valinux.co.jp/~ryov/bio-cgroup/

 --------------------------------------------------------

The following shows how to use dm-ioband with cgroups.
Please assume that you want to make two cgroups, which we call "bio cgroup"
here, to track down block I/Os and assign them to ioband device "ioband1".

First, mount the bio cgroup filesystem.

 # mount -t cgroup -o bio none /cgroup/bio

Then, make new bio cgroups and put some processes in them.

 # mkdir ...
From: Ryo Tsuruta
Date: Friday, September 19, 2008 - 4:01 am

This patch splits the cgroup memory subsystem into two parts.
One is for tracking pages to find out their owners. The other is
for controlling how much memory should be assigned to
each cgroup.

With this patch, you can use the page tracking mechanism even if
the memory subsystem is off.

Based on 2.6.27-rc1-mm1
Signed-off-by: Ryo Tsuruta <ryov@valinux.co.jp>
Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>

diff -Ndupr linux-2.6.27-rc1-mm1.ioband/include/linux/memcontrol.h linux-2.6.27-rc1-mm1.cg0/include/linux/memcontrol.h
--- linux-2.6.27-rc1-mm1.ioband/include/linux/memcontrol.h	2008-09-19 10:54:43.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg0/include/linux/memcontrol.h	2008-09-19 18:50:25.000000000 +0900
@@ -20,12 +20,62 @@
 #ifndef _LINUX_MEMCONTROL_H
 #define _LINUX_MEMCONTROL_H
 
+#include <linux/rcupdate.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/bit_spinlock.h>
+
 struct mem_cgroup;
 struct page_cgroup;
 struct page;
 struct mm_struct;
 
+#ifdef CONFIG_CGROUP_PAGE
+/*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock.  We need to ensure that page->page_cgroup is at least two
+ * byte aligned (based on comments from Nick Piggin).  But since
+ * bit_spin_lock doesn't actually set that lock bit in a non-debug
+ * uniprocessor kernel, we should avoid setting it here too.
+ */
+#define PAGE_CGROUP_LOCK_BIT    0x0
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)
+#else
+#define PAGE_CGROUP_LOCK        0x0
+#endif
+
+/*
+ * A page_cgroup page is associated with every page descriptor. The
+ * page_cgroup helps us identify information about the cgroup
+ */
+struct page_cgroup {
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	struct list_head lru;		/* per cgroup LRU list */
+	struct mem_cgroup *mem_cgroup;
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
+	struct page *page;
+	int flags;
+};
+#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* ...
From: Ryo Tsuruta
Date: Friday, September 19, 2008 - 4:02 am

This patch is for cleaning up the code of the cgroup memory subsystem
to remove a lot of "#ifdef"s.

Based on 2.6.27-rc1-mm1
Signed-off-by: Ryo Tsuruta <ryov@valinux.co.jp>
Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>

diff -Ndupr linux-2.6.27-rc1-mm1.cg0/mm/memcontrol.c linux-2.6.27-rc1-mm1.cg1/mm/memcontrol.c
--- linux-2.6.27-rc1-mm1.cg0/mm/memcontrol.c	2008-09-19 18:50:26.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg1/mm/memcontrol.c	2008-09-19 18:50:59.000000000 +0900
@@ -228,6 +228,47 @@ struct mem_cgroup *mem_cgroup_from_task(
 				struct mem_cgroup, css);
 }
 
+static inline void get_mem_cgroup(struct mem_cgroup *mem)
+{
+	css_get(&mem->css);
+}
+
+static inline void put_mem_cgroup(struct mem_cgroup *mem)
+{
+	css_put(&mem->css);
+}
+
+static inline void set_mem_cgroup(struct page_cgroup *pc,
+					struct mem_cgroup *mem)
+{
+	pc->mem_cgroup = mem;
+}
+
+static inline void clear_mem_cgroup(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem = pc->mem_cgroup;
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	pc->mem_cgroup = NULL;
+	put_mem_cgroup(mem);
+}
+
+static inline struct mem_cgroup *get_mem_page_cgroup(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem = pc->mem_cgroup;
+	css_get(&mem->css);
+	return mem;
+}
+
+/* This should be called in an RCU-protected section. */
+static inline struct mem_cgroup *mm_get_mem_cgroup(struct mm_struct *mm)
+{
+	struct mem_cgroup *mem;
+
+	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	get_mem_cgroup(mem);
+	return mem;
+}
+
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
 {
@@ -297,6 +338,26 @@ static void __mem_cgroup_move_lists(stru
 	list_move(&pc->lru, &mz->lists[lru]);
 }
 
+static inline void mem_cgroup_add_page(struct page_cgroup *pc)
+{
+	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mz->lru_lock, flags);
+	__mem_cgroup_add_list(mz, ...
From: Ryo Tsuruta
Date: Friday, September 19, 2008 - 4:03 am

This patch implements the bio cgroup on the memory cgroup.

Based on 2.6.27-rc1-mm1
Signed-off-by: Ryo Tsuruta <ryov@valinux.co.jp>
Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>

diff -Ndupr linux-2.6.27-rc1-mm1.cg1/block/blk-ioc.c linux-2.6.27-rc1-mm1.cg2/block/blk-ioc.c
--- linux-2.6.27-rc1-mm1.cg1/block/blk-ioc.c	2008-07-29 11:40:31.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/block/blk-ioc.c	2008-09-19 18:50:59.000000000 +0900
@@ -84,24 +84,28 @@ void exit_io_context(void)
 	}
 }
 
+void init_io_context(struct io_context *ioc)
+{
+	atomic_set(&ioc->refcount, 1);
+	atomic_set(&ioc->nr_tasks, 1);
+	spin_lock_init(&ioc->lock);
+	ioc->ioprio_changed = 0;
+	ioc->ioprio = 0;
+	ioc->last_waited = jiffies; /* doesn't matter... */
+	ioc->nr_batch_requests = 0; /* because this is 0 */
+	ioc->aic = NULL;
+	INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
+	INIT_HLIST_HEAD(&ioc->cic_list);
+	ioc->ioc_data = NULL;
+}
+
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 {
 	struct io_context *ret;
 
 	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
-	if (ret) {
-		atomic_set(&ret->refcount, 1);
-		atomic_set(&ret->nr_tasks, 1);
-		spin_lock_init(&ret->lock);
-		ret->ioprio_changed = 0;
-		ret->ioprio = 0;
-		ret->last_waited = jiffies; /* doesn't matter... */
-		ret->nr_batch_requests = 0; /* because this is 0 */
-		ret->aic = NULL;
-		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
-		INIT_HLIST_HEAD(&ret->cic_list);
-		ret->ioc_data = NULL;
-	}
+	if (ret)
+		init_io_context(ret);
 
 	return ret;
 }
diff -Ndupr linux-2.6.27-rc1-mm1.cg1/include/linux/biocontrol.h linux-2.6.27-rc1-mm1.cg2/include/linux/biocontrol.h
--- linux-2.6.27-rc1-mm1.cg1/include/linux/biocontrol.h	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg2/include/linux/biocontrol.h	2008-09-19 18:50:59.000000000 +0900
@@ -0,0 +1,159 @@
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include ...
From: Ryo Tsuruta
Date: Friday, September 19, 2008 - 4:03 am

With this patch, dm-ioband can work with the bio cgroup.

Based on 2.6.27-rc1-mm1
Signed-off-by: Ryo Tsuruta <ryov@valinux.co.jp>
Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>

diff -Ndupr linux-2.6.27-rc1-mm1.cg2/drivers/md/dm-ioband-type.c linux-2.6.27-rc1-mm1.cg3/drivers/md/dm-ioband-type.c
--- linux-2.6.27-rc1-mm1.cg2/drivers/md/dm-ioband-type.c	2008-09-19 10:55:46.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg3/drivers/md/dm-ioband-type.c	2008-09-19 18:51:00.000000000 +0900
@@ -6,6 +6,7 @@
  * This file is released under the GPL.
  */
 #include <linux/bio.h>
+#include <linux/biocontrol.h>
 #include "dm.h"
 #include "dm-bio-list.h"
 #include "dm-ioband.h"
@@ -53,13 +54,13 @@ static int ioband_node(struct bio *bio)
 
 static int ioband_cgroup(struct bio *bio)
 {
-  /*
-   * This function should return the ID of the cgroup which issued "bio".
-   * The ID of the cgroup which the current process belongs to won't be
-   * suitable ID for this purpose, since some BIOs will be handled by kernel
-   * threads like aio or pdflush on behalf of the process requesting the BIOs.
-   */
-	return 0;	/* not implemented yet */
+	struct io_context *ioc = get_bio_cgroup_iocontext(bio);
+	int id = 0;
+	if (ioc) {
+		id = ioc->id;
+		put_io_context(ioc);
+	}
+	return id;
 }
 
 struct group_type dm_ioband_group_type[] = {
--

From: Ryo Tsuruta
Date: Friday, September 19, 2008 - 4:04 am

This patch supports migrating pages between bio-cgroups with minimum overhead.

Based on 2.6.27-rc1-mm1
Signed-off-by: Ryo Tsuruta <ryov@valinux.co.jp>
Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>

diff -Ndupr linux-2.6.27-rc1-mm1.cg3/fs/buffer.c linux-2.6.27-rc1-mm1.cg4/fs/buffer.c
--- linux-2.6.27-rc1-mm1.cg3/fs/buffer.c	2008-09-19 10:54:42.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg4/fs/buffer.c	2008-09-19 18:51:01.000000000 +0900
@@ -36,6 +36,7 @@
 #include <linux/buffer_head.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/bio.h>
+#include <linux/biocontrol.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
@@ -723,6 +724,7 @@ static int __set_page_dirty(struct page 
 			__inc_bdi_stat(mapping->backing_dev_info,
 					BDI_RECLAIMABLE);
 			task_io_account_write(PAGE_CACHE_SIZE);
+			bio_cgroup_recharge(page, current->mm);
 		}
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
diff -Ndupr linux-2.6.27-rc1-mm1.cg3/include/linux/biocontrol.h linux-2.6.27-rc1-mm1.cg4/include/linux/biocontrol.h
--- linux-2.6.27-rc1-mm1.cg3/include/linux/biocontrol.h	2008-09-19 18:51:00.000000000 +0900
+++ linux-2.6.27-rc1-mm1.cg4/include/linux/biocontrol.h	2008-09-19 18:51:01.000000000 +0900
@@ -100,6 +100,8 @@ static inline struct bio_cgroup *mm_get_
 
 extern struct io_context *get_bio_cgroup_iocontext(struct bio *bio);
 
+extern void bio_cgroup_recharge(struct page *page, struct mm_struct *mm);
+
 #else	/* CONFIG_CGROUP_BIO */
 
 struct bio_cgroup;
@@ -154,6 +156,10 @@ static inline struct io_context *get_bio
 	return NULL;
 }
 
+static inline void bio_cgroup_recharge(struct page *page, struct mm_struct *mm)
+{
+}
+
 #endif	/* CONFIG_CGROUP_BIO */
 
 #endif /* _LINUX_BIOCONTROL_H */
diff -Ndupr linux-2.6.27-rc1-mm1.cg3/mm/biocontrol.c linux-2.6.27-rc1-mm1.cg4/mm/biocontrol.c
--- linux-2.6.27-rc1-mm1.cg3/mm/biocontrol.c	2008-09-19 18:51:00.000000000 +0900
+++ ...
From: Hirokazu Takahashi
Date: Friday, September 19, 2008 - 5:02 am

I'm the one implementing this code.
The implementation isn't finished yet, in that this code can't handle
mmapped file pages correctly. Actually, the same problem applies to
the task I/O accounting mechanism, since the two of them use
the same hook.

This happens when starting to free pages under memory pressure.
Dirty bit of a pte will be moved into PG_Dirty flag of the associated
page. This can be done in another process context, while the hook try

Although I implemented it, I don't think this case happens often, since
it's not normal for several processes to write the same block
of a file.


Thanks,
Hirokazu Takahashi.

--

Previous thread: [PATCH 4/5] watchdog: move omap_wdt.h to include/linux/watchdog by Felipe Balbi on Friday, September 19, 2008 - 3:32 am. (29 messages)

Next thread: Re: How how latent should non-preemptive scheduling be? by Sitsofe Wheeler on Friday, September 19, 2008 - 4:54 am. (16 messages)