Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)

!MAILaRCHIVE_VOTE_RePLACE
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
To: Kay Sievers <kay.sievers@...>
Cc: Nick Piggin <nickpiggin@...>, Andrew Morton <akpm@...>, <linux-kernel@...>, Jens Axboe <jens.axboe@...>, Fengguang Wu <fengguang.wu@...>, <greg@...>, Trond Myklebust <trond.myklebust@...>, Miklos Szeredi <miklos@...>
Date: Friday, October 26, 2007 - 10:48 am

On Wed, 2007-10-03 at 15:35 +0200, Kay Sievers wrote:

---
Subject: bdi: debugfs interface

Expose the BDI stats (and readahead window) in /debug/bdi/

I still think it should go into /sys somewhere; however, I just noticed that
not all block devices that have a queue have a /queue directory, notably
those that use make_request_fn() as opposed to request_fn(). And then, of
course, there are the non-block/non-queue BDIs.

A BDI is basically the object that represents the 'thing' you dirty pages
against. For block devices that is related to the block device (and is
typically embedded in the queue object); for NFS mounts it's the remote server
object of the client. For FUSE, it is yet again something else.

I appreciate the sysfs people's opinion that /sys/bdi/ might not be the
best from their POV; however, I'm not seeing where to hook the BDI object
so that it all makes sense. A few of the things are currently not exposed in
sysfs at all, like the NFS and FUSE things.

So, for now, I've exposed the thing in debugfs. Please suggest a better
alternative.

Miklos, Trond: could you suggest a better format string for bdi_init_fmt()
for your respective filesystems?

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Miklos Szeredi <miklos@szeredi.hu>
CC: Trond Myklebust <trond.myklebust@fys.uio.no>
---
 block/genhd.c               |    2 
 block/ll_rw_blk.c           |    1 
 drivers/block/loop.c        |    7 ++
 drivers/md/dm.c             |    2 
 drivers/md/md.c             |    2 
 fs/fuse/inode.c             |    2 
 fs/nfs/client.c             |    2 
 include/linux/backing-dev.h |   15 ++++
 include/linux/debugfs.h     |   11 +++
 include/linux/writeback.h   |    3 
 mm/backing-dev.c            |  153 ++++++++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c         |    2 
 12 files changed, 199 insertions(+), 3 deletions(-)

Index: linux-2.6-2/fs/fuse/inode.c
===================================================================
--- linux-2.6-2.orig/fs/fuse/inode.c
+++ linux-2.6-2/fs/fuse/inode.c
@@ -467,7 +467,7 @@ static struct fuse_conn *new_conn(void)
 		atomic_set(&fc->num_waiting, 0);
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
-		err = bdi_init(&fc->bdi);
+		err = bdi_init_fmt(&fc->bdi, "fuse-%p", fc);
 		if (err) {
 			kfree(fc);
 			fc = NULL;
Index: linux-2.6-2/fs/nfs/client.c
===================================================================
--- linux-2.6-2.orig/fs/nfs/client.c
+++ linux-2.6-2/fs/nfs/client.c
@@ -678,7 +678,7 @@ static int nfs_probe_fsinfo(struct nfs_s
 		goto out_error;
 
 	nfs_server_set_fsinfo(server, &fsinfo);
-	error = bdi_init(&server->backing_dev_info);
+	error = bdi_init_fmt(&server->backing_dev_info, "nfs-%s-%p", clp->cl_hostname, server);
 	if (error)
 		goto out_error;
 
Index: linux-2.6-2/include/linux/backing-dev.h
===================================================================
--- linux-2.6-2.orig/include/linux/backing-dev.h
+++ linux-2.6-2/include/linux/backing-dev.h
@@ -11,6 +11,7 @@
 #include <linux/percpu_counter.h>
 #include <linux/log2.h>
 #include <linux/proportions.h>
+#include <linux/kernel.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -48,11 +49,25 @@ struct backing_dev_info {
 
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
+
+#ifdef CONFIG_DEBUG_FS
+	char *name;
+
+	struct dentry *debugfs_dir;
+	struct dentry *debugfs_ra;
+	struct dentry *debugfs_stat[NR_BDI_STAT_ITEMS];
+	struct dentry *debugfs_dirty;
+	struct dentry *debugfs_bdi_dirty;
+#endif
 };
 
 int bdi_init(struct backing_dev_info *bdi);
+int bdi_init_fmt(struct backing_dev_info *bdi, const char *fmt, ...);
 void bdi_destroy(struct backing_dev_info *bdi);
 
+int bdi_register(struct backing_dev_info *bdi, char *name);
+void bdi_unregister(struct backing_dev_info *bdi);
+
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
 		enum bdi_stat_item item, s64 amount)
 {
Index: linux-2.6-2/include/linux/debugfs.h
===================================================================
--- linux-2.6-2.orig/include/linux/debugfs.h
+++ linux-2.6-2/include/linux/debugfs.h
@@ -165,4 +165,15 @@ static inline struct dentry *debugfs_cre
 
 #endif
 
+static inline struct dentry *debugfs_create_long(const char *name, mode_t mode,
+						 struct dentry *parent,
+						 unsigned long *value)
+{
+#if BITS_PER_LONG == 32
+	return debugfs_create_u32(name,mode, parent, (u32*)value);
+#else
+	return debugfs_create_u64(name,mode, parent, (u64*)value);
+#endif
+}
+
 #endif
Index: linux-2.6-2/include/linux/writeback.h
===================================================================
--- linux-2.6-2.orig/include/linux/writeback.h
+++ linux-2.6-2/include/linux/writeback.h
@@ -113,6 +113,9 @@ struct file;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
 				      void __user *, size_t *, loff_t *);
 
+void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		 struct backing_dev_info *bdi);
+
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 					unsigned long nr_pages_dirtied);
Index: linux-2.6-2/mm/backing-dev.c
===================================================================
--- linux-2.6-2.orig/mm/backing-dev.c
+++ linux-2.6-2/mm/backing-dev.c
@@ -4,12 +4,158 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/writeback.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+static struct dentry *debugfs_dir;
+
+static __init int bdifs_init(void)
+{
+	debugfs_dir = debugfs_create_dir("bdi", NULL);
+	return 0;
+}
+
+__initcall(bdifs_init);
+
+static const char *stat_name[NR_BDI_STAT_ITEMS] = {
+	"reclaimable_pages",
+	"writeback_pages",
+};
+
+static u64 stat_get(void *data)
+{
+	return percpu_counter_read_positive((struct percpu_counter *)data);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(stat_ops, stat_get, NULL, "%llu\n");
+
+static u64 dirty_get(void *data)
+{
+	struct backing_dev_info *bdi = data;
+	long background, dirty, bdi_dirty;
+
+	get_dirty_limits(&background, &dirty, &bdi_dirty, bdi);
+
+	return dirty;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(dirty_ops, dirty_get, NULL, "%llu\n");
+
+static u64 bdi_dirty_get(void *data)
+{
+	struct backing_dev_info *bdi = data;
+	long background, dirty, bdi_dirty;
+
+	get_dirty_limits(&background, &dirty, &bdi_dirty, bdi);
+
+	return bdi_dirty;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(bdi_dirty_ops, bdi_dirty_get, NULL, "%llu\n");
+
+int bdi_register(struct backing_dev_info *bdi, char *name)
+{
+	int i;
+
+	if (bdi->debugfs_dir)
+		return -EEXIST;
+
+	bdi->name = kstrdup(name, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	bdi->debugfs_dir = debugfs_create_dir(bdi->name, debugfs_dir);
+	if (bdi->debugfs_dir) {
+		bdi->debugfs_ra = debugfs_create_long("readahead_pages", 0644,
+				bdi->debugfs_dir, &bdi->ra_pages);
+
+		for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+			bdi->debugfs_stat[i] =
+				debugfs_create_file(stat_name[i],
+						0444, bdi->debugfs_dir,
+						&bdi->bdi_stat[i], &stat_ops);
+		}
+
+		bdi->debugfs_dirty =
+			debugfs_create_file("dirty_pages",
+					0444, bdi->debugfs_dir,
+					bdi, &dirty_ops);
+
+		bdi->debugfs_bdi_dirty =
+			debugfs_create_file("bdi_dirty_pages",
+					0444, bdi->debugfs_dir,
+					bdi, &bdi_dirty_ops);
+	} else
+		return -ENOMEM;
+
+	return 0;
+}
+
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+	int i;
+
+	debugfs_remove(bdi->debugfs_bdi_dirty);
+	debugfs_remove(bdi->debugfs_dirty);
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+		debugfs_remove(bdi->debugfs_stat[i]);
+	debugfs_remove(bdi->debugfs_ra);
+	debugfs_remove(bdi->debugfs_dir);
+
+	kfree(bdi->name);
+
+	bdi->debugfs_dir = NULL;
+}
+
+int bdi_init_fmt(struct backing_dev_info *bdi, const char *fmt, ...)
+{
+	int ret;
+	va_list args;
+	char buf[64];
+
+	va_start(args, fmt);
+	vsnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+
+	ret = bdi_init(bdi);
+	if (!ret) {
+		ret = bdi_register(bdi, buf);
+		if (ret)
+			bdi_destroy(bdi);
+	}
+
+	return ret;
+}
+
+#else
+
+int bdi_register(struct backing_dev_info *bdi, char *name)
+{
+	return 0;
+}
+
+inline void bdi_unregister(struct backing_dev_info *bdi)
+{
+}
+
+int bdi_init_fmt(struct backing_dev_info *bdi, const char *fmt, ...)
+{
+	return bdi_init(bdi);
+}
+
+#endif
+
+EXPORT_SYMBOL(bdi_init_fmt);
 
 int bdi_init(struct backing_dev_info *bdi)
 {
 	int i, j;
 	int err;
 
+	memset(bdi, 0, sizeof(*bdi));
+
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
 		if (err)
@@ -33,6 +181,8 @@ void bdi_destroy(struct backing_dev_info
 {
 	int i;
 
+	bdi_unregister(bdi);
+
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
 
Index: linux-2.6-2/mm/page-writeback.c
===================================================================
--- linux-2.6-2.orig/mm/page-writeback.c
+++ linux-2.6-2/mm/page-writeback.c
@@ -291,7 +291,7 @@ static unsigned long determine_dirtyable
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
-static void
+void
 get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
 		 struct backing_dev_info *bdi)
 {
Index: linux-2.6-2/block/genhd.c
===================================================================
--- linux-2.6-2.orig/block/genhd.c
+++ linux-2.6-2/block/genhd.c
@@ -182,6 +182,7 @@ void add_disk(struct gendisk *disk)
 			    disk->minors, NULL, exact_match, exact_lock, disk);
 	register_disk(disk);
 	blk_register_queue(disk);
+	bdi_register(&disk->queue->backing_dev_info, disk->disk_name);
 }
 
 EXPORT_SYMBOL(add_disk);
@@ -190,6 +191,7 @@ EXPORT_SYMBOL(del_gendisk);	/* in partit
 void unlink_gendisk(struct gendisk *disk)
 {
 	blk_unregister_queue(disk);
+	bdi_unregister(&disk->queue->backing_dev_info);
 	blk_unregister_region(MKDEV(disk->major, disk->first_minor),
 			      disk->minors);
 }


-
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
-mm merge plans for 2.6.24, Andrew Morton, (Mon Oct 1, 5:22 pm)
new aops merge [was Re: -mm merge plans for 2.6.24], Hugh Dickins, (Tue Oct 2, 12:21 pm)
x86 patches was Re: -mm merge plans for 2.6.24, Andi Kleen, (Tue Oct 2, 2:18 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Andrew Morton, (Tue Oct 2, 2:32 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Ingo Molnar, (Tue Oct 2, 3:37 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Andi Kleen, (Tue Oct 2, 3:46 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Thomas Gleixner, (Tue Oct 2, 3:58 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Andi Kleen, (Tue Oct 2, 3:01 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Andy Whitcroft, (Tue Oct 2, 5:26 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Andrew Morton, (Tue Oct 2, 3:18 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, KAMEZAWA Hiroyuki, (Tue Oct 2, 3:36 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Christoph Lameter, (Tue Oct 2, 2:16 pm)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Nish Aravamudan, (Tue Oct 2, 12:40 pm)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Andrew Morton, (Tue Oct 2, 3:43 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, KAMEZAWA Hiroyuki, (Tue Oct 2, 4:16 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Christoph Lameter, (Tue Oct 2, 2:18 pm)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Yasunori Goto, (Tue Oct 2, 6:48 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Lee Schermerhorn, (Tue Oct 2, 1:25 pm)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Lee Schermerhorn, (Tue Oct 2, 1:17 pm)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Matt Mackall, (Tue Oct 2, 3:55 am)
Re: x86 patches was Re: -mm merge plans for 2.6.24, Andi Kleen, (Tue Oct 2, 3:59 am)
Re: -mm merge plans for 2.6.24, Pekka Enberg, (Tue Oct 2, 12:12 pm)
v4l-stk11xx* [Was: -mm merge plans for 2.6.24], Jiri Slaby, (Tue Oct 2, 3:59 am)
writeback fixes, Fengguang Wu, (Tue Oct 2, 4:39 am)
Re: -mm merge plans for 2.6.24, Borislav Petkov, (Sat Oct 13, 4:44 am)
Re: -mm merge plans for 2.6.24, Andrew Morton, (Sat Oct 13, 4:52 am)
Re: -mm merge plans for 2.6.24, Borislav Petkov, (Sat Oct 13, 7:45 am)
r/o bind mounts, was Re: -mm merge plans for 2.6.24, Christoph Hellwig, (Tue Oct 9, 5:19 am)
Re: remove zero_page (was Re: -mm merge plans for 2.6.24), Linus Torvalds, (Wed Oct 3, 11:21 am)
Re: remove zero_page (was Re: -mm merge plans for 2.6.24), Linus Torvalds, (Tue Oct 9, 10:52 am)
Re: remove zero_page (was Re: -mm merge plans for 2.6.24), Linus Torvalds, (Tue Oct 9, 10:22 pm)
Re: remove zero_page (was Re: -mm merge plans for 2.6.24), Hugh Dickins, (Wed Oct 10, 12:06 am)
Re: remove zero_page (was Re: -mm merge plans for 2.6.24), Linus Torvalds, (Wed Oct 10, 1:20 am)
Re: remove zero_page (was Re: -mm merge plans for 2.6.24), Linus Torvalds, (Wed Oct 10, 11:04 am)
Re: remove zero_page (was Re: -mm merge plans for 2.6.24), Linus Torvalds, (Tue Oct 9, 11:06 pm)
wibbling over the cpuset shed domain connnection, Paul Jackson, (Mon Oct 1, 5:34 pm)
Re: wibbling over the cpuset shed domain connnection, Nick Piggin, (Tue Oct 2, 8:36 am)
Re: wibbling over the cpuset shed domain connnection, Paul Jackson, (Wed Oct 3, 1:21 am)
Re: wibbling over the cpuset shed domain connnection, Nick Piggin, (Tue Oct 2, 9:12 am)
Re: wibbling over the cpuset shed domain connnection, Paul Jackson, (Wed Oct 3, 3:00 am)
Re: wibbling over the cpuset shed domain connnection, Andrew Morton, (Wed Oct 3, 6:57 am)
per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Peter Zijlstra, (Tue Oct 2, 4:17 am)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Martin Knoblauch, (Wed Oct 3, 7:00 am)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Peter Zijlstra, (Fri Oct 26, 10:48 am)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Trond Myklebust, (Fri Oct 26, 12:37 pm)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Peter Zijlstra, (Fri Dec 14, 10:50 am)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Miklos Szeredi, (Fri Dec 14, 11:14 am)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Peter Zijlstra, (Fri Dec 14, 11:54 am)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Miklos Szeredi, (Fri Oct 26, 11:06 am)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Peter Zijlstra, (Fri Oct 26, 11:22 am)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Peter Zijlstra, (Fri Oct 26, 11:33 am)
[PATCH] mm: sysfs: expose the BDI object in sysfs, Peter Zijlstra, (Fri Nov 2, 10:59 am)
Re: [PATCH] mm: sysfs: expose the BDI object in sysfs, Kay Sievers, (Fri Nov 2, 11:13 am)
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24), Peter Zijlstra, (Sat Oct 27, 12:07 pm)