This patchset implements CUSE - Character device in Userspace. Except for initialization sequence and creation of character device instead of a mount, CUSE isn't very different from FUSE. This patchset is consisted of the following five patches. 0001-FUSE-add-fuse_-prefix-to-several-functions.patch 0002-FUSE-export-symbols-to-be-used-by-CUSE.patch 0003-FUSE-separate-out-fuse_conn_init-from-new_conn.patch 0004-FUSE-add-fuse_conn-release.patch 0005-CUSE-implement-CUSE-Character-device-in-Userspace.patch 0001-0004 prepares FUSE for CUSE addition and 0005 implements CUSE. Corresponding libfuse changes will be posted separately. This patchset is on top of... 2.6.27-rc4 (b8e6c91c74e9f0279b7c51048779b3d62da60b88) + [1] 9p-use-single-poller patchset + [2] wait-kill-is_sync_wait + [3] poll-allow-f_op_poll-to-sleep + [4] uevent updates (2 patches) + [5] char_dev-add-release + [6] extend-FUSE patchset The above three patches allow f_op->poll() to sleep and 0007 depends on it. This patchset is available in the following git tree. http://git.kernel.org/?p=linux/kernel/git/tj/misc.git;a=shortlog;h=cuse git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git cuse and contains the following changes. fs/Kconfig | 10 fs/fuse/Makefile | 1 fs/fuse/cuse.c | 634 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/dev.c | 32 +- fs/fuse/dir.c | 34 +- fs/fuse/file.c | 60 ++-- fs/fuse/fuse_i.h | 46 +++ fs/fuse/inode.c | 143 ++++++----- include/linux/cuse.h | 40 +++ include/linux/fuse.h | 2 10 files changed, 882 insertions(+), 120 deletions(-) Thanks. -- tejun [1] http://thread.gmane.org/gmane.linux.kernel/726098 [2] http://article.gmane.org/gmane.linux.kernel/726176 [3] http://article.gmane.org/gmane.linux.kernel/726178 [4] http://thread.gmane.org/gmane.linux.kernel/727127 [5] http://article.gmane.org/gmane.linux.kernel/727133 [6] ...
Add fuse_ prefix to request_send*() and get_root_inode() as some of
those functions will be exported for CUSE. With or without CUSE
export, having the function names scoped is a good idea for
debuggability.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
fs/fuse/dev.c | 23 ++++++++++++-----------
fs/fuse/dir.c | 34 +++++++++++++++++-----------------
fs/fuse/file.c | 30 +++++++++++++++---------------
fs/fuse/fuse_i.h | 9 +++++----
fs/fuse/inode.c | 12 ++++++------
5 files changed, 55 insertions(+), 53 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1c422f9..b448dfd 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -380,7 +380,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
}
}
-void request_send(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
{
req->isreply = 1;
spin_lock(&fc->lock);
@@ -399,8 +399,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
spin_unlock(&fc->lock);
}
-static void request_send_nowait_locked(struct fuse_conn *fc,
- struct fuse_req *req)
+static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
+ struct fuse_req *req)
{
req->background = 1;
fc->num_background++;
@@ -414,11 +414,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
flush_bg_queue(fc);
}
-static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
{
spin_lock(&fc->lock);
if (fc->connected) {
- request_send_nowait_locked(fc, req);
+ fuse_request_send_nowait_locked(fc, req);
spin_unlock(&fc->lock);
} else {
req->out.h.error = -ENOTCONN;
@@ -426,16 +426,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
}
}
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
+void ...Export the following symbols for CUSE.
fuse_conn_put()
fuse_conn_get()
fuse_get_root_inode()
fuse_super_operations
fuse_send_init()
fuse_flush()
fuse_fsync()
fuse_direct_io()
fuse_file_lock()
fuse_file_flock()
fuse_file_llseek()
fuse_file_ioctl()
fuse_file_compat_ioctl()
fuse_file_poll()
Signed-off-by: Tejun Heo <tj@kernel.org>
---
fs/fuse/dev.c | 9 ++++++++-
fs/fuse/file.c | 30 ++++++++++++++++++++----------
fs/fuse/fuse_i.h | 29 +++++++++++++++++++++++++++++
fs/fuse/inode.c | 11 ++++++++---
4 files changed, 65 insertions(+), 14 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b448dfd..75e2775 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -46,6 +46,7 @@ struct fuse_req *fuse_request_alloc(void)
fuse_request_init(req);
return req;
}
+EXPORT_SYMBOL_GPL(fuse_request_alloc);
struct fuse_req *fuse_request_alloc_nofs(void)
{
@@ -124,6 +125,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
atomic_dec(&fc->num_waiting);
return ERR_PTR(err);
}
+EXPORT_SYMBOL_GPL(fuse_get_req);
/*
* Return request in fuse_file->reserved_req. However that may
@@ -208,6 +210,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
fuse_request_free(req);
}
}
+EXPORT_SYMBOL_GPL(fuse_put_request);
static unsigned len_args(unsigned numargs, struct fuse_arg *args)
{
@@ -398,6 +401,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
}
spin_unlock(&fc->lock);
}
+EXPORT_SYMBOL_GPL(fuse_request_send);
static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
struct fuse_req *req)
@@ -1092,8 +1096,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
}
spin_unlock(&fc->lock);
}
+EXPORT_SYMBOL_GPL(fuse_abort_conn);
-static int fuse_dev_release(struct inode *inode, struct file *file)
+int fuse_dev_release(struct inode *inode, struct file *file)
{
struct fuse_conn *fc = fuse_get_conn(file);
if (fc) {
@@ -1108,6 +1113,7 @@ static int ...Separate out fuse_conn_init() from new_conn() and while at it
initialize fuse_conn->entry during conn initialization.
This will be used by CUSE.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
fs/fuse/fuse_i.h | 5 ++
fs/fuse/inode.c | 116 +++++++++++++++++++++++++++++------------------------
2 files changed, 68 insertions(+), 53 deletions(-)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bc55f6d..4795264 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -679,6 +679,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
+ * Initialize fuse_conn
+ */
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
+
+/**
* Release reference to fuse_conn
*/
void fuse_conn_put(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fae8732..8d092ea 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -462,66 +462,76 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
return 0;
}
-static struct fuse_conn *new_conn(struct super_block *sb)
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
{
- struct fuse_conn *fc;
int err;
- fc = kzalloc(sizeof(*fc), GFP_KERNEL);
- if (fc) {
- spin_lock_init(&fc->lock);
- mutex_init(&fc->inst_mutex);
- atomic_set(&fc->count, 1);
- init_waitqueue_head(&fc->waitq);
- init_waitqueue_head(&fc->blocked_waitq);
- init_waitqueue_head(&fc->reserved_req_waitq);
- INIT_LIST_HEAD(&fc->pending);
- INIT_LIST_HEAD(&fc->processing);
- INIT_LIST_HEAD(&fc->io);
- INIT_LIST_HEAD(&fc->interrupts);
- INIT_LIST_HEAD(&fc->bg_queue);
- atomic_set(&fc->num_waiting, 0);
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
- fc->bdi.unplug_io_fn = default_unplug_io_fn;
- /* fuse does it's own writeback accounting */
- fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
- fc->polled_files = RB_ROOT;
- fc->dev = sb->s_dev;
- err = bdi_init(&fc->bdi);
- if ...Add fuse_conn->release() so that fuse_conn can be embedded in other
structures. If unspecified, the original action - kfree() - is done.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
fs/fuse/fuse_i.h | 3 +++
fs/fuse/inode.c | 6 +++++-
2 files changed, 8 insertions(+), 1 deletions(-)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 4795264..67f33e8 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -465,6 +465,9 @@ struct fuse_conn {
/** Version counter for attribute changes */
u64 attr_version;
+
+ /** Called on final put. If implemented, should free the connection */
+ void (*release)(struct fuse_conn *);
};
static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 8d092ea..b99bb95 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -543,7 +543,11 @@ void fuse_conn_put(struct fuse_conn *fc)
fuse_request_free(fc->destroy_req);
mutex_destroy(&fc->inst_mutex);
bdi_destroy(&fc->bdi);
- kfree(fc);
+
+ if (fc->release)
+ fc->release(fc);
+ else
+ kfree(fc);
}
}
EXPORT_SYMBOL_GPL(fuse_conn_put);
--
1.5.4.5
--
CUSE enables implementing character devices in userspace. With recent additions of nonblock, lseek, ioctl and poll support, FUSE already has most of what's necessary to implement character devices. All CUSE has to do is bonding all those components - FUSE, chardev and the driver model - nicely. Due to the number of different objects involved and many ways an instance can fail, object lifetime rules are a tad bit complex. Please take a look at the comment on top of fs/fuse/cuse.c for details. Other than that, it's mostly straight forward. Client opens /dev/cuse, kernel starts conversation with CUSE_INIT. The client tells CUSE which device it wants to create. CUSE creates the device for the client and the rest works the same way as in a direct IO FUSE session. Each CUSE device has a corresponding directory /sys/class/cuse/DEVNAME (which is symlink to /sys/devices/virtual/class/DEVNAME if SYSFS_DEPRECATED is turned off) which hosts "waiting" and "abort" among other things. Those two files have the same meaning as the FUSE control files. The only notable lacking feature compared to in-kernel implementation is mmap support. Signed-off-by: Tejun Heo <tj@kernel.org> --- fs/Kconfig | 10 + fs/fuse/Makefile | 1 + fs/fuse/cuse.c | 634 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/cuse.h | 40 ++++ include/linux/fuse.h | 2 + 5 files changed, 687 insertions(+), 0 deletions(-) create mode 100644 fs/fuse/cuse.c create mode 100644 include/linux/cuse.h diff --git a/fs/Kconfig b/fs/Kconfig index d387358..3da7551 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -648,6 +648,16 @@ config FUSE_FS If you want to develop a userspace FS, or if you want to use a filesystem based on FUSE, answer Y or M. +config CUSE + tristate "Character device in Userpace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use ...
On Fri, 29 Aug 2008 03:19:04 +0900 I didn't know you could do that with bools. These two fields will share a word, and modifications of one are racy wrt modifications of the other. So some form of locking is needed, and I believe all the above could be implemented in C. Making that change would fix the bug in cuse_conn_get(), which OK, I have NFI whatsoever what this thing is doing and I am disinclined to reverse-engineer it. If this is parsing something whcih operators/users provided then it should have been documented somewhere? Whether it is or isn't doing that, this function really really really needs a comment telling readers (ie: me) what it does. Or what it This functions seems to be woefully misnamed. The name implies that it initialises a worker. But it's a kernel thread? Could you please document your design somehow? What does this kernel hotplug? What's all this? Seems to have something to do with an So... basically this undocumented kernel thread will for undocumented reasons create the device node? An obvious question which the reader of the code will ask is "why current->pid is non-unique in a containerised setup. What are the implications of this? It needs a comment, because the containerisation guys will end up coming here and scratching their heads over the same This reader is wondering what a "channel" is. Understanding high-level <looks in fuse_i.h> "The number of requests waiting for completion". Some description of the kernel<->userspace versioning design would be Nice-looking code, but I do not feel able to properly review it with its current level of description. --
"traditionally" container_of() is used in #define, not a function call as it is just pointer math that can be done at compile time. thanks, greg k-h --
On Thu, 28 Aug 2008 15:15:25 -0700
Well yeah. But it isn't a very good tradition.
static inline struct cuse_conn *cdev_to_cc(struct cdev *cdev)
{
return container_of(cdev, struct cuse_conn, cdev);
}
should generate the same code and is prettier.
Unfortunately it has no additional type-safety. You can still pass it
the address of a tty_driver.cdev instead of a cuse_conn.cdev and the
compiler will happily swallow it. Not a big problem in practice though.
--
Hello, Andrew. Yeah, as much as other character devices are. As long as uevent works, The thing is when error handling paths are lumped up using "if (xxx) destroy(xxx);", it's a good idea to always initialize variables which will carry allocated resource. For this iteration, it doesn't make any difference but later if the order of initialization changes and/or a new sequence is added before cc initialization, it's all to easy to forget whether cc was initialized to NULL or not. gcc will catch it most of the time but there's no guarantee, so I think it's better to keep it Yeah, reading an undocumented parsing function is really painful. Sorry about lack of comments there. With too many components to update at hands, I kinda ran out of steam in the last stages where I do the final review pass through patches and add comments. The input is packed strings - "key0=val0\0key1=val1\0" - and parse one CUSE initialization worker, it is, meaning that it's a kernel thread These will be feeded to uevent verbatim. The only reason why it's called hotplug_info instead of uevent_envp is because FUSE works on It's briefly mentioned in comment below. It's because the initialization needs to talk with userland and the talk should happen over a file which the following cuse_channel_open() creates. So, cuse_channel_open() can't really talk with the client without completing Not much. It's just the name of the worker. The name differentiation is mainly to help debugging a bit when something went wrong and doesn't Yes, and from the comment at the top of the file. * channel : file handle connected to the userland CUSE client ... * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with * devices, it's called 'channel' to reduce confusion. * * channel determines when the character device dies. When channel is * closed, everything should begin to destruct. As cuse_conn and mnt "waiting" and "abort" are what fuse exports as controls in fuse ...
oop, I failed to note the struct assignment there. The usual rule of thumb applies: I can be safely ignored. --
Boolean bit-fields are indeed valid. Usual semantics for objects of type
_Bool apply.
C99 6.7.2.1 Structure and union specifiers
4 A bit-field shall have a type that is a qualified or unqualified
version of _Bool, signed int, unsigned int, or some other
implementation-defined type.
--
Nick Bowler, Elliptic Semiconductor (http://www.ellipticsemi.com/)
--
It would be nice to have BUSE, Block device in Userspace, too. Mike --
Thought about that but it's really no different from nbd or loop depending on your application and block devices don't really implement the file operations so it won't have too much in common with FUSE. Also, there's the complication of going out to disk for more memory cases. -- tejun --
I think BUSE would be useful. For one, it allows you to avoid problems with the extra caching you get with a loopback device. And NBD is too limiting for some applications. For my half-ignorant analysis of the caching issues, see: http://code.google.com/p/s3backer/wiki/PerformanceConsiderations#Caching Not sure what you mean exactly (my fault), but it seems BUSE would have fewer places for memory problems (including deadlocks) than loopback over FUSE, which is the only way to do this kind of stuff now. -Archie -- Archie L. Cobbs --
My gutt feeling is that it would have less overhead when done via FUSE than through nbd, but that could be wrong. Mike --
What I would hope is that BUSE would add some zero copy transport in
there. A way to just redirect the BIOs from the BUSE device to other
block devices without copying the data around.
But maybe BUSE is a bad name for that. That would be more a DUSE -
Device mapper in USEr space.
MfG
Goswin.
--
That already exists. http://wiki.xensource.com/xenwiki/DmUserspace Mike --
Well, NBD is the bird in hand, but that doesn't mean it's the best way to do things generically for all block device emulation applications. I'd even argue that NBD should be removed from the kernel and replaced by BUSE plus a user-land daemon. A BUSE interface could be a lot more general, and simpler. Not to mention that converting all block reads and writes to TCP operations that talk to another process on the same machine via the loopback interface seems awfully inefficient. -Archie -- Archie L. Cobbs --
While it is not based upon FUSE, you may be interested in ULOOP driver which is based upon the loopback block device. Here is a README file from http://aufs.sourceforge.net/uloop.txt. If you want to checkout the source files, please refer to http://aufs.sourceforge.net/. Junjiro R. Okajima ---------------------------------------------------------------------- ULOOP -- Loopback block device in userspace (and a sample for HTTP and generic block device) Junjiro Okajima # $Id: 00readme.txt,v 1.6 2008/08/17 23:04:29 sfjro Exp $ 0. Introduction As you know, there is a Loopback block device in Linux, /dev/loop, which enables you to mount a fs-image local file. Also it can adopt a userspace program, such as cryptloop. This sample ULOOP driver makes it generic, and enables to adopt any userspace program. You can give an empty or non-existing file to /dev/loop backend. When a process reads from /dev/loop, this dirver wakes a user process up and passes the I/O transaction to it. A user process makes the required block ready and tells the driver. Then the driver completes the I/O transaction. Also there is sample scripts or usage for diskless nodes working with aufs. This driver may work with it well. The name is unrelated to YouTube. :-) 1. sample for HTTP Simple 'make' will build ./drivers/block/uloop.ko and ./ulohttp. Ulohttp application behaves like losetup(8). Additionally, ulohttp is an actual daemon which handles I/O request. Here is a syntax. ulohttp [-b bitmap] [-c cache] device URL The device is /dev/loopN and the URL is a URL for fs-image file via HTTP. The http server must support byte range (Range: header). The bitmap is a new filename or previously specified as the bitmap for the same URL. Its filesize will be 'the size of the specified fs-image / pagesize (usually 4k) / bits in a byte (8)', and round-up to pagesize. The cache is a new filename or previously specified as the cache for the same URL. Its filesize will be 'the size of the ...
Would it be hard to extend this to block devices as well?
MfG
Goswin
--
