This patch series implements union mounts for every system call except: - xattr-related calls - fchmod()/fchown()/futimensat() on an fd The copyup portion is a proof of concept based around union_path_nd(), which looks up both the target <mnt,dentry> pair and its parent's. If the parent is on a union mount, and the target is on a lower layer, we copy up the target. This patch set is very raw and could use even the most superficial review. Just take a look at Erez's review from December to see how low-hanging the fruit is on this one. -VAL Felix Fietkau (2): whiteout: jffs2 whiteout support fallthru: jffs2 fallthru support Jan Blunck (13): VFS: Make lookup_hash() return a struct path XXX autofs4: Save autofs trigger's vfsmount in super block info whiteout/NFSD: Don't return information about whiteouts to userspace whiteout: Add vfs_whiteout() and whiteout inode operation whiteout: Set S_OPAQUE inode flag when creating directories whiteout: Allow removal of a directory with whiteouts whiteout: tmpfs whiteout support whiteout: Split of ext2_append_link() from ext2_add_link() whiteout: ext2 whiteout support union-mount: Introduce MNT_UNION and MS_UNION flags union-mount: Introduce union_mount structure and basic operations union-mount: Drive the union cache via dcache union-mount: Call do_whiteout() on unlink and rmdir in unions Valerie Aurora (20): VFS: Add read-only users count to superblock fallthru: Basic fallthru definitions fallthru: ext2 fallthru support fallthru: tmpfs fallthru support union-mount: Writable overlays/union mounts documentation union-mount: Implement union lookup union-mount: Support for mounting union mount file systems union-mount: Copy up directory entries on first readdir() VFS: Split inode_permission() and create path_permission() VFS: Create user_path_nd() to lookup both parent and target union-mount: In-kernel copyup routines union-mount: Implement union-aware ...
While we can check if a file system is currently read-only, we can't
guarantee that it will stay read-only. The file system can be
remounted read-write at any time; it's also conceivable that a file
system can be mounted a second time and converted to read-write if the
underlying fs allows it. This is a problem for union mounts, which
require the underlying file system be read-only. Add a read-only
users count and, while there are any read-only users, refuse both
remounts that would change the file system to read-write and new
read-write mounts.
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
fs/namespace.c | 11 +++++++++++
fs/super.c | 23 +++++++++++++++++++++++
include/linux/fs.h | 8 ++++++++
3 files changed, 42 insertions(+), 0 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 1cd59a0..9a40282 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -200,6 +200,17 @@ int __mnt_is_readonly(struct vfsmount *mnt)
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+/* Count one more "hard" read-only user on the superblock behind @mnt. */
+static void inc_hard_readonly_users(struct vfsmount *mnt)
+{
+	struct super_block *sb = mnt->mnt_sb;
+
+	sb->s_hard_readonly_users++;
+}
+
+/*
+ * Drop one "hard" read-only user from the superblock behind @mnt.
+ * Dropping a user when the count is already zero is a bug.
+ */
+static void dec_hard_readonly_users(struct vfsmount *mnt)
+{
+	struct super_block *sb = mnt->mnt_sb;
+
+	BUG_ON(!sb->s_hard_readonly_users);
+	sb->s_hard_readonly_users--;
+}
+
static inline void inc_mnt_writers(struct vfsmount *mnt)
{
#ifdef CONFIG_SMP
diff --git a/fs/super.c b/fs/super.c
index f35ac60..fa9b40b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -117,6 +117,7 @@ out:
*/
static inline void destroy_super(struct super_block *s)
{
+ BUG_ON(s->s_hard_readonly_users);
security_sb_free(s);
kfree(s->s_subtype);
kfree(s->s_options);
@@ -556,6 +557,21 @@ out:
return err;
}
+/*
+ * Some uses of file systems require that they never be mounted
+ * read-write anywhere (e.g., the lower layers of union mounts must
+ * always be read-only). If there are any of these "hard" read-only
+ * mounts, don't permit a transition to ...From: Jan Blunck <jblunck@suse.de>
In the case of a union directory, we don't want the directories on lower
layers of the union to "show through". To prevent the contents of
underlying directories from magically showing up after a mkdir(), we set
the S_OPAQUE flag when a directory is created where a whiteout existed before.
Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
fs/namei.c | 11 ++++++++++-
include/linux/fs.h | 3 +++
2 files changed, 13 insertions(+), 1 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 010927b..956083a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2104,6 +2104,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
int error = may_create(dir, dentry);
+ int opaque = 0;
if (error)
return error;
@@ -2116,9 +2117,17 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (error)
return error;
+ if (d_is_whiteout(dentry))
+ opaque = 1;
+
error = dir->i_op->mkdir(dir, dentry, mode);
- if (!error)
+ if (!error) {
fsnotify_mkdir(dir, dentry);
+ if (opaque) {
+ dentry->d_inode->i_flags |= S_OPAQUE;
+ mark_inode_dirty(dentry->d_inode);
+ }
+ }
return error;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 21102f9..a9f747c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -236,6 +236,7 @@ struct inodes_stat_t {
#define S_NOCMTIME 128 /* Do not update file c/mtime */
#define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE 512 /* Inode is fs-internal */
+#define S_OPAQUE 1024 /* Directory is opaque */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -271,6 +272,8 @@ struct inodes_stat_t {
#define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
#define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
+#define ...From: Jan Blunck <jblunck@suse.de>
do_whiteout() allows removal of a directory when it has whiteouts but
is logically empty.
XXX - This patch abuses readdir() to check if the union directory is
logically empty - that is, all the entries are whiteouts (or "." or
".."). Currently, we have no clean VFS interface to ask the lower
file system if a directory is empty.
Fixes:
- Add ->is_directory_empty() op
- Add is_directory_empty flag to dentry (ugly dcache populate)
- Ask underlying fs to remove it and look for an error return
- (your idea here)
Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
fs/namei.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 88 insertions(+), 0 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 956083a..991767b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2307,6 +2307,94 @@ int path_whiteout(struct path *dir_path, struct dentry *dentry, int isdir)
EXPORT_SYMBOL(path_whiteout);
/*
+ * XXX - We are abusing readdir to check if a union directory is
+ * logically empty.
+ */
+/*
+ * readdir callback used by directory_is_empty().
+ *
+ * @__buf points at the caller's int "is empty" flag.  The flag is
+ * cleared as soon as an entry is seen that is neither ".", ".." nor a
+ * whiteout (d_type == DT_WHT).  @offset and @ino are not used.
+ *
+ * Always returns 0.
+ */
+static int filldir_is_empty(void *__buf, const char *name, int namlen,
+			    loff_t offset, u64 ino, unsigned int d_type)
+{
+	int *empty_flag = __buf;
+	int is_dot = (namlen == 1 && name[0] == '.');
+	int is_dotdot = (namlen == 2 && name[0] == '.' && name[1] == '.');
+
+	/* ".", ".." and whiteouts do not make a directory non-empty */
+	if (!is_dot && !is_dotdot && d_type != DT_WHT)
+		*empty_flag = 0;
+
+	return 0;
+}
+
+/*
+ * Check whether a directory is logically empty, i.e. whether readdir
+ * reports nothing but ".", ".." and whiteouts (see filldir_is_empty()).
+ *
+ * @dentry: directory to check; must be a directory inode
+ * @mnt:    mount the directory is opened on
+ *
+ * Returns 1 if the directory is logically empty and 0 otherwise.  If
+ * the directory cannot be opened or the readdir fails, emptiness cannot
+ * be proven, so 0 is returned in those cases as well.
+ */
+static int directory_is_empty(struct dentry *dentry, struct vfsmount *mnt)
+{
+	struct file *file;
+	int err;
+	int is_empty = 1;
+
+	BUG_ON(!S_ISDIR(dentry->d_inode->i_mode));
+
+	/* references for the file pointer */
+	dget(dentry);
+	mntget(mnt);
+
+	file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
+	if (IS_ERR(file))
+		return 0;
+
+	err = vfs_readdir(file, filldir_is_empty, &is_empty);
+	fput(file);
+
+	/*
+	 * A failed scan may not have visited every entry, so it must
+	 * not be reported as "empty"; treat it like the open failure.
+	 */
+	if (err < 0)
+		return 0;
+
+	return is_empty;
+}
+
+static int do_whiteout(struct nameidata ...From: Jan Blunck <jblunck@suse.de> Add support for whiteout dentries to tmpfs. This includes adding support for whiteouts to d_genocide(), which is called to tear down pinned tmpfs dentries. Whiteouts have to be persistent, so they have a pinning extra ref count that needs to be dropped by d_genocide(). Signed-off-by: Jan Blunck <jblunck@suse.de> Signed-off-by: David Woodhouse <dwmw2@infradead.org> Signed-off-by: Valerie Aurora <vaurora@redhat.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: linux-mm@kvack.org --- fs/dcache.c | 13 +++++- mm/shmem.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 147 insertions(+), 15 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 265015d..3b0e525 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2229,7 +2229,18 @@ resume: struct list_head *tmp = next; struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - if (d_unhashed(dentry)||!dentry->d_inode) + /* + * Skip unhashed and negative dentries, but process + * positive dentries and whiteouts. A whiteout looks + * kind of like a negative dentry for purposes of + * lookup, but it has an extra pinning ref count + * because it can't be evicted like a negative dentry + * can. What we care about here is ref counts - and + * we need to drop the ref count on a whiteout before + * we can evict it. + */ + if (d_unhashed(dentry)||(!dentry->d_inode && + !d_is_whiteout(dentry))) continue; if (!list_empty(&dentry->d_subdirs)) { this_parent = dentry; diff --git a/mm/shmem.c b/mm/shmem.c index eef4ebe..c58ecf4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1805,6 +1805,76 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static int shmem_rmdir(struct inode *dir, struct dentry *dentry); +static int shmem_unlink(struct inode *dir, struct dentry *dentry); + +/* + * This is the whiteout support for tmpfs. It uses one ...
From: Jan Blunck <jblunck@suse.de> The ext2_append_link() is later used to find or append a directory entry to whiteout. Signed-off-by: Jan Blunck <jblunck@suse.de> Signed-off-by: Valerie Aurora <vaurora@redhat.com> Cc: Theodore Tso <tytso@mit.edu> Cc: linux-ext4@vger.kernel.org --- fs/ext2/dir.c | 70 ++++++++++++++++++++++++++++++++++++++++---------------- 1 files changed, 50 insertions(+), 20 deletions(-) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 7516957..57207a9 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -472,9 +472,10 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, } /* - * Parent is locked. + * Find or append a given dentry to the parent directory */ -int ext2_add_link (struct dentry *dentry, struct inode *inode) +static ext2_dirent * ext2_append_entry(struct dentry * dentry, + struct page ** page) { struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; @@ -482,13 +483,10 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) unsigned chunk_size = ext2_chunk_size(dir); unsigned reclen = EXT2_DIR_REC_LEN(namelen); unsigned short rec_len, name_len; - struct page *page = NULL; - ext2_dirent * de; + ext2_dirent * de = NULL; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; - loff_t pos; - int err; /* * We take care of directory expansion in the same loop. @@ -498,20 +496,19 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *dir_end; - page = ext2_get_page(dir, n, 0); - err = PTR_ERR(page); - if (IS_ERR(page)) + *page = ext2_get_page(dir, n, 0); + de = ERR_PTR(PTR_ERR(*page)); + if (IS_ERR(*page)) goto out; - lock_page(page); - kaddr = page_address(page); + lock_page(*page); + kaddr = page_address(*page); dir_end = kaddr + ext2_last_byte(dir, n); de = (ext2_dirent *)kaddr; kaddr += PAGE_CACHE_SIZE - reclen; while ((char ...
Add support for fallthru directory entries to tmpfs
XXX - Makes up inode number for dirent
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
fs/dcache.c | 3 +-
fs/libfs.c | 21 +++++++++++++++++--
mm/shmem.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
3 files changed, 73 insertions(+), 11 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index b76f9e4..1575af4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2240,7 +2240,8 @@ resume:
* we can evict it.
*/
if (d_unhashed(dentry)||(!dentry->d_inode &&
- !d_is_whiteout(dentry)))
+ !d_is_whiteout(dentry) &&
+ !d_is_fallthru(dentry)))
continue;
if (!list_empty(&dentry->d_subdirs)) {
this_parent = dentry;
diff --git a/fs/libfs.c b/fs/libfs.c
index 9e50bcf..cb24772 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -133,6 +133,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
struct dentry *cursor = filp->private_data;
struct list_head *p, *q = &cursor->d_u.d_child;
ino_t ino;
+ int d_type;
int i = filp->f_pos;
switch (i) {
@@ -158,14 +159,28 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
struct dentry *next;
next = list_entry(p, struct dentry, d_u.d_child);
- if (d_unhashed(next) || !next->d_inode)
+ if (d_unhashed(next) || (!next->d_inode && !d_is_fallthru(next)))
continue;
+ if (d_is_fallthru(next)) {
+ /* XXX We don't know the inode
+ * number of the directory
+ * entry in the underlying
+ * file system. Should look
+ * it up, either on fallthru
+ * creation at first readdir
+ * or now at filldir time. */
+ ino = 123; /* Made up ino */
+ d_type = DT_UNKNOWN;
+ } else {
+ ino = next->d_inode->i_ino;
+ d_type = dt_type(next->d_inode);
+ }
+
spin_unlock(&dcache_lock);
if (filldir(dirent, next->d_name.name,
...Document design and implementation of writable overlays (a.k.a. union mounts). XXX - out of date Signed-off-by: Valerie Aurora <vaurora@redhat.com> --- Documentation/filesystems/union-mounts.txt | 708 ++++++++++++++++++++++++++++ 1 files changed, 708 insertions(+), 0 deletions(-) create mode 100644 Documentation/filesystems/union-mounts.txt diff --git a/Documentation/filesystems/union-mounts.txt b/Documentation/filesystems/union-mounts.txt new file mode 100644 index 0000000..5f47296 --- /dev/null +++ b/Documentation/filesystems/union-mounts.txt @@ -0,0 +1,708 @@ +State of writable overlays (formerly union mounts) +================================================== + +This version of union mounts is renamed "writable overlays." The goal +of this patch set is to support a single read-write file system +overlaid on a single read-only file system. "Union mounts" suggests +that we support unions of arbitrary numbers and types of file systems, +which is not the goal of this patch set. + +The most recent version of writable overlays can boot to multi-user +mode with a writable overlay root file system. open(), truncate(), +creat(), unlink(), mkdir(), rmdir(), and rename() work. link(), +chmod(), chown(), and chattr() don't work yet. + +This document describes the architecture and current status of +writable overlays, including an item-by-item todo list. + +Writable overlays (formerly union mounts) +========================================= + +In this document: + - Overview of writable overlays + - Terminology + - VFS implementation + - Locking strategy + - VFS/file system interface + - Userland interface + - NFS interaction + - Status + - Contributing to writable overlays + +Overview +======== + +Writable overlays (formerly known as union mounts) are used to layer a +single writable file system over a single read-only file system, with +all writes going to the writable file system. The namespace of both +file systems appears as a combined whole to ...
From: Jan Blunck <jblunck@suse.de>
If a dentry is removed from dentry cache because its usage count drops to
zero, the references to the underlying layer of the unions the dentry is in
are dropped too. Therefore the union cache is driven by the dentry cache.
Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
fs/dcache.c | 13 +++++++++++
fs/union.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/dcache.h | 8 ++++++
include/linux/union.h | 4 +++
4 files changed, 81 insertions(+), 0 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 05c3a1e..983a1ea 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -18,6 +18,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/union.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
@@ -175,6 +176,8 @@ static struct dentry *d_kill(struct dentry *dentry)
dentry_stat.nr_dentry--; /* For d_free, below */
/*drops the locks, at that point nobody can reach this dentry */
dentry_iput(dentry);
+ /* If the dentry was in an union delete them */
+ shrink_d_unions(dentry);
if (IS_ROOT(dentry))
parent = NULL;
else
@@ -696,6 +699,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
iput(inode);
}
+ shrink_d_unions(dentry);
d_free(dentry);
/* finished when we fall off the top of the tree,
@@ -1535,7 +1539,9 @@ void d_delete(struct dentry * dentry)
spin_lock(&dentry->d_lock);
isdir = S_ISDIR(dentry->d_inode->i_mode);
if (atomic_read(&dentry->d_count) == 1) {
+ __d_drop_unions(dentry);
dentry_iput(dentry);
+ shrink_d_unions(dentry);
fsnotify_nameremove(dentry, isdir);
return;
}
@@ -1546,6 +1552,13 @@ void d_delete(struct dentry * dentry)
spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
+ /*
+ * Remove any associated unions. While someone still has this
+ * ...Create and tear down union mount structures on mount. Check
requirements for union mounts.
Thanks to Felix Fietkau <nbd@openwrt.org> for a bug fix.
Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
fs/namespace.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++++-
fs/union.c | 63 ++++++++++++++++++++++++
include/linux/union.h | 4 ++
3 files changed, 196 insertions(+), 1 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 5e4b27b..e19a432 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,7 @@
#include <linux/log2.h>
#include <linux/idr.h>
#include <linux/fs_struct.h>
+#include <linux/union.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
@@ -157,6 +158,9 @@ struct vfsmount *alloc_vfsmnt(const char *name)
#else
mnt->mnt_writers = 0;
#endif
+#ifdef CONFIG_UNION_MOUNT
+ INIT_LIST_HEAD(&mnt->mnt_unions);
+#endif
}
return mnt;
@@ -492,6 +496,7 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
{
+ detach_mnt_union(mnt);
old_path->dentry = mnt->mnt_mountpoint;
old_path->mnt = mnt->mnt_parent;
mnt->mnt_parent = mnt;
@@ -515,6 +520,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
list_add_tail(&mnt->mnt_hash, mount_hashtable +
hash(path->mnt, path->dentry));
list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
+ attach_mnt_union(mnt, path->mnt);
}
/*
@@ -537,6 +543,7 @@ static void commit_tree(struct vfsmount *mnt)
list_add_tail(&mnt->mnt_hash, mount_hashtable +
hash(parent, mnt->mnt_mountpoint));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+ attach_mnt_union(mnt, parent);
touch_mnt_namespace(n);
}
@@ -1025,6 +1032,7 @@ void release_mounts(struct list_head *head)
struct dentry *dentry;
struct vfsmount *m;
...From: Jan Blunck <jblunck@suse.de>
Call do_whiteout() when removing files and directories from a union
mounted file system.
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
fs/namei.c | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index b179062..900df0f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2670,6 +2670,10 @@ static long do_rmdir(int dfd, const char __user *pathname)
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit3;
+ if (IS_UNIONED_DIR(&nd.path)) {
+ error = do_whiteout(&nd, &path, 1);
+ goto exit4;
+ }
error = security_path_rmdir(&nd.path, path.dentry);
if (error)
goto exit4;
@@ -2759,6 +2763,10 @@ static long do_unlinkat(int dfd, const char __user *pathname)
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit2;
+ if (IS_UNIONED_DIR(&nd.path)) {
+ error = do_whiteout(&nd, &path, 0);
+ goto exit3;
+ }
error = security_path_unlink(&nd.path, path.dentry);
if (error)
goto exit3;
--
1.6.3.3
--
For union mounts, an access check on a file located on the lower layer
incorrectly returns EROFS. To fix this, use the new
path_permission() call, which ignores a read-only lower layer file
system if the target will be copied up to the topmost file system.
---
fs/open.c | 20 ++++++++++++++++----
1 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index e17f544..686fcd2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -454,7 +454,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
const struct cred *old_cred;
struct cred *override_cred;
struct path path;
+ struct nameidata nd;
+ struct vfsmount *mnt;
struct inode *inode;
+ char *tmp;
int res;
if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
@@ -478,10 +481,17 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
old_cred = override_creds(override_cred);
- res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+ res = user_path_nd(dfd, filename, LOOKUP_FOLLOW,
+ &nd, &path, &tmp);
if (res)
goto out;
+ /* For union mounts, use the topmost mnt's permissions */
+ if (IS_UNIONED_DIR(&nd.path))
+ mnt = nd.path.mnt;
+ else
+ mnt = path.mnt;
+
inode = path.dentry->d_inode;
if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
@@ -490,11 +500,11 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
* with the "noexec" flag.
*/
res = -EACCES;
- if (path.mnt->mnt_flags & MNT_NOEXEC)
+ if (mnt->mnt_flags & MNT_NOEXEC)
goto out_path_release;
}
- res = inode_permission(inode, mode | MAY_ACCESS);
+ res = path_permission(&path, &nd.path, mode | MAY_ACCESS);
/* SuS v2 requires we report a read only fs too */
if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
goto out_path_release;
@@ -508,11 +518,13 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
* inherently racy and know ...Copy up a file when opened with write permissions. Does not copy up
the file data when O_TRUNC is specified.
---
fs/namei.c | 28 ++++++++++++++++++++++++++++
1 files changed, 28 insertions(+), 0 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index a6f7d5d..85a5451 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1929,6 +1929,24 @@ exit:
return ERR_PTR(error);
}
+/*
+ * Copy up the target of an open before it is used.
+ *
+ * With O_TRUNC set in @open_flag the copyup is done through
+ * union_copyup_len() with a length of 0; otherwise union_copyup() is
+ * used.  Returns 0 on success or the error from the copyup routine.
+ *
+ * NOTE(review): when the copyup replaced path->mnt, a reference on
+ * nd->path.mnt is dropped here - presumably to balance the reference
+ * handling done by the caller afterwards; confirm against
+ * union_copyup() and path_to_nameidata().
+ */
+static int open_union_copyup(struct nameidata *nd, struct path *path,
+			     int open_flag)
+{
+	struct vfsmount *mnt_before = path->mnt;
+	int error = (open_flag & O_TRUNC) ? union_copyup_len(nd, path, 0)
+					  : union_copyup(nd, path);
+
+	if (error)
+		return error;
+
+	if (path->mnt != mnt_before)
+		mntput(nd->path.mnt);
+
+	return 0;
+}
+
static struct file *do_last(struct nameidata *nd, struct path *path,
int open_flag, int acc_mode,
int mode, const char *pathname,
@@ -1979,6 +1997,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
error = -ENOTDIR;
if (*want_dir && !path->dentry->d_inode->i_op->lookup)
goto exit_dput;
+ if (acc_mode & MAY_WRITE) {
+ error = open_union_copyup(nd, path, open_flag);
+ if (error)
+ goto exit_dput;
+ }
path_to_nameidata(path, nd);
audit_inode(pathname, nd->path.dentry);
goto ok;
@@ -2050,6 +2073,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
if (path->dentry->d_inode->i_op->follow_link)
return NULL;
+ if (acc_mode & MAY_WRITE) {
+ error = open_union_copyup(nd, path, open_flag);
+ if (error)
+ goto exit_dput;
+ }
path_to_nameidata(path, nd);
error = -EISDIR;
if (S_ISDIR(path->dentry->d_inode->i_mode))
--
1.6.3.3
--
XXX - doesn't implement NOFOLLOW correctly
---
fs/utimes.c | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db..82feca2 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -8,6 +8,7 @@
#include <linux/stat.h>
#include <linux/utime.h>
#include <linux/syscalls.h>
+#include <linux/union.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -152,18 +153,26 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
error = utimes_common(&file->f_path, times);
fput(file);
} else {
+ struct nameidata nd;
+ char *tmp;
struct path path;
int lookup_flags = 0;
if (!(flags & AT_SYMLINK_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
- error = user_path_at(dfd, filename, lookup_flags, &path);
+ error = user_path_nd(dfd, filename, lookup_flags, &nd, &path,
+ &tmp);
if (error)
goto out;
- error = utimes_common(&path, times);
+ error = union_copyup(&nd, &path);
+
+ if (!error)
+ error = utimes_common(&path, times);
path_put(&path);
+ path_put(&nd.path);
+ putname(tmp);
}
out:
--
1.6.3.3
--
---
fs/open.c | 23 ++++++++++++++++++++---
1 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index 6ec99e9..dc65b27 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -812,18 +812,35 @@ out:
SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
struct path path;
+ struct nameidata nd;
+ struct vfsmount *mnt;
+ char *tmp;
int error;
- error = user_lpath(filename, &path);
+ error = user_path_nd(AT_FDCWD, filename, 0, &nd, &path, &tmp);
if (error)
goto out;
- error = mnt_want_write(path.mnt);
+
+ if (IS_UNIONED_DIR(&nd.path))
+ mnt = nd.path.mnt;
+ else
+ mnt = path.mnt;
+
+ error = mnt_want_write(mnt);
if (error)
goto out_release;
+
+ error = union_copyup(&nd, &path);
+ if (error)
+ goto out_drop_write;
+
error = chown_common(&path, user, group);
- mnt_drop_write(path.mnt);
+out_drop_write:
+ mnt_drop_write(mnt);
out_release:
path_put(&path);
+ path_put(&nd.path);
+ putname(tmp);
out:
return error;
}
--
1.6.3.3
--
---
fs/open.c | 25 +++++++++++++++++++++----
1 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index dda1b6f..6ec99e9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -669,18 +669,32 @@ out:
SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
{
struct path path;
+ struct nameidata nd;
+ struct vfsmount *mnt;
struct inode *inode;
+ char *tmp;
int error;
struct iattr newattrs;
- error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+ error = user_path_nd(dfd, filename, LOOKUP_FOLLOW, &nd,
+ &path, &tmp);
if (error)
goto out;
- inode = path.dentry->d_inode;
- error = mnt_want_write(path.mnt);
+ if (IS_UNIONED_DIR(&nd.path))
+ mnt = nd.path.mnt;
+ else
+ mnt = path.mnt;
+
+ error = mnt_want_write(mnt);
if (error)
goto dput_and_out;
+
+ error = union_copyup(&nd, &path);
+ if (error)
+ goto mnt_drop_write_and_out;
+
+ inode = path.dentry->d_inode;
mutex_lock(&inode->i_mutex);
error = security_path_chmod(path.dentry, path.mnt, mode);
if (error)
@@ -692,9 +706,12 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
error = notify_change(path.dentry, &newattrs);
out_unlock:
mutex_unlock(&inode->i_mutex);
- mnt_drop_write(path.mnt);
+mnt_drop_write_and_out:
+ mnt_drop_write(mnt);
dput_and_out:
path_put(&path);
+ path_put(&nd.path);
+ putname(tmp);
out:
return error;
}
--
1.6.3.3
--
---
fs/open.c | 24 ++++++++++++++++++++----
1 files changed, 20 insertions(+), 4 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index 325852d..dda1b6f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -230,14 +230,17 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
static long do_sys_truncate(const char __user *pathname, loff_t length)
{
struct path path;
+ struct nameidata nd;
+ struct vfsmount *mnt;
struct inode *inode;
+ char *tmp;
int error;
error = -EINVAL;
if (length < 0) /* sorry, but loff_t says... */
goto out;
- error = user_path(pathname, &path);
+ error = user_path_nd(AT_FDCWD, pathname, 0, &nd, &path, &tmp);
if (error)
goto out;
inode = path.dentry->d_inode;
@@ -251,11 +254,16 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
if (!S_ISREG(inode->i_mode))
goto dput_and_out;
- error = mnt_want_write(path.mnt);
+ if (IS_UNIONED_DIR(&nd.path))
+ mnt = nd.path.mnt;
+ else
+ mnt = path.mnt;
+
+ error = mnt_want_write(mnt);
if (error)
goto dput_and_out;
- error = inode_permission(inode, MAY_WRITE);
+ error = path_permission(&path, &nd.path, MAY_WRITE);
if (error)
goto mnt_drop_write_and_out;
@@ -263,6 +271,12 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
if (IS_APPEND(inode))
goto mnt_drop_write_and_out;
+ error = union_copyup_len(&nd, &path, length);
+ if (error)
+ goto mnt_drop_write_and_out;
+
+ /* path may have changed after copyup */
+ inode = path.dentry->d_inode;
error = get_write_access(inode);
if (error)
goto mnt_drop_write_and_out;
@@ -284,9 +298,11 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
put_write_and_out:
put_write_access(inode);
mnt_drop_write_and_out:
- mnt_drop_write(path.mnt);
+ mnt_drop_write(mnt);
dput_and_out:
path_put(&path);
+ path_put(&nd.path);
+ putname(tmp);
out:
return error;
}
--
1.6.3.3
--
Proof-of-concept implementation of chown() for union mounts.
---
fs/open.c | 24 +++++++++++++++++++++---
1 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index 686fcd2..325852d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -30,6 +30,7 @@
#include <linux/falloc.h>
#include <linux/fs_struct.h>
#include <linux/ima.h>
+#include <linux/union.h>
#include "internal.h"
@@ -717,18 +718,35 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
struct path path;
+ struct nameidata nd;
+ struct vfsmount *mnt;
+ char *tmp;
int error;
- error = user_path(filename, &path);
+ error = user_path_nd(AT_FDCWD, filename, LOOKUP_FOLLOW,
+ &nd, &path, &tmp);
if (error)
goto out;
- error = mnt_want_write(path.mnt);
+
+ if (IS_UNIONED_DIR(&nd.path))
+ mnt = nd.path.mnt;
+ else
+ mnt = path.mnt;
+
+ error = mnt_want_write(mnt);
if (error)
goto out_release;
+
+ error = union_copyup(&nd, &path);
+ if (error)
+ goto out_drop_write;
error = chown_common(&path, user, group);
- mnt_drop_write(path.mnt);
+out_drop_write:
+ mnt_drop_write(mnt);
out_release:
path_put(&path);
+ path_put(&nd.path);
+ putname(tmp);
out:
return error;
}
--
1.6.3.3
--
On rename() of a file on union mount, copyup and whiteout the source
file. Both are done under the rename mutex. I believe this is
actually atomic.
XXX - May not need to do file copyup under the lock.
---
fs/namei.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 70 insertions(+), 5 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 5f6dcd4..a6f7d5d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3233,6 +3233,7 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
{
struct dentry *old_dir, *new_dir;
struct path old, new;
+ struct path to_whiteout = {NULL, NULL};
struct dentry *trap;
struct nameidata oldnd, newnd;
char *from;
@@ -3248,12 +3249,9 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
goto exit1;
error = -EXDEV;
+ /* Union mounts will pass below test - dirs always on topmost */
if (oldnd.path.mnt != newnd.path.mnt)
goto exit2;
- /* Rename on union mounts not implemented yet */
- /* XXX much harsher check than necessary - can do some renames */
- if (IS_UNIONED_DIR(&oldnd.path) || IS_UNIONED_DIR(&newnd.path))
- goto exit2;
old_dir = oldnd.path.dentry;
error = -EBUSY;
if (oldnd.last_type != LAST_NORM)
@@ -3276,7 +3274,7 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
error = -ENOENT;
if (!old.dentry->d_inode)
goto exit4;
- /* unless the source is a directory trailing slashes give -ENOTDIR */
+ /* unless the source is a directory, trailing slashes give -ENOTDIR */
if (!S_ISDIR(old.dentry->d_inode->i_mode)) {
error = -ENOTDIR;
if (oldnd.last.name[oldnd.last.len])
@@ -3288,6 +3286,11 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
error = -EINVAL;
if (old.dentry == trap)
goto exit4;
+ error = -EXDEV;
+ /* Can't rename a directory from a lower layer */
+ if (IS_UNIONED_DIR(&oldnd.path) &&
+ IS_UNIONED_DIR(&old))
+ goto exit4;
error = lookup_hash(&newnd, &newnd.last, ...---
fs/namei.c | 24 ++++++++++++++++++++----
1 files changed, 20 insertions(+), 4 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 68aa8ab..5f6dcd4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3019,16 +3019,18 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
{
struct dentry *new_dentry;
struct nameidata nd;
+ struct nameidata old_nd;
struct path old_path;
int error;
char *to;
+ char *from;
if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
return -EINVAL;
- error = user_path_at(olddfd, oldname,
+ error = user_path_nd(olddfd, oldname,
flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
- &old_path);
+ &old_nd, &old_path, &from);
if (error)
return error;
@@ -3036,8 +3038,20 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
if (error)
goto out;
error = -EXDEV;
- if (old_path.mnt != nd.path.mnt)
- goto out_release;
+ if (old_path.mnt != nd.path.mnt) {
+ if (IS_UNIONED_DIR(&old_nd.path) &&
+ (old_nd.path.mnt == nd.path.mnt)) {
+ error = mnt_want_write(old_nd.path.mnt);
+ if (error)
+ goto out_release;
+ error = union_copyup(&old_nd, &old_path);
+ mnt_drop_write(old_nd.path.mnt);
+ if (error)
+ goto out_release;
+ } else {
+ goto out_release;
+ }
+ }
new_dentry = lookup_create(&nd, 0);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
@@ -3060,6 +3074,8 @@ out_release:
putname(to);
out:
path_put(&old_path);
+ path_put(&old_nd.path);
+ putname(from);
return error;
}
--
1.6.3.3
--
When a file on the read-only layer of a union mount is altered, it
must be copied up to the topmost read-write layer. This patch creates
union_copyup() and its supporting routines.
---
fs/union.c | 246 +++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/union.h | 7 +-
2 files changed, 252 insertions(+), 1 deletions(-)
diff --git a/fs/union.c b/fs/union.c
index e2384ad..944c720 100644
--- a/fs/union.c
+++ b/fs/union.c
@@ -26,6 +26,7 @@
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/security.h>
+#include <linux/splice.h>
/*
* This is borrowed from fs/inode.c. The hashtable for lookups. Somebody
@@ -633,3 +634,248 @@ out_fput:
mnt_drop_write(topmost_path->mnt);
return res;
}
+
+/**
+ * union_create_file
+ *
+ * @nd: nameidata for source file
+ * @old: path of the source file
+ * @new: path of the new file, negative dentry
+ *
+ * Must already have mnt_want_write() on the mnt and the parent's
+ * i_mutex.
+ */
+
+static int union_create_file(struct nameidata *nd, struct path *old,
+ struct dentry *new)
+{
+ struct path *parent = &nd->path;
+ BUG_ON(!mutex_is_locked(&parent->dentry->d_inode->i_mutex));
+
+ return vfs_create(parent->dentry->d_inode, new,
+ old->dentry->d_inode->i_mode, nd);
+}
+
+/**
+ * union_create_symlink
+ *
+ * @nd: nameidata for source symlink
+ * @old: path of the source symlink
+ * @new: path of the new symlink, negative dentry
+ *
+ * Must already have mnt_want_write() on the mnt and the parent's
+ * i_mutex.
+ */
+
+static int union_create_symlink(struct nameidata *nd, struct path *old,
+ struct dentry *new)
+{
+ void *cookie;
+ int error;
+
+ BUG_ON(!mutex_is_locked(&nd->path.dentry->d_inode->i_mutex));
+
+ printk(KERN_INFO "%s: copying up symlink\n", new->d_name.name);
+ /*
+ * We want the contents of this symlink, not to follow it, so
+ * this is modeled on generic_readlink() rather than
+ * do_follow_link().
+ */
+ nd->depth = ...Proof-of-concept implementation of user_path_nd(). Lookup both the
parent and the target of a user-supplied filename, to supply later to
union copyup routines.
---
fs/namei.c | 31 +++++++++++++++++++++++++++++++
include/linux/namei.h | 2 ++
2 files changed, 33 insertions(+), 0 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 24e0cb2..68aa8ab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1563,6 +1563,37 @@ static int user_path_parent(int dfd, const char __user *path,
return error;
}
+int user_path_nd(int dfd, const char __user *filename,
+ unsigned flags, struct nameidata *parent_nd,
+ struct path *child, char **tmp)
+{
+ struct nameidata child_nd;
+ char *s = getname(filename);
+ int error;
+
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ /* Lookup parent */
+ error = do_path_lookup(dfd, s, LOOKUP_PARENT, parent_nd);
+ if (error)
+ goto out_putname;
+
+ /* Lookup child - XXX optimize, racy */
+ error = do_path_lookup(dfd, s, flags, &child_nd);
+ if (error)
+ goto out_path_put;
+ *child = child_nd.path;
+ *tmp = s;
+ return 0;
+
+out_path_put:
+ path_put(&parent_nd->path);
+out_putname:
+ putname(s);
+ return error;
+}
+
/*
* It's inline, so penalty for filesystems that don't use sticky bit is
* minimal.
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 05b441d..83dc8b5 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -58,6 +58,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
#define LOOKUP_RENAME_TARGET 0x0800
extern int user_path_at(int, const char __user *, unsigned, struct path *);
+extern int user_path_nd(int, const char __user *, unsigned,
+ struct nameidata *, struct path *, char **);
#define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path)
#define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path)
--
1.6.3.3
--
Split inode_permission() into inode and file-system-dependent parts.
Create path_permission() to check permission based on the path to the
inode. This is for union mounts, in which an inode can be located on
a read-only lower layer file system but is still writable, since we
will copy it up to the writable top layer file system. So in that
case, we want to ignore MS_RDONLY on the lower layer. To make this
decision, we must know the path (vfsmount, dentry) of both the target
and its parent.
---
fs/namei.c | 92 ++++++++++++++++++++++++++++++++++++++++++++--------
include/linux/fs.h | 1 +
2 files changed, 79 insertions(+), 14 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 900df0f..24e0cb2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -241,29 +241,20 @@ int generic_permission(struct inode *inode, int mask,
}
/**
- * inode_permission - check for access rights to a given inode
+ * __inode_permission - check for access rights to a given inode
* @inode: inode to check permission on
* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
*
* Used to check for read/write/execute permissions on an inode.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things.
+ *
+ * This does not check for a read-only file system. You probably want
+ * inode_permission().
*/
-int inode_permission(struct inode *inode, int mask)
+static int __inode_permission(struct inode *inode, int mask)
{
int retval;
if (mask & MAY_WRITE) {
- umode_t mode = inode->i_mode;
-
- /*
- * Nobody gets write access to a read-only fs.
- */
- if (IS_RDONLY(inode) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
- return -EROFS;
-
/*
* Nobody gets write access to an immutable file.
*/
@@ -288,6 +279,79 @@ int inode_permission(struct inode *inode, int mask)
}
/**
+ * sb_permission - check superblock-level ...readdir() in union mounts is implemented by copying up all visible
directory entries from the lower level directories to the topmost
directory. Directory entries that refer to lower level file system
objects are marked as "fallthru" in the topmost directory.
Thanks to Felix Fietkau <nbd@openwrt.org> for a bug fix.
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Signed-off-by: Felix Fietkau <nbd@openwrt.org>
---
fs/readdir.c | 9 +++
fs/union.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/union.h | 2 +
3 files changed, 171 insertions(+), 0 deletions(-)
diff --git a/fs/readdir.c b/fs/readdir.c
index 3a48491..da71515 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -16,6 +16,8 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
+#include <linux/union.h>
+#include <linux/mount.h>
#include <asm/uaccess.h>
@@ -36,9 +38,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
res = -ENOENT;
if (!IS_DEADDIR(inode)) {
+ if (IS_UNIONED_DIR(&file->f_path) && !IS_OPAQUE(inode)) {
+ res = union_copyup_dir(&file->f_path);
+ if (res)
+ goto out_unlock;
+ }
+
res = file->f_op->readdir(file, buf, filler);
file_accessed(file);
}
+out_unlock:
mutex_unlock(&inode->i_mutex);
out:
return res;
diff --git a/fs/union.c b/fs/union.c
index 8ad9de7..e2384ad 100644
--- a/fs/union.c
+++ b/fs/union.c
@@ -5,6 +5,7 @@
* Copyright (C) 2007-2009 Novell Inc.
*
* Author(s): Jan Blunck (j.blunck@tu-harburg.de)
+ * Valerie Aurora <vaurora@redhat.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
@@ -23,6 +24,8 @@
#include <linux/slab.h>
#include <linux/union.h>
#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/security.h>
/*
* This is borrowed from fs/inode.c. The hashtable for lookups. ...Implement unioned directories, whiteouts, and fallthrus in pathname lookup routines. do_lookup() and lookup_hash() call lookup_union() after looking up the dentry from the top-level file system. lookup_union() is centered around __lookup_hash(), which does cached and/or real lookups and revalidates each dentry in the union stack. The added cost to a non-union mount pathname lookup in a CONFIG_UNION_MOUNT kernel is either one or two mount flag tests per pathname component, in needs_union_lookup(). XXX - implement negative union cache entries --- fs/namei.c | 191 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/union.c | 67 +++++++++++++++++ include/linux/union.h | 9 +++ 3 files changed, 266 insertions(+), 1 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 991767b..b179062 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> +#include <linux/union.h> #include <asm/uaccess.h> #include "internal.h" @@ -722,6 +723,181 @@ static __always_inline void follow_dotdot(struct nameidata *nd) follow_mount(&nd->path); } +static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base, + struct nameidata *nd); + +/* + * __lookup_union - Given a path from the topmost layer, lookup and + * revalidate each dentry in its union stack, building it if necessary + * + * @nd - nameidata for the parent of @topmost + * @name - pathname from this element on + * @topmost - path of the topmost matching dentry + * + * Given the nameidata and the path of the topmost dentry for this + * pathname, lookup, revalidate, and build the associated union stack. + * @topmost must be either a negative dentry or a directory. + * + * This function is called both to build a new union stack and to + * revalidate a pre-existing union stack. So we must cope with + * already existing union cache entries. 
+ * + * This function may stomp ...
From: Jan Blunck <jblunck@suse.de> This patch adds the basic structures and operations of VFS-based union mounts (but not the ability to mount or lookup unioned file systems). Each directory in a unioned file system has an associated union stack created when the directory is first looked up. The union stack is a structure kept in a hash table indexed by mount and dentry of the directory; thus, specific paths are unioned, not dentries alone. The union stack keeps a pointer to the upper path and the lower path and can be looked up by either path. This particular version of union mounts is based on ideas by Jan Blunck, Bharata Rao, and many others. Signed-off-by: Jan Blunck <jblunck@suse.de> Signed-off-by: Valerie Aurora <vaurora@redhat.com> --- fs/Kconfig | 13 ++ fs/Makefile | 1 + fs/dcache.c | 4 + fs/union.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/dcache.h | 20 ++++ include/linux/mount.h | 3 + include/linux/union.h | 53 +++++++++ 7 files changed, 383 insertions(+), 0 deletions(-) create mode 100644 fs/union.c create mode 100644 include/linux/union.h diff --git a/fs/Kconfig b/fs/Kconfig index 7405f07..c16b9db 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -59,6 +59,19 @@ source "fs/notify/Kconfig" source "fs/quota/Kconfig" +config UNION_MOUNT + bool "Writable overlays (union mounts) (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Writable overlays allow you to mount a transparent writable + layer over a read-only file system, for example, an ext3 + partition on a hard drive over a CD-ROM root file system + image. + + See <file:Documentation/filesystems/union-mounts.txt> for details. + + If unsure, say N. + source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index c3633aa..9693730 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -52,6 +52,7 @@ ...
From: Jan Blunck <jblunck@suse.de>
Add per mountpoint flag for Union Mount support. You need additional patches
to util-linux for that to work - see:
git://git.kernel.org/pub/scm/utils/util-linux-ng/val/util-linux-ng.git
Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
fs/namespace.c | 5 ++++-
include/linux/fs.h | 1 +
include/linux/mount.h | 4 ++--
3 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 9a40282..5e4b27b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -808,6 +808,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
{ MNT_NODIRATIME, ",nodiratime" },
{ MNT_RELATIME, ",relatime" },
{ MNT_STRICTATIME, ",strictatime" },
+ { MNT_UNION, ",union" },
{ 0, NULL }
};
const struct proc_fs_info *fs_infop;
@@ -2018,10 +2019,12 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
if (flags & MS_RDONLY)
mnt_flags |= MNT_READONLY;
+ if (flags & MS_UNION)
+ mnt_flags |= MNT_UNION;
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
- MS_STRICTATIME);
+ MS_STRICTATIME | MS_UNION);
if (flags & MS_REMOUNT)
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a5ba718..4dae882 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -192,6 +192,7 @@ struct inodes_stat_t {
#define MS_REMOUNT 32 /* Alter flags of a mounted FS */
#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
#define MS_DIRSYNC 128 /* Directory modifications are synchronous */
+#define MS_UNION 256 /* Merge namespace with FS mounted below */
#define MS_NOATIME 1024 /* Do not update access times. */
#define MS_NODIRATIME 2048 /* Do not update directory access ...I'm sorry I haven't responded sooner, I've been trying to write a detailed useful message and that turns out to be hard. I'll just include a few of the highlights; mainly I want to say that I'd rather do it the way you describe but when I tried it ended up even uglier than the VFS implementation. I went down this road initially (do most of the unioning in a file system) and spent a couple of months on it. But I always ended up having to do some level of copy-around and redirection similar to that in unionfs. One of the major difficulties that arises even when doing unioning at the VFS level is keeping around the parent's path in order to do the copyup later on. Take a look at the code pattern in the "union-mount: Implement union-aware syscall()" series of patches. That's the prettiest and most efficient version I could come up with, after two other implementations, and it's in the VFS, at the vfs_foo_syscall() level. I don't even know how I would start if I had to wait until the file system op is called. If you have some insights on how to do this, I'd love to hear them. I don't enjoy writing VFS code for the fun of it. :) Thanks, -VAL --
I agree that is prettiest, and copyup at open for write makes it easier. But some applications issue mmap(MAP_PRIVATE) after open(O_RDWR), for example modprobe(8). In this case, every kernel module will be copied-up and it must be a waste of time and space. And I guess this is one reason why other implementations took the approach of copyup at write. At the same time, I guess this issue may be less important since other parts are pretty enough. J. R. Okajima --
Sure. The short version is that unionfs has to allocate another copy of each file system structure - inode, etc. - and then keep an array of the matching structures from each of the file system layers. Each unionfs file system op copies data up and down between the unionfs structures and the underlying structures, and then calls the lower file system op as necessary. Often it has to duplicate code from the VFS before calling the lower file system ops. Where union mounts has the advantage is that we make zero copies of file system data structures and therefore don't need copyup or interposition on as many ops. But if you wait until the file system op is called, you have to attach your union-related data to the associated data structure, and the underlying file system is already using the private data pointer. And you have to keep a copy of the underlying file system ops. And each data structure can be part of multiple unions. So you end up with an effective second copy of the Unfortunately, dentries aren't unioned - paths (dentry/mnt pairs) are. So you can get the parent dentry in the file system op, but the dentry is potentially part of many different mounts. There's no mapping from a lower-level read-only dentry to the covering read-write parent dentry because the read-only dentry could potentially be mounted in 5 different places. Which union mount is this dentry part of? You have to record the parent's path during lookup and carry it around until you do the copyup - for every syscall that alters a file, not just open() and write(), but chmod(), etc. So if you implement it in the VFS, you don't have to carry that info across the file system op boundary. I think the chmod() case really shows the issues well. user_path_nd() records the parent's path during lookup (in an inefficient, possibly racy manner), then union_copyup() does the copy (too early, before a lot of permission checks). The underlying file system doesn't get involved until the ->setattr() call ...
Let's not over-generalize the problem. Current implementation has the following properties: a) one read-only layer and one read-write layer b) for each non-directory only one of the layers is relevant c) for directories both layers may be relevant Yep, and that can be fixed by adding better helpers to the VFS which do all the locking magic, etc, and are supplied with a "struct path" My proposal a few mails back would eliminate that as well. Imagine it like this: you have a filesystem full of symlinks. Special symlinks, in fact, which are always followed and so are invisible from userspace. What this means is that it's not necessary to interposition on any operation that: - doesn't modify the file and is not a lookup, or For the simplified case all we need in each union fs inode is a reference to a dentry in either the lower or upper layers, and in the rare case of unioned directory a reference to a dentry in both. And notice, that is exactly the same as what you have in union mounts. It shouldn't make too much difference if those refs are stored in a That's simply not true, we are unioning *filesystems* not paths. And that's true of union mounts as well. Unioning of complete namespaces is a completely different, and orders And that's irrelevant. The union filesystem can make a pair of private, kernel-only, mounts for the underlying filesystems and be If we keep to the simplified rules, then there's no need to map from a The same can be done with the union fs, except copy up is done by the union filesystem instead of the VFS. And that is _after_ the Not quite, allowing unprivileged users to trigger arbitrary copy-up is clearly a DoS. Thanks, Miklos --
I haven't looked at unionfs in a long time. Can you say something On a high level I don't see a problem, the parent of every dentry can be found through ->d_parent. One issue is having to duplicate some locking and other stuff around vfs_whatever() calls. But that could be fixed by exporting suitable helpers from the VFS. Other than that I don't see any fundamental issues with union filesystems (except that they seem to grow too many features to be maintainable). Thanks, Miklos --
From: Felix Fietkau <nbd@openwrt.org> Add support for fallthru dentries to jffs2. Cc: David Woodhouse <dwmw2@infradead.org> Cc: linux-mtd@lists.infradead.org Signed-off-by: Felix Fietkau <nbd@openwrt.org> Signed-off-by: Valerie Aurora <vaurora@redhat.com> --- fs/jffs2/dir.c | 36 +++++++++++++++++++++++++++++++++--- include/linux/jffs2.h | 6 ++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index c259193..98397b3 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -35,6 +35,7 @@ static int jffs2_rename (struct inode *, struct dentry *, struct inode *, struct dentry *); static int jffs2_whiteout (struct inode *, struct dentry *, struct dentry *); +static int jffs2_fallthru (struct inode *, struct dentry *); const struct file_operations jffs2_dir_operations = { @@ -59,6 +60,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rename = jffs2_rename, .check_acl = jffs2_check_acl, .whiteout = jffs2_whiteout, + .fallthru = jffs2_fallthru, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, @@ -103,10 +105,14 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, } if (fd) { spin_lock(&target->d_lock); - if (fd->type == DT_WHT) + switch (fd->type) { + case DT_WHT: target->d_flags |= DCACHE_WHITEOUT; - else + case JFFS2_DT_FALLTHRU: + target->d_flags |= DCACHE_FALLTHRU; + default: ino = fd->ino; + } spin_unlock(&target->d_lock); } mutex_unlock(&dir_f->sem); @@ -164,7 +170,10 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) fd->name, fd->ino, fd->type, curofs, offset)); continue; } - if (!fd->ino) { + if (fd->type == JFFS2_DT_FALLTHRU) + /* XXX Should really do a lookup for the real inode number here */ + fd->ino = 100; + else if (!fd->ino && (fd->type != DT_WHT)) { D2(printk(KERN_DEBUG "Skipping deletion ...
From: Felix Fietkau <nbd@openwrt.org> Add support for whiteout dentries to jffs2. Signed-off-by: Felix Fietkau <nbd@openwrt.org> Signed-off-by: Valerie Aurora <vaurora@redhat.com> Cc: David Woodhouse <dwmw2@infradead.org> Cc: linux-mtd@lists.infradead.org --- fs/jffs2/dir.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++- fs/jffs2/fs.c | 4 +++ fs/jffs2/super.c | 2 +- include/linux/jffs2.h | 2 + 4 files changed, 77 insertions(+), 3 deletions(-) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 7aa4417..c259193 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -34,6 +34,8 @@ static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t); static int jffs2_rename (struct inode *, struct dentry *, struct inode *, struct dentry *); +static int jffs2_whiteout (struct inode *, struct dentry *, struct dentry *); + const struct file_operations jffs2_dir_operations = { .read = generic_read_dir, @@ -56,6 +58,7 @@ const struct inode_operations jffs2_dir_inode_operations = .mknod = jffs2_mknod, .rename = jffs2_rename, .check_acl = jffs2_check_acl, + .whiteout = jffs2_whiteout, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, @@ -98,8 +101,14 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, fd = fd_list; } } - if (fd) - ino = fd->ino; + if (fd) { + spin_lock(&target->d_lock); + if (fd->type == DT_WHT) + target->d_flags |= DCACHE_WHITEOUT; + else + ino = fd->ino; + spin_unlock(&target->d_lock); + } mutex_unlock(&dir_f->sem); if (ino) { inode = jffs2_iget(dir_i->i_sb, ino); @@ -498,6 +507,11 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) return PTR_ERR(inode); } + if (dentry->d_flags & DCACHE_WHITEOUT) { + inode->i_flags |= S_OPAQUE; + ri->flags = cpu_to_je16(JFFS2_INO_FLAG_OPAQUE); + } + inode->i_op = &jffs2_dir_inode_operations; inode->i_fop = ...
Add support for fallthru directory entries to ext2. XXX - Makes up inode number for fallthru entry XXX - Might be better implemented as special symlinks Cc: Theodore Tso <tytso@mit.edu> Cc: linux-ext4@vger.kernel.org Signed-off-by: Valerie Aurora <vaurora@redhat.com> Signed-off-by: Jan Blunck <jblunck@suse.de> --- fs/ext2/dir.c | 92 ++++++++++++++++++++++++++++++++++++++++++++-- fs/ext2/ext2.h | 1 + fs/ext2/namei.c | 22 +++++++++++ include/linux/ext2_fs.h | 1 + 4 files changed, 112 insertions(+), 4 deletions(-) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 030bd46..f3b4aff 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -219,7 +219,8 @@ static inline int ext2_match (int len, const char * const name, { if (len != de->name_len) return 0; - if (!de->inode && (de->file_type != EXT2_FT_WHT)) + if (!de->inode && ((de->file_type != EXT2_FT_WHT) && + (de->file_type != EXT2_FT_FALLTHRU))) return 0; return !memcmp(name, de->name, len); } @@ -256,6 +257,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = { [EXT2_FT_SOCK] = DT_SOCK, [EXT2_FT_SYMLINK] = DT_LNK, [EXT2_FT_WHT] = DT_WHT, + [EXT2_FT_FALLTHRU] = DT_UNKNOWN, }; #define S_SHIFT 12 @@ -342,6 +344,24 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) ext2_put_page(page); return 0; } + } else if (de->file_type == EXT2_FT_FALLTHRU) { + int over; + unsigned char d_type = DT_UNKNOWN; + + offset = (char *)de - kaddr; + /* XXX We don't know the inode number + * of the directory entry in the + * underlying file system. Should + * look it up, either on fallthru + * creation at first readdir or now at + * filldir time. */ + over = filldir(dirent, de->name, de->name_len, + (n<<PAGE_CACHE_SHIFT) | offset, + 123 /* Made up ino */, d_type); + if (over) { + ext2_put_page(page); + return 0; + } } filp->f_pos += ...
I certainly asked whether you really need a real 'struct inode' for whiteouts, and suggested that they should be represented _purely_ as a dentry with type DT_WHT. I don't much like the manifestation of that in this patch though, especially with the made-up inode number. (ISTR I had other jffs2-specific objections too, which I'll dig out and forward). -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation --
Yes, these patches still have issues that Val and I are aware of. I can't remember anything jffs2-specific though. We return that inode number because we don't want to lookup the name on the other filesystem during readdir. Therefore returning DT_UNKNOWN to let the userspace decide if it needs to stat the file was the easiest workaround. I know that POSIX requires d_ino and d_name but on the other hand it does not require anything more on how long d_ino is valid. If somebody has an idea how to make this cleaner please speak up. Regards, Jan -- Jan Blunck <jblunck@suse.de> --
Hmm, why not. Or even the ino of the directory we are reading from ... Regards, Jan -- Jan Blunck <jblunck@suse.de> --
I don't recall there being any technical reason not to look up the real inode number. I just wrote it that way because I was lazy. So I like returning the directory's d_ino better than a single magic number, but I'd at least like to try returning the real inode number too. -VAL --
No, for stat() you do a lookup and that is returning the correct dentry/inode for the filesystem the name is on. We just return the fallthru directory entries to give userspace an offset that they can seekdir() to. Regards, Jan -- Jan Blunck <jblunck@suse.de> --
Hmm. I smell potential confusion for some otherwise POSIX-friendly userspaces. When I open /path/to/foo, call fstat (st_dev=2, st_ino=5678), and then keep the file open, then later do a readdir which includes foo (dir.st_dev=1, d_ino=1234), I'm going to immediately assume a rename or unlink happened, close the file, abort streaming from it, refresh the GUI windows, refresh application caches for that name entry, etc. Because in the POSIX world I think open files have stable inode numbers (as long as they are open), and I don't think that an open file can have its name's d_ino not match the inode number unless it's a mount point, which my program would know about. This plays into inotify, where you have to know if you are monitoring every directory that contains a link to a file, to know if you need to monitor the file itself directly instead. Now I think it's fair enough that a union mount doesn't play all the traditional rules :-) C'est la vie. This mismatch of (dir.st_dev,d_ino) and st_ino strongly resembles a file-bind-mount. Like bind mounts, it's quite annoying for programs that like to assume they've seen all of a file's links when they've seen i_nlink of them. Bind mounts can be detected by looking in /proc/mounts. st_dev changing doesn't work because it can be a binding of the same filesystem. How would I go about detecting when a union mount's directory entry has similar behaviour, without calling stat() on each entry? Is it just a matter of recognising a particular filesystem name in /proc/mounts, or something more? Thanks, -- Jamie --
Sorry, no: That does not work for bind mounts. Both layers can have the same st_dev. Nor does O_NOFOLLOW stop traversal in the middle of a path, there is no handy O_NOCROSSMOUNTS, and no st_mode flag or d_type to say it's a bind mount. Bind mounts are really a big pain for i_nlink+inotify name counting. Besides, calling stat() on every entry in a large directory to check st_ino can be orders of magnitude slower than readdir() on a large directory - especially with a cold cache. It is quicker, but much more complicated, to parse /proc/mounts and apply arcane rules to find the exceptions. I agree, and union mount is a very useful feature that's worth breaking a few apps for :-) I'm curious if there's a clear way to go about it in this case, or if it'll involve a certain amount of pattern recognition in /proc/mounts. Basically I'm wondering if it's been thought about already. -- Jamie --
I'm confused. You are monitoring a specific file and would like to know if something is happening to any of its links, right? Why do you need to know about bind mounts for that? Count the number of times you encounter that d_ino and if that matches i_nlink then every directory is monitored. Simple as that, no? Thanks, Miklos --
Not quite. I'm monitoring a million files (say), so I must use
directory watches for most of them. I need directory watches anyway,
when the semantic is "calling open on /path/to/file and reading would
return the same data", because renames and unlinks are also a way to
invalidate monitored file contents.
At a high level, what we're talking about is the ability to cache and
verify the validity of information derived from reading files in the
filesystem, in a manner which efficiently triggers invalidation only
on changes. Being able to answer, as quickly as possible, "if I read
this, that and other, will I get the same results as the last time I
did those operations, without having to actually do them to check".
When I see a file has i_nlink > 1, I must watch the file directly
using a file-watch (with inotify; polling with stat() with dnotify),
_unless_ I have seen all the links to that file.
When I've seen all the links to a file, I know that my directory
watches on the directories containing those links are sufficient to
detect changes to the file contents. That's because every
file change will get notified on at least one of those paths.
I learn that I've seen all the links by seeing d_ino during readdir as
you suggested, or by st_ino in the cases where I've not had reason to
readdir and I have needed to open the file or call stat.
Let's look at some bind mounts. One where st_ino doesn't work:
/dirA/file1 [hard link to inode 100, i_nlink = 2]
/dirA/bound [bind mount, has /dirA/file1 mounted on it]
/dirB/file2 [hard link to inode 100, i_nlink = 2]
If the program is asked to open /dirA/file1 and /dirA/bound at various
times, and never asked to readdir /dirA, it will have used fstat not
readdir, seen the same (st_dev,st_ino,i_nlink = 2), and _wrongly_
concluded that it is monitoring all directories containing paths to
the file.
To avoid that problem, it parses /proc/mounts and detects that
/dirA/bound does not contribute to the link count. ...I couldn't have put it better myself. To expand slightly, if the broken apps are not few and easily fixed, then we'll go back and make the kernel more complicated. I'd like to try the simplest version we think will work, first. Thanks! -VAL --
Don't worry, I'm not trying to deviate you from that good plan. Just throwing questions out to find what's a good and simple answer to these little open questions to minimise trouble. -- Jamie --
In addition to the inode number of fallthru/readdir, hardlinks in a union mount may be a problem. If you open a hardlinked file for writing or try to chmod it, the internal copyup will happen and the hardlink will be destroyed. For instance, when fileA and fileB are hardlinked on the lower layer, and the contents of fileA are modified (copyup happens), you will not see the latest contents via fileB. And the IN_CREATE event may be fired to the parent dir if you monitor it, I am afraid. (I have pointed out this issue before, but the posted document didn't seem to mention it) J. R. Okajima --
Define the fallthru dcache flag and file system op. Mask out the
DCACHE_FALLTHRU flag on dentry creation. Actual users and changes to
lookup come in later patches.
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
Documentation/filesystems/vfs.txt | 6 ++++++
fs/dcache.c | 2 +-
include/linux/dcache.h | 6 ++++++
include/linux/fs.h | 1 +
4 files changed, 14 insertions(+), 1 deletions(-)
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 8846b4f..29f3476 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -320,6 +320,7 @@ struct inode_operations {
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,int,dev_t);
int (*whiteout) (struct inode *, struct dentry *, struct dentry *);
+ int (*fallthru) (struct inode *, struct dentry *);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
int (*readlink) (struct dentry *, char __user *,int);
@@ -390,6 +391,11 @@ otherwise noted.
second is the dentry for the whiteout itself. This method
must unlink() or rmdir() the original entry if it exists.
+ fallthru: called by the readdir(2) system call on a layered file
+ system. Only required if you want to support fallthrus.
+ Fallthrus are place-holders for directory entries visible from
+ a lower level file system.
+
rename: called by the rename(2) system call to rename the object to
have the parent and name given by the second inode and dentry.
diff --git a/fs/dcache.c b/fs/dcache.c
index 3b0e525..b76f9e4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -993,7 +993,7 @@ EXPORT_SYMBOL(d_alloc_name);
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
if (inode) {
- dentry->d_flags &= ~DCACHE_WHITEOUT;
+ dentry->d_flags &= ~(DCACHE_WHITEOUT|DCACHE_FALLTHRU);
...This doesn't seem to have incorporated my feedback from the attached... -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation
Hm, I'm not sure whether I lost the patch in a rebase or didn't have time to test it or what. I was hoping someone who actually knows JFFS2 like Felix or you would get to it first - in general, I'd like the underlying file system maintainers to implement whiteouts and fallthrus since they know them best. Felix, if you implemented it and I lost the patch, my apologies to you. Thanks David, --
From: Jan Blunck <jblunck@suse.de> This patch adds whiteout support to EXT2. A whiteout is an empty directory entry (inode == 0) with the file type set to EXT2_FT_WHT. Therefore it allocates space in directories. Due to being implemented as a filetype it is necessary to have the EXT2_FEATURE_INCOMPAT_FILETYPE flag set. XXX - Whiteouts could be implemented as special symbolic links Signed-off-by: Jan Blunck <jblunck@suse.de> Signed-off-by: Valerie Aurora <vaurora@redhat.com> Cc: Theodore Tso <tytso@mit.edu> Cc: linux-ext4@vger.kernel.org --- fs/ext2/dir.c | 96 +++++++++++++++++++++++++++++++++++++++++++++-- fs/ext2/ext2.h | 3 + fs/ext2/inode.c | 11 ++++- fs/ext2/namei.c | 67 +++++++++++++++++++++++++++++++- fs/ext2/super.c | 6 +++ include/linux/ext2_fs.h | 4 ++ 6 files changed, 177 insertions(+), 10 deletions(-) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 57207a9..030bd46 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -219,7 +219,7 @@ static inline int ext2_match (int len, const char * const name, { if (len != de->name_len) return 0; - if (!de->inode) + if (!de->inode && (de->file_type != EXT2_FT_WHT)) return 0; return !memcmp(name, de->name, len); } @@ -255,6 +255,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = { [EXT2_FT_FIFO] = DT_FIFO, [EXT2_FT_SOCK] = DT_SOCK, [EXT2_FT_SYMLINK] = DT_LNK, + [EXT2_FT_WHT] = DT_WHT, }; #define S_SHIFT 12 @@ -448,6 +449,26 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child) return res; } +/* Special version for filetype based whiteout support */ +ino_t ext2_inode_by_dentry(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct ext2_dir_entry_2 *de; + struct page *page; + + de = ext2_find_entry (dir, &dentry->d_name, &page); + if (de) { + res = le32_to_cpu(de->inode); + if (!res && de->file_type == EXT2_FT_WHT) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= ...
From: Jan Blunck <jblunck@suse.de>
Whiteout a given directory entry. File systems that support whiteouts
must implement the new ->whiteout() directory inode operation.
Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
Documentation/filesystems/vfs.txt | 10 +++-
fs/dcache.c | 4 +-
fs/namei.c | 133 +++++++++++++++++++++++++++++++++++++
include/linux/dcache.h | 6 ++
include/linux/fs.h | 2 +
5 files changed, 153 insertions(+), 2 deletions(-)
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 3de2f32..8846b4f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -308,7 +308,7 @@ struct inode_operations
-----------------------
This describes how the VFS can manipulate an inode in your
-filesystem. As of kernel 2.6.22, the following members are defined:
+filesystem. As of kernel 2.6.33, the following members are defined:
struct inode_operations {
int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
@@ -319,6 +319,7 @@ struct inode_operations {
int (*mkdir) (struct inode *,struct dentry *,int);
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,int,dev_t);
+ int (*whiteout) (struct inode *, struct dentry *, struct dentry *);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
int (*readlink) (struct dentry *, char __user *,int);
@@ -382,6 +383,13 @@ otherwise noted.
will probably need to call d_instantiate() just as you would
in the create() method
+ whiteout: called by the rmdir(2) and unlink(2) system calls on a
+ layered file system. Only required if you want to support
+ whiteouts. The first dentry passed in is that for the old
+ dentry if it exists, and a negative ...From: Jan Blunck <jblunck@suse.de> Userspace isn't ready for handling another file type, so silently drop whiteout directory entries before they leave the kernel. Signed-off-by: Jan Blunck <jblunck@suse.de> Signed-off-by: David Woodhouse <dwmw2@infradead.org> Signed-off-by: Valerie Aurora <vaurora@redhat.com> Cc: linux-nfs@vger.kernel.org Cc: "J. Bruce Fields" <bfields@fieldses.org> Cc: Neil Brown <neilb@suse.de> --- fs/compat.c | 9 +++++++++ fs/nfsd/nfs3xdr.c | 5 +++++ fs/nfsd/nfs4xdr.c | 5 +++++ fs/nfsd/nfsxdr.c | 4 ++++ fs/readdir.c | 9 +++++++++ 5 files changed, 32 insertions(+), 0 deletions(-) diff --git a/fs/compat.c b/fs/compat.c index 00d90c2..624e1a5 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -838,6 +838,9 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen, struct compat_old_linux_dirent __user *dirent; compat_ulong_t d_ino; + if (d_type == DT_WHT) + return 0; + if (buf->result) return -EINVAL; d_ino = ino; @@ -909,6 +912,9 @@ static int compat_filldir(void *__buf, const char *name, int namlen, compat_ulong_t d_ino; int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t)); + if (d_type == DT_WHT) + return 0; + buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; @@ -998,6 +1004,9 @@ static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t int reclen = ALIGN(jj + namlen + 1, sizeof(u64)); u64 off; + if (d_type == DT_WHT) + return 0; + buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2a533a0..9b96f5a 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -885,6 +885,11 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, int elen; /* estimated entry length in words */ int num_entry_words = 0; /* actual number of words */ + if (d_type ...
Seems OK. (Though is there any way we could avoid having to add the check to every filldir callback? Isn't the default going to be disinterest in whiteouts? How are we avoiding all the same checks in the case of lookup?) --b. --
Bruce, the alternative would be to include the check in the fs readdir() implementation, and therefore prevent the call of the filler. I think this patch would be even bigger. --
From: Jan Blunck <jblunck@suse.de> (Resend due to kernel.org complaining about XXX in original subject) XXX - This is broken and included just to make union mounts work. See discussion at: http://kerneltrap.org/mailarchive/linux-fsdevel/2010/1/15/6708053/thread Original commit message: This is a bugfix/replacement for commit 051d381259eb57d6074d02a6ba6e90e744f1a29f: During a path walk if an autofs trigger is mounted on a dentry, when the follow_link method is called, the nameidata struct contains the vfsmount and mountpoint dentry of the parent mount while the dentry that is passed in is the root of the autofs trigger mount. I believe it is impossible to get the vfsmount of the trigger mount, within the follow_link method, when only the parent vfsmount and the root dentry of the trigger mount are known. The solution in this commit was to replace the path embedded in the parent's nameidata with the path of the link itself in __do_follow_link(). This is a relatively harmless misuse of the field, but union mounts ran into a bug during follow_link() caused by the nameidata containing the wrong path (we count on it being what it is all other places - the path of the parent). A cleaner and easier to understand solution is to save the necessary vfsmount in the autofs superblock info when it is mounted. Then we can easily update the vfsmount in autofs4_follow_link(). Signed-off-by: Jan Blunck <jblunck@suse.de> Signed-off-by: Valerie Aurora <vaurora@redhat.com> Acked-by: Ian Kent <raven@themaw.net> Cc: autofs@linux.kernel.org Cc: Alexander Viro <viro@zeniv.linux.org.uk> --- fs/autofs4/autofs_i.h | 1 + fs/autofs4/init.c | 11 ++++++++++- fs/autofs4/root.c | 6 ++++++ fs/namei.c | 7 ++----- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 3d283ab..de3af64 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -133,6 ...
From: Jan Blunck <jblunck@suse.de>
This patch changes lookup_hash() into returning a struct path.
Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
fs/namei.c | 113 ++++++++++++++++++++++++++++++-----------------------------
1 files changed, 57 insertions(+), 56 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 981f72c..9013d17 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1155,7 +1155,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
}
static struct dentry *__lookup_hash(struct qstr *name,
- struct dentry *base, struct nameidata *nd)
+ struct dentry *base, struct nameidata *nd)
{
struct dentry *dentry;
struct inode *inode;
@@ -1212,14 +1212,22 @@ out:
* needs parent already locked. Doesn't follow mounts.
* SMP-safe.
*/
-static struct dentry *lookup_hash(struct nameidata *nd)
+static int lookup_hash(struct nameidata *nd, struct qstr *name,
+ struct path *path)
{
int err;
err = exec_permission(nd->path.dentry->d_inode);
if (err)
- return ERR_PTR(err);
- return __lookup_hash(&nd->last, nd->path.dentry, nd);
+ return err;
+ path->mnt = nd->path.mnt;
+ path->dentry = __lookup_hash(name, nd->path.dentry, nd);
+ if (IS_ERR(path->dentry)) {
+ err = PTR_ERR(path->dentry);
+ path->dentry = NULL;
+ path->mnt = NULL;
+ }
+ return err;
}
static int __lookup_one_len(const char *name, struct qstr *this,
@@ -1700,12 +1708,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
/* OK, it's O_CREAT */
mutex_lock(&dir->d_inode->i_mutex);
+ error = lookup_hash(nd, &nd->last, path);
- path->dentry = lookup_hash(nd);
- path->mnt = nd->path.mnt;
-
- error = PTR_ERR(path->dentry);
- if (IS_ERR(path->dentry)) {
+ if (error) {
mutex_unlock(&dir->d_inode->i_mutex);
goto exit;
}
@@ -1954,7 +1959,8 @@ EXPORT_SYMBOL(filp_open);
*/
struct dentry ...Hi VAL, In the future, please make patches 1-N reply/refer to patch 0 instead of to the preceding numbered patch. thanks, --- ~Randy --
