Re: [PATCH 16/35] union-mount: Writable overlays/union mounts documentation

Previous thread: mmotm 2010-04-15-14-42 uploaded by akpm on Thursday, April 15, 2010 - 2:42 pm. (25 messages)

Next thread: lockdep warning on block tree for-2.6.35 branch by Gui Jianfeng on Thursday, April 15, 2010 - 6:40 pm. (3 messages)
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

This patch series implements union mounts for every system call
except:

- xattr-related calls
- fchmod()/fchown()/futimensat() on an fd

The copyup portion is a proof of concept based around user_path_nd(),
which looks up both the target <mnt,dentry> pair and its parent's.  If
the parent is on a union mount, and the target is on a lower layer, we
copy up the target.

This patch set is very raw and could use even the most superficial
review.  Just take a look at Erez's review from December to see how
low-hanging the fruit is on this one.

-VAL

Felix Fietkau (2):
  whiteout: jffs2 whiteout support
  fallthru: jffs2 fallthru support

Jan Blunck (13):
  VFS: Make lookup_hash() return a struct path
  XXX autofs4: Save autofs trigger's vfsmount in super block info
  whiteout/NFSD: Don't return information about whiteouts to userspace
  whiteout: Add vfs_whiteout() and whiteout inode operation
  whiteout: Set S_OPAQUE inode flag when creating directories
  whiteout: Allow removal of a directory with whiteouts
  whiteout: tmpfs whiteout support
  whiteout: Split of ext2_append_link() from ext2_add_link()
  whiteout: ext2 whiteout support
  union-mount: Introduce MNT_UNION and MS_UNION flags
  union-mount: Introduce union_mount structure and basic operations
  union-mount: Drive the union cache via dcache
  union-mount: Call do_whiteout() on unlink and rmdir in unions

Valerie Aurora (20):
  VFS: Add read-only users count to superblock
  fallthru: Basic fallthru definitions
  fallthru: ext2 fallthru support
  fallthru: tmpfs fallthru support
  union-mount: Writable overlays/union mounts documentation
  union-mount: Implement union lookup
  union-mount: Support for mounting union mount file systems
  union-mount: Copy up directory entries on first readdir()
  VFS: Split inode_permission() and create path_permission()
  VFS: Create user_path_nd() to lookup both parent and target
  union-mount: In-kernel copyup routines
  union-mount: Implement union-aware ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

While we can check if a file system is currently read-only, we can't
guarantee that it will stay read-only.  The file system can be
remounted read-write at any time; it's also conceivable that a file
system can be mounted a second time and converted to read-write if the
underlying fs allows it.  This is a problem for union mounts, which
require the underlying file system be read-only.  Add a read-only
users count and, while there are any read-only users, refuse both
remounts that would change the file system to read-write and new
read-write mounts of it.

Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c     |   11 +++++++++++
 fs/super.c         |   23 +++++++++++++++++++++++
 include/linux/fs.h |    8 ++++++++
 3 files changed, 42 insertions(+), 0 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 1cd59a0..9a40282 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -200,6 +200,17 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
+static void inc_hard_readonly_users(struct vfsmount *mnt)
+{
+	mnt->mnt_sb->s_hard_readonly_users++;
+}
+
+static void dec_hard_readonly_users(struct vfsmount *mnt)
+{
+	BUG_ON(mnt->mnt_sb->s_hard_readonly_users == 0);
+	mnt->mnt_sb->s_hard_readonly_users--;
+}
+
 static inline void inc_mnt_writers(struct vfsmount *mnt)
 {
 #ifdef CONFIG_SMP
diff --git a/fs/super.c b/fs/super.c
index f35ac60..fa9b40b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -117,6 +117,7 @@ out:
  */
 static inline void destroy_super(struct super_block *s)
 {
+	BUG_ON(s->s_hard_readonly_users);
 	security_sb_free(s);
 	kfree(s->s_subtype);
 	kfree(s->s_options);
@@ -556,6 +557,21 @@ out:
 	return err;
 }
 
+/*
+ * Some uses of file systems require that they never be mounted
+ * read-write anywhere (e.g., the lower layers of union mounts must
+ * always be read-only).  If there are any of these "hard" read-only
+ * mounts, don't permit a transition to ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

In the case of a union directory, we don't want the directories on lower
layers of the union to "show through". To prevent the contents of
underlying directories from magically showing up after a mkdir(), we set the
S_OPAQUE flag if a directory is created where a whiteout existed before.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 fs/namei.c         |   11 ++++++++++-
 include/linux/fs.h |    3 +++
 2 files changed, 13 insertions(+), 1 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 010927b..956083a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2104,6 +2104,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int error = may_create(dir, dentry);
+	int opaque = 0;
 
 	if (error)
 		return error;
@@ -2116,9 +2117,17 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		return error;
 
+	if (d_is_whiteout(dentry))
+		opaque = 1;
+
 	error = dir->i_op->mkdir(dir, dentry, mode);
-	if (!error)
+	if (!error) {
 		fsnotify_mkdir(dir, dentry);
+		if (opaque) {
+			dentry->d_inode->i_flags |= S_OPAQUE;
+			mark_inode_dirty(dentry->d_inode);
+		}
+	}
 	return error;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 21102f9..a9f747c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -236,6 +236,7 @@ struct inodes_stat_t {
 #define S_NOCMTIME	128	/* Do not update file c/mtime */
 #define S_SWAPFILE	256	/* Do not truncate: swapon got its bmaps */
 #define S_PRIVATE	512	/* Inode is fs-internal */
+#define S_OPAQUE	1024	/* Directory is opaque */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -271,6 +272,8 @@ struct inodes_stat_t {
 #define IS_SWAPFILE(inode)	((inode)->i_flags & S_SWAPFILE)
 #define IS_PRIVATE(inode)	((inode)->i_flags & S_PRIVATE)
 
+#define ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

do_whiteout() allows removal of a directory when it has whiteouts but
is logically empty.

XXX - This patch abuses readdir() to check if the union directory is
logically empty - that is, all the entries are whiteouts (or "." or
"..").  Currently, we have no clean VFS interface to ask the lower
file system if a directory is empty.

Possible fixes:
 - Add ->is_directory_empty() op
 - Add is_directory_empty flag to dentry (ugly dcache populate)
 - Ask underlying fs to remove it and look for an error return
 - (your idea here)

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 fs/namei.c |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 88 insertions(+), 0 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 956083a..991767b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2307,6 +2307,94 @@ int path_whiteout(struct path *dir_path, struct dentry *dentry, int isdir)
 EXPORT_SYMBOL(path_whiteout);
 
 /*
+ * XXX - We are abusing readdir to check if a union directory is
+ * logically empty.
+ */
+static int filldir_is_empty(void *__buf, const char *name, int namlen,
+			    loff_t offset, u64 ino, unsigned int d_type)
+{
+	int *is_empty = (int *)__buf;
+
+	switch (namlen) {
+	case 2:
+		if (name[1] != '.')
+			break;
+	case 1:
+		if (name[0] != '.')
+			break;
+		return 0;
+	}
+
+	if (d_type == DT_WHT)
+		return 0;
+
+	(*is_empty) = 0;
+	return 0;
+}
+
+static int directory_is_empty(struct dentry *dentry, struct vfsmount *mnt)
+{
+	struct file *file;
+	int err;
+	int is_empty = 1;
+
+	BUG_ON(!S_ISDIR(dentry->d_inode->i_mode));
+
+	/* references for the file pointer */
+	dget(dentry);
+	mntget(mnt);
+
+	file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
+	if (IS_ERR(file))
+		return 0;
+
+	err = vfs_readdir(file, filldir_is_empty, &is_empty);
+
+	fput(file);
+	return is_empty;
+}
+
+static int do_whiteout(struct nameidata ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

Add support for whiteout dentries to tmpfs.  This includes adding
support for whiteouts to d_genocide(), which is called to tear down
pinned tmpfs dentries.  Whiteouts have to be persistent, so they have
a pinning extra ref count that needs to be dropped by d_genocide().

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: linux-mm@kvack.org
---
 fs/dcache.c |   13 +++++-
 mm/shmem.c  |  149 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 147 insertions(+), 15 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 265015d..3b0e525 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2229,7 +2229,18 @@ resume:
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
-		if (d_unhashed(dentry)||!dentry->d_inode)
+		/*
+		 * Skip unhashed and negative dentries, but process
+		 * positive dentries and whiteouts.  A whiteout looks
+		 * kind of like a negative dentry for purposes of
+		 * lookup, but it has an extra pinning ref count
+		 * because it can't be evicted like a negative dentry
+		 * can.  What we care about here is ref counts - and
+		 * we need to drop the ref count on a whiteout before
+		 * we can evict it.
+		 */
+		if (d_unhashed(dentry)||(!dentry->d_inode &&
+					 !d_is_whiteout(dentry)))
 			continue;
 		if (!list_empty(&dentry->d_subdirs)) {
 			this_parent = dentry;
diff --git a/mm/shmem.c b/mm/shmem.c
index eef4ebe..c58ecf4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1805,6 +1805,76 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static int shmem_rmdir(struct inode *dir, struct dentry *dentry);
+static int shmem_unlink(struct inode *dir, struct dentry *dentry);
+
+/*
+ * This is the whiteout support for tmpfs. It uses one ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

ext2_append_entry() is later used to find or append a directory
entry to white out.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext2/dir.c |   70 ++++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7516957..57207a9 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -472,9 +472,10 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 }
 
 /*
- *	Parent is locked.
+ * Find or append a given dentry to the parent directory
  */
-int ext2_add_link (struct dentry *dentry, struct inode *inode)
+static ext2_dirent * ext2_append_entry(struct dentry * dentry,
+				       struct page ** page)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	const char *name = dentry->d_name.name;
@@ -482,13 +483,10 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 	unsigned chunk_size = ext2_chunk_size(dir);
 	unsigned reclen = EXT2_DIR_REC_LEN(namelen);
 	unsigned short rec_len, name_len;
-	struct page *page = NULL;
-	ext2_dirent * de;
+	ext2_dirent * de = NULL;
 	unsigned long npages = dir_pages(dir);
 	unsigned long n;
 	char *kaddr;
-	loff_t pos;
-	int err;
 
 	/*
 	 * We take care of directory expansion in the same loop.
@@ -498,20 +496,19 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 	for (n = 0; n <= npages; n++) {
 		char *dir_end;
 
-		page = ext2_get_page(dir, n, 0);
-		err = PTR_ERR(page);
-		if (IS_ERR(page))
+		*page = ext2_get_page(dir, n, 0);
+		de = ERR_PTR(PTR_ERR(*page));
+		if (IS_ERR(*page))
 			goto out;
-		lock_page(page);
-		kaddr = page_address(page);
+		lock_page(*page);
+		kaddr = page_address(*page);
 		dir_end = kaddr + ext2_last_byte(dir, n);
 		de = (ext2_dirent *)kaddr;
 		kaddr += PAGE_CACHE_SIZE - reclen;
 		while ((char ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Add support for fallthru directory entries to tmpfs

XXX - Makes up inode number for dirent

Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 fs/dcache.c |    3 +-
 fs/libfs.c  |   21 +++++++++++++++++--
 mm/shmem.c  |   60 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index b76f9e4..1575af4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2240,7 +2240,8 @@ resume:
 		 * we can evict it.
 		 */
 		if (d_unhashed(dentry)||(!dentry->d_inode &&
-					 !d_is_whiteout(dentry)))
+					 !d_is_whiteout(dentry) &&
+					 !d_is_fallthru(dentry)))
 			continue;
 		if (!list_empty(&dentry->d_subdirs)) {
 			this_parent = dentry;
diff --git a/fs/libfs.c b/fs/libfs.c
index 9e50bcf..cb24772 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -133,6 +133,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	struct dentry *cursor = filp->private_data;
 	struct list_head *p, *q = &cursor->d_u.d_child;
 	ino_t ino;
+	int d_type;
 	int i = filp->f_pos;
 
 	switch (i) {
@@ -158,14 +159,28 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
 				struct dentry *next;
 				next = list_entry(p, struct dentry, d_u.d_child);
-				if (d_unhashed(next) || !next->d_inode)
+				if (d_unhashed(next) || (!next->d_inode && !d_is_fallthru(next)))
 					continue;
 
+				if (d_is_fallthru(next)) {
+					/* XXX We don't know the inode
+					 * number of the directory
+					 * entry in the underlying
+					 * file system.  Should look
+					 * it up, either on fallthru
+					 * creation at first readdir
+					 * or now at filldir time. */
+					ino = 123; /* Made up ino */
+					d_type = DT_UNKNOWN;
+				} else {
+					ino = next->d_inode->i_ino;
+					d_type = dt_type(next->d_inode);
+				}
+
 				spin_unlock(&dcache_lock);
 				if (filldir(dirent, next->d_name.name, 
 					    ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Document design and implementation of writable overlays (a.k.a. union
mounts).

XXX - out of date

Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 Documentation/filesystems/union-mounts.txt |  708 ++++++++++++++++++++++++++++
 1 files changed, 708 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/filesystems/union-mounts.txt

diff --git a/Documentation/filesystems/union-mounts.txt b/Documentation/filesystems/union-mounts.txt
new file mode 100644
index 0000000..5f47296
--- /dev/null
+++ b/Documentation/filesystems/union-mounts.txt
@@ -0,0 +1,708 @@
+State of writable overlays (formerly union mounts)
+==================================================
+
+This version of union mounts is renamed "writable overlays."  The goal
+of this patch set is to support a single read-write file system
+overlaid on a single read-only file system.  "Union mounts" suggests
+that we support unions of arbitrary numbers and types of file systems,
+which is not the goal of this patch set.
+
+The most recent version of writable overlays can boot to multi-user
+mode with a writable overlay root file system.  open(), truncate(),
+creat(), unlink(), mkdir(), rmdir(), and rename() work.  link(),
+chmod(), chown(), and chattr() don't work yet.
+
+This document describes the architecture and current status of
+writable overlays, including an item-by-item todo list.
+
+Writable overlays (formerly union mounts)
+=========================================
+
+In this document:
+ - Overview of writable overlays
+ - Terminology
+ - VFS implementation
+ - Locking strategy
+ - VFS/file system interface
+ - Userland interface
+ - NFS interaction
+ - Status
+ - Contributing to writable overlays
+
+Overview
+========
+
+Writable overlays (formerly known as union mounts) are used to layer a
+single writable file system over a single read-only file system, with
+all writes going to the writable file system.  The namespace of both
+file systems appears as a combined whole to ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

If a dentry is removed from the dentry cache because its usage count drops
to zero, the references to the underlying layer of the unions the dentry is
in are dropped too. Therefore the union cache is driven by the dentry cache.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 fs/dcache.c            |   13 +++++++++++
 fs/union.c             |   56 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dcache.h |    8 ++++++
 include/linux/union.h  |    4 +++
 4 files changed, 81 insertions(+), 0 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 05c3a1e..983a1ea 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -18,6 +18,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/union.h>
 #include <linux/fsnotify.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -175,6 +176,8 @@ static struct dentry *d_kill(struct dentry *dentry)
 	dentry_stat.nr_dentry--;	/* For d_free, below */
 	/*drops the locks, at that point nobody can reach this dentry */
 	dentry_iput(dentry);
+	/* If the dentry was in an union delete them */
+	shrink_d_unions(dentry);
 	if (IS_ROOT(dentry))
 		parent = NULL;
 	else
@@ -696,6 +699,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 					iput(inode);
 			}
 
+			shrink_d_unions(dentry);
 			d_free(dentry);
 
 			/* finished when we fall off the top of the tree,
@@ -1535,7 +1539,9 @@ void d_delete(struct dentry * dentry)
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
 	if (atomic_read(&dentry->d_count) == 1) {
+		__d_drop_unions(dentry);
 		dentry_iput(dentry);
+		shrink_d_unions(dentry);
 		fsnotify_nameremove(dentry, isdir);
 		return;
 	}
@@ -1546,6 +1552,13 @@ void d_delete(struct dentry * dentry)
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 
+	/*
+	 * Remove any associated unions.  While someone still has this
+	 * ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Create and tear down union mount structures on mount.  Check
requirements for union mounts.

Thanks to Felix Fietkau <nbd@openwrt.org> for a bug fix.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 fs/namespace.c        |  130 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/union.c            |   63 ++++++++++++++++++++++++
 include/linux/union.h |    4 ++
 3 files changed, 196 insertions(+), 1 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 5e4b27b..e19a432 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,7 @@
 #include <linux/log2.h>
 #include <linux/idr.h>
 #include <linux/fs_struct.h>
+#include <linux/union.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
@@ -157,6 +158,9 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 #else
 		mnt->mnt_writers = 0;
 #endif
+#ifdef CONFIG_UNION_MOUNT
+		INIT_LIST_HEAD(&mnt->mnt_unions);
+#endif
 	}
 	return mnt;
 
@@ -492,6 +496,7 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 
 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 {
+	detach_mnt_union(mnt);
 	old_path->dentry = mnt->mnt_mountpoint;
 	old_path->mnt = mnt->mnt_parent;
 	mnt->mnt_parent = mnt;
@@ -515,6 +520,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
 	list_add_tail(&mnt->mnt_hash, mount_hashtable +
 			hash(path->mnt, path->dentry));
 	list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts);
+	attach_mnt_union(mnt, path->mnt);
 }
 
 /*
@@ -537,6 +543,7 @@ static void commit_tree(struct vfsmount *mnt)
 	list_add_tail(&mnt->mnt_hash, mount_hashtable +
 				hash(parent, mnt->mnt_mountpoint));
 	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	attach_mnt_union(mnt, parent);
 	touch_mnt_namespace(n);
 }
 
@@ -1025,6 +1032,7 @@ void release_mounts(struct list_head *head)
 			struct dentry *dentry;
 			struct vfsmount *m;
 ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

Call do_whiteout() when removing files and directories from a union
mounted file system.

Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 fs/namei.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index b179062..900df0f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2670,6 +2670,10 @@ static long do_rmdir(int dfd, const char __user *pathname)
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto exit3;
+	if (IS_UNIONED_DIR(&nd.path)) {
+		error = do_whiteout(&nd, &path, 1);
+		goto exit4;
+	}
 	error = security_path_rmdir(&nd.path, path.dentry);
 	if (error)
 		goto exit4;
@@ -2759,6 +2763,10 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 		error = mnt_want_write(nd.path.mnt);
 		if (error)
 			goto exit2;
+		if (IS_UNIONED_DIR(&nd.path)) {
+			error = do_whiteout(&nd, &path, 0);
+			goto exit3;
+		}
 		error = security_path_unlink(&nd.path, path.dentry);
 		if (error)
 			goto exit3;
-- 
1.6.3.3

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

For union mounts, a file located on the lower layer will incorrectly
return EROFS on an access check.  To fix this, use the new
path_permission() call, which ignores a read-only lower layer file
system if the target will be copied up to the topmost file system.
---
 fs/open.c |   20 ++++++++++++++++----
 1 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index e17f544..686fcd2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -454,7 +454,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 	const struct cred *old_cred;
 	struct cred *override_cred;
 	struct path path;
+	struct nameidata nd;
+	struct vfsmount *mnt;
 	struct inode *inode;
+	char *tmp;
 	int res;
 
 	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
@@ -478,10 +481,17 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 
 	old_cred = override_creds(override_cred);
 
-	res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+	res = user_path_nd(dfd, filename, LOOKUP_FOLLOW,
+				   &nd, &path, &tmp);
 	if (res)
 		goto out;
 
+	/* For union mounts, use the topmost mnt's permissions */
+	if (IS_UNIONED_DIR(&nd.path))
+		mnt = nd.path.mnt;
+	else
+		mnt = path.mnt;
+
 	inode = path.dentry->d_inode;
 
 	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
@@ -490,11 +500,11 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 		 * with the "noexec" flag.
 		 */
 		res = -EACCES;
-		if (path.mnt->mnt_flags & MNT_NOEXEC)
+		if (mnt->mnt_flags & MNT_NOEXEC)
 			goto out_path_release;
 	}
 
-	res = inode_permission(inode, mode | MAY_ACCESS);
+	res = path_permission(&path, &nd.path, mode | MAY_ACCESS);
 	/* SuS v2 requires we report a read only fs too */
 	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
 		goto out_path_release;
@@ -508,11 +518,13 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 	 * inherently racy and know ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Copy up a file when opened with write permissions.  Does not copy up
the file data when O_TRUNC is specified.
---
 fs/namei.c |   28 ++++++++++++++++++++++++++++
 1 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index a6f7d5d..85a5451 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1929,6 +1929,24 @@ exit:
 	return ERR_PTR(error);
 }
 
+static int open_union_copyup(struct nameidata *nd, struct path *path,
+			     int open_flag)
+{
+	struct vfsmount *oldmnt = path->mnt;
+	int error;
+
+	if (open_flag & O_TRUNC)
+		error = union_copyup_len(nd, path, 0);
+	else
+		error = union_copyup(nd, path);
+	if (error)
+		return error;
+	if (oldmnt != path->mnt)
+		mntput(nd->path.mnt);
+
+	return error;
+}
+
 static struct file *do_last(struct nameidata *nd, struct path *path,
 			    int open_flag, int acc_mode,
 			    int mode, const char *pathname,
@@ -1979,6 +1997,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		error = -ENOTDIR;
 		if (*want_dir && !path->dentry->d_inode->i_op->lookup)
 			goto exit_dput;
+		if (acc_mode & MAY_WRITE) {
+			error = open_union_copyup(nd, path, open_flag);
+			if (error)
+				goto exit_dput;
+		}
 		path_to_nameidata(path, nd);
 		audit_inode(pathname, nd->path.dentry);
 		goto ok;
@@ -2050,6 +2073,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (path->dentry->d_inode->i_op->follow_link)
 		return NULL;
 
+	if (acc_mode & MAY_WRITE) {
+		error = open_union_copyup(nd, path, open_flag);
+		if (error)
+			goto exit_dput;
+	}
 	path_to_nameidata(path, nd);
 	error = -EISDIR;
 	if (S_ISDIR(path->dentry->d_inode->i_mode))
-- 
1.6.3.3

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

XXX - doesn't implement NOFOLLOW correctly
---
 fs/utimes.c |   13 +++++++++++--
 1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db..82feca2 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -8,6 +8,7 @@
 #include <linux/stat.h>
 #include <linux/utime.h>
 #include <linux/syscalls.h>
+#include <linux/union.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
@@ -152,18 +153,26 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
 		error = utimes_common(&file->f_path, times);
 		fput(file);
 	} else {
+		struct nameidata nd;
+		char *tmp;
 		struct path path;
 		int lookup_flags = 0;
 
 		if (!(flags & AT_SYMLINK_NOFOLLOW))
 			lookup_flags |= LOOKUP_FOLLOW;
 
-		error = user_path_at(dfd, filename, lookup_flags, &path);
+		error = user_path_nd(dfd, filename, lookup_flags, &nd, &path,
+				     &tmp);
 		if (error)
 			goto out;
 
-		error = utimes_common(&path, times);
+		error = union_copyup(&nd, &path);
+
+		if (!error)
+			error = utimes_common(&path, times);
 		path_put(&path);
+		path_put(&nd.path);
+		putname(tmp);
 	}
 
 out:
-- 
1.6.3.3

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

---
 fs/open.c |   23 ++++++++++++++++++++---
 1 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index 6ec99e9..dc65b27 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -812,18 +812,35 @@ out:
 SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
 {
 	struct path path;
+	struct nameidata nd;
+	struct vfsmount *mnt;
+	char *tmp;
 	int error;
 
-	error = user_lpath(filename, &path);
+	error = user_path_nd(AT_FDCWD, filename, 0, &nd, &path, &tmp);
 	if (error)
 		goto out;
-	error = mnt_want_write(path.mnt);
+
+	if (IS_UNIONED_DIR(&nd.path))
+		mnt = nd.path.mnt;
+	else
+		mnt = path.mnt;
+
+	error = mnt_want_write(mnt);
 	if (error)
 		goto out_release;
+
+	error = union_copyup(&nd, &path);
+	if (error)
+		goto out_drop_write;
+
 	error = chown_common(&path, user, group);
-	mnt_drop_write(path.mnt);
+out_drop_write:
+	mnt_drop_write(mnt);
 out_release:
 	path_put(&path);
+	path_put(&nd.path);
+	putname(tmp);
 out:
 	return error;
 }
-- 
1.6.3.3

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

---
 fs/open.c |   25 +++++++++++++++++++++----
 1 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index dda1b6f..6ec99e9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -669,18 +669,32 @@ out:
 SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
 {
 	struct path path;
+	struct nameidata nd;
+	struct vfsmount *mnt;
 	struct inode *inode;
+	char *tmp;
 	int error;
 	struct iattr newattrs;
 
-	error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+	error = user_path_nd(dfd, filename, LOOKUP_FOLLOW, &nd,
+				     &path, &tmp);
 	if (error)
 		goto out;
-	inode = path.dentry->d_inode;
 
-	error = mnt_want_write(path.mnt);
+	if (IS_UNIONED_DIR(&nd.path))
+		mnt = nd.path.mnt;
+	else
+		mnt = path.mnt;
+
+	error = mnt_want_write(mnt);
 	if (error)
 		goto dput_and_out;
+
+	error = union_copyup(&nd, &path);
+	if (error)
+		goto mnt_drop_write_and_out;
+
+	inode = path.dentry->d_inode;
 	mutex_lock(&inode->i_mutex);
 	error = security_path_chmod(path.dentry, path.mnt, mode);
 	if (error)
@@ -692,9 +706,12 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
 	error = notify_change(path.dentry, &newattrs);
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
-	mnt_drop_write(path.mnt);
+mnt_drop_write_and_out:
+	mnt_drop_write(mnt);
 dput_and_out:
 	path_put(&path);
+	path_put(&nd.path);
+	putname(tmp);
 out:
 	return error;
 }
-- 
1.6.3.3

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

---
 fs/open.c |   24 ++++++++++++++++++++----
 1 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index 325852d..dda1b6f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -230,14 +230,17 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 static long do_sys_truncate(const char __user *pathname, loff_t length)
 {
 	struct path path;
+	struct nameidata nd;
+	struct vfsmount *mnt;
 	struct inode *inode;
+	char *tmp;
 	int error;
 
 	error = -EINVAL;
 	if (length < 0)	/* sorry, but loff_t says... */
 		goto out;
 
-	error = user_path(pathname, &path);
+	error = user_path_nd(AT_FDCWD, pathname, 0, &nd, &path, &tmp);
 	if (error)
 		goto out;
 	inode = path.dentry->d_inode;
@@ -251,11 +254,16 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		goto dput_and_out;
 
-	error = mnt_want_write(path.mnt);
+	if (IS_UNIONED_DIR(&nd.path))
+		mnt = nd.path.mnt;
+	else
+		mnt = path.mnt;
+
+	error = mnt_want_write(mnt);
 	if (error)
 		goto dput_and_out;
 
-	error = inode_permission(inode, MAY_WRITE);
+	error = path_permission(&path, &nd.path, MAY_WRITE);
 	if (error)
 		goto mnt_drop_write_and_out;
 
@@ -263,6 +271,12 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 	if (IS_APPEND(inode))
 		goto mnt_drop_write_and_out;
 
+	error = union_copyup_len(&nd, &path, length);
+	if (error)
+		goto mnt_drop_write_and_out;
+
+	/* path may have changed after copyup */
+	inode = path.dentry->d_inode;
 	error = get_write_access(inode);
 	if (error)
 		goto mnt_drop_write_and_out;
@@ -284,9 +298,11 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 put_write_and_out:
 	put_write_access(inode);
 mnt_drop_write_and_out:
-	mnt_drop_write(path.mnt);
+	mnt_drop_write(mnt);
 dput_and_out:
 	path_put(&path);
+	path_put(&nd.path);
+	putname(tmp);
 out:
 	return error;
 }
-- 
1.6.3.3

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Proof-of-concept implementation of chown() for union mounts.
---
 fs/open.c |   24 +++++++++++++++++++++---
 1 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index 686fcd2..325852d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -30,6 +30,7 @@
 #include <linux/falloc.h>
 #include <linux/fs_struct.h>
 #include <linux/ima.h>
+#include <linux/union.h>
 
 #include "internal.h"
 
@@ -717,18 +718,35 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
 SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 {
 	struct path path;
+	struct nameidata nd;
+	struct vfsmount *mnt;
+	char *tmp;
 	int error;
 
-	error = user_path(filename, &path);
+	error = user_path_nd(AT_FDCWD, filename, LOOKUP_FOLLOW,
+				     &nd, &path, &tmp);
 	if (error)
 		goto out;
-	error = mnt_want_write(path.mnt);
+
+	if (IS_UNIONED_DIR(&nd.path))
+		mnt = nd.path.mnt;
+	else
+		mnt = path.mnt;
+
+	error = mnt_want_write(mnt);
 	if (error)
 		goto out_release;
+
+	error = union_copyup(&nd, &path);
+	if (error)
+		goto out_drop_write;
 	error = chown_common(&path, user, group);
-	mnt_drop_write(path.mnt);
+out_drop_write:
+	mnt_drop_write(mnt);
 out_release:
 	path_put(&path);
+	path_put(&nd.path);
+	putname(tmp);
 out:
 	return error;
 }
-- 
1.6.3.3

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

On rename() of a file on a union mount, copyup and whiteout the source
file.  Both are done under the rename mutex.  I believe this is
actually atomic.

XXX - May not need to do file copyup under the lock.
---
 fs/namei.c |   75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 5f6dcd4..a6f7d5d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3233,6 +3233,7 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 {
 	struct dentry *old_dir, *new_dir;
 	struct path old, new;
+	struct path to_whiteout = {NULL, NULL};
 	struct dentry *trap;
 	struct nameidata oldnd, newnd;
 	char *from;
@@ -3248,12 +3249,9 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 		goto exit1;
 
 	error = -EXDEV;
+	/* Union mounts will pass below test - dirs always on topmost */
 	if (oldnd.path.mnt != newnd.path.mnt)
 		goto exit2;
-	/* Rename on union mounts not implemented yet */
-	/* XXX much harsher check than necessary - can do some renames */
-	if (IS_UNIONED_DIR(&oldnd.path) || IS_UNIONED_DIR(&newnd.path))
-		goto exit2;
 	old_dir = oldnd.path.dentry;
 	error = -EBUSY;
 	if (oldnd.last_type != LAST_NORM)
@@ -3276,7 +3274,7 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 	error = -ENOENT;
 	if (!old.dentry->d_inode)
 		goto exit4;
-	/* unless the source is a directory trailing slashes give -ENOTDIR */
+	/* unless the source is a directory, trailing slashes give -ENOTDIR */
 	if (!S_ISDIR(old.dentry->d_inode->i_mode)) {
 		error = -ENOTDIR;
 		if (oldnd.last.name[oldnd.last.len])
@@ -3288,6 +3286,11 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 	error = -EINVAL;
 	if (old.dentry == trap)
 		goto exit4;
+	error = -EXDEV;
+	/* Can't rename a directory from a lower layer */
+	if (IS_UNIONED_DIR(&oldnd.path) &&
+	    IS_UNIONED_DIR(&old))
+		goto exit4;
 	error = lookup_hash(&newnd, &newnd.last, ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

---
 fs/namei.c |   24 ++++++++++++++++++++----
 1 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 68aa8ab..5f6dcd4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3019,16 +3019,18 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 {
 	struct dentry *new_dentry;
 	struct nameidata nd;
+	struct nameidata old_nd;
 	struct path old_path;
 	int error;
 	char *to;
+	char *from;
 
 	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
 		return -EINVAL;
 
-	error = user_path_at(olddfd, oldname,
+	error = user_path_nd(olddfd, oldname,
 			     flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
-			     &old_path);
+			     &old_nd, &old_path, &from);
 	if (error)
 		return error;
 
@@ -3036,8 +3038,20 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	if (error)
 		goto out;
 	error = -EXDEV;
-	if (old_path.mnt != nd.path.mnt)
-		goto out_release;
+	if (old_path.mnt != nd.path.mnt) {
+		if (IS_UNIONED_DIR(&old_nd.path) &&
+		    (old_nd.path.mnt == nd.path.mnt)) {
+			error = mnt_want_write(old_nd.path.mnt);
+			if (error)
+				goto out_release;
+			error = union_copyup(&old_nd, &old_path);
+			mnt_drop_write(old_nd.path.mnt);
+			if (error)
+				goto out_release;
+		} else {
+			goto out_release;
+		}
+	}
 	new_dentry = lookup_create(&nd, 0);
 	error = PTR_ERR(new_dentry);
 	if (IS_ERR(new_dentry))
@@ -3060,6 +3074,8 @@ out_release:
 	putname(to);
 out:
 	path_put(&old_path);
+	path_put(&old_nd.path);
+	putname(from);
 
 	return error;
 }
-- 
1.6.3.3

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

When a file on the read-only layer of a union mount is altered, it
must be copied up to the topmost read-write layer.  This patch creates
union_copyup() and its supporting routines.
---
 fs/union.c            |  246 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/union.h |    7 +-
 2 files changed, 252 insertions(+), 1 deletions(-)

diff --git a/fs/union.c b/fs/union.c
index e2384ad..944c720 100644
--- a/fs/union.c
+++ b/fs/union.c
@@ -26,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/file.h>
 #include <linux/security.h>
+#include <linux/splice.h>
 
 /*
  * This is borrowed from fs/inode.c. The hashtable for lookups. Somebody
@@ -633,3 +634,248 @@ out_fput:
 	mnt_drop_write(topmost_path->mnt);
 	return res;
 }
+
+/**
+ * union_create_file
+ *
+ * @nd: nameidata for source file
+ * @old: path of the source file
+ * @new: path of the new file, negative dentry
+ *
+ * Must already have mnt_want_write() on the mnt and the parent's
+ * i_mutex.
+ */
+
+static int union_create_file(struct nameidata *nd, struct path *old,
+			     struct dentry *new)
+{
+	struct path *parent = &nd->path;
+	BUG_ON(!mutex_is_locked(&parent->dentry->d_inode->i_mutex));
+
+	return vfs_create(parent->dentry->d_inode, new,
+			  old->dentry->d_inode->i_mode, nd);
+}
+
+/**
+ * union_create_symlink
+ *
+ * @nd: nameidata for source symlink
+ * @old: path of the source symlink
+ * @new: path of the new symlink, negative dentry
+ *
+ * Must already have mnt_want_write() on the mnt and the parent's
+ * i_mutex.
+ */
+
+static int union_create_symlink(struct nameidata *nd, struct path *old,
+				struct dentry *new)
+{
+	void *cookie;
+	int error;
+
+	BUG_ON(!mutex_is_locked(&nd->path.dentry->d_inode->i_mutex));
+
+	printk(KERN_INFO "%s: copying up symlink\n", new->d_name.name);
+	/*
+	 * We want the contents of this symlink, not to follow it, so
+	 * this is modeled on generic_readlink() rather than
+	 * do_follow_link().
+	 */
+	nd->depth = ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Proof-of-concept implementation of user_path_nd().  Look up both the
parent and the target of a user-supplied filename, to supply later to
union copyup routines.
---
 fs/namei.c            |   31 +++++++++++++++++++++++++++++++
 include/linux/namei.h |    2 ++
 2 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 24e0cb2..68aa8ab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1563,6 +1563,37 @@ static int user_path_parent(int dfd, const char __user *path,
 	return error;
 }
 
+int user_path_nd(int dfd, const char __user *filename,
+			 unsigned flags, struct nameidata *parent_nd,
+			 struct path *child, char **tmp)
+{
+	struct nameidata child_nd;
+	char *s = getname(filename);
+	int error;
+
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+
+	/* Lookup parent */
+	error = do_path_lookup(dfd, s, LOOKUP_PARENT, parent_nd);
+	if (error)
+		goto out_putname;
+
+	/* Lookup child - XXX optimize, racy */
+	error = do_path_lookup(dfd, s, flags, &child_nd);
+	if (error)
+		goto out_path_put;
+	*child = child_nd.path;
+	*tmp = s;
+	return 0;
+
+out_path_put:
+	path_put(&parent_nd->path);
+out_putname:
+	putname(s);
+	return error;
+}
+
 /*
  * It's inline, so penalty for filesystems that don't use sticky bit is
  * minimal.
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 05b441d..83dc8b5 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -58,6 +58,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_RENAME_TARGET	0x0800
 
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
+extern int user_path_nd(int, const char __user *, unsigned,
+			struct nameidata *, struct path *, char **);
 
 #define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path)
 #define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path)
-- 
1.6.3.3

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Split inode_permission() into inode and file-system-dependent parts.
Create path_permission() to check permission based on the path to the
inode.  This is for union mounts, in which an inode can be located on
a read-only lower layer file system but is still writable, since we
will copy it up to the writable top layer file system.  So in that
case, we want to ignore MS_RDONLY on the lower layer.  To make this
decision, we must know the path (vfsmount, dentry) of both the target
and its parent.
---
 fs/namei.c         |   92 ++++++++++++++++++++++++++++++++++++++++++++--------
 include/linux/fs.h |    1 +
 2 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 900df0f..24e0cb2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -241,29 +241,20 @@ int generic_permission(struct inode *inode, int mask,
 }
 
 /**
- * inode_permission  -  check for access rights to a given inode
+ * __inode_permission  -  check for access rights to a given inode
  * @inode:	inode to check permission on
  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
  *
  * Used to check for read/write/execute permissions on an inode.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things.
+ *
+ * This does not check for a read-only file system.  You probably want
+ * inode_permission().
  */
-int inode_permission(struct inode *inode, int mask)
+static int __inode_permission(struct inode *inode, int mask)
 {
 	int retval;
 
 	if (mask & MAY_WRITE) {
-		umode_t mode = inode->i_mode;
-
-		/*
-		 * Nobody gets write access to a read-only fs.
-		 */
-		if (IS_RDONLY(inode) &&
-		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
-			return -EROFS;
-
 		/*
 		 * Nobody gets write access to an immutable file.
 		 */
@@ -288,6 +279,79 @@ int inode_permission(struct inode *inode, int mask)
 }
 
 /**
+ * sb_permission  -  check superblock-level ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

readdir() in union mounts is implemented by copying up all visible
directory entries from the lower level directories to the topmost
directory.  Directory entries that refer to lower level file system
objects are marked as "fallthru" in the topmost directory.

Thanks to Felix Fietkau <nbd@openwrt.org> for a bug fix.

Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Signed-off-by: Felix Fietkau <nbd@openwrt.org>
---
 fs/readdir.c          |    9 +++
 fs/union.c            |  160 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/union.h |    2 +
 3 files changed, 171 insertions(+), 0 deletions(-)

diff --git a/fs/readdir.c b/fs/readdir.c
index 3a48491..da71515 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -16,6 +16,8 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
+#include <linux/union.h>
+#include <linux/mount.h>
 
 #include <asm/uaccess.h>
 
@@ -36,9 +38,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
 
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
+		if (IS_UNIONED_DIR(&file->f_path) && !IS_OPAQUE(inode)) {
+			res = union_copyup_dir(&file->f_path);
+			if (res)
+				goto out_unlock;
+		}
+
 		res = file->f_op->readdir(file, buf, filler);
 		file_accessed(file);
 	}
+out_unlock:
 	mutex_unlock(&inode->i_mutex);
 out:
 	return res;
diff --git a/fs/union.c b/fs/union.c
index 8ad9de7..e2384ad 100644
--- a/fs/union.c
+++ b/fs/union.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2007-2009 Novell Inc.
  *
  *   Author(s): Jan Blunck (j.blunck@tu-harburg.de)
+ *              Valerie Aurora <vaurora@redhat.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -23,6 +24,8 @@
 #include <linux/slab.h>
 #include <linux/union.h>
 #include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/security.h>
 
 /*
  * This is borrowed from fs/inode.c. The hashtable for lookups. ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Implement unioned directories, whiteouts, and fallthrus in pathname
lookup routines.  do_lookup() and lookup_hash() call lookup_union()
after looking up the dentry from the top-level file system.
lookup_union() is centered around __lookup_hash(), which does cached
and/or real lookups and revalidates each dentry in the union stack.

The added cost to a non-union mount pathname lookup in a
CONFIG_UNION_MOUNT kernel is either one or two mount flag tests per
pathname component, in needs_union_lookup().

XXX - implement negative union cache entries
---
 fs/namei.c            |  191 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/union.c            |   67 +++++++++++++++++
 include/linux/union.h |    9 +++
 3 files changed, 266 insertions(+), 1 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 991767b..b179062 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
 #include <linux/fcntl.h>
 #include <linux/device_cgroup.h>
 #include <linux/fs_struct.h>
+#include <linux/union.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -722,6 +723,181 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 	follow_mount(&nd->path);
 }
 
+static struct dentry *__lookup_hash(struct qstr *name, struct dentry *base,
+				    struct nameidata *nd);
+
+/*
+ * __lookup_union - Given a path from the topmost layer, lookup and
+ * revalidate each dentry in its union stack, building it if necessary
+ *
+ * @nd - nameidata for the parent of @topmost
+ * @name - pathname from this element on
+ * @topmost - path of the topmost matching dentry
+ *
+ * Given the nameidata and the path of the topmost dentry for this
+ * pathname, lookup, revalidate, and build the associated union stack.
+ * @topmost must be either a negative dentry or a directory.
+ *
+ * This function is called both to build a new union stack and to
+ * revalidate a pre-existing union stack.  So we must cope with
+ * already existing union cache entries.
+ *
+ * This function may stomp ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

This patch adds the basic structures and operations of VFS-based union
mounts (but not the ability to mount or lookup unioned file systems).
Each directory in a unioned file system has an associated union stack
created when the directory is first looked up.  The union stack is a
structure kept in a hash table indexed by mount and dentry of the
directory; thus, specific paths are unioned, not dentries alone.  The
union stack keeps a pointer to the upper path and the lower path and
can be looked up by either path.

This particular version of union mounts is based on ideas by Jan
Blunck, Bharata Rao, and many others.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 fs/Kconfig             |   13 ++
 fs/Makefile            |    1 +
 fs/dcache.c            |    4 +
 fs/union.c             |  289 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dcache.h |   20 ++++
 include/linux/mount.h  |    3 +
 include/linux/union.h  |   53 +++++++++
 7 files changed, 383 insertions(+), 0 deletions(-)
 create mode 100644 fs/union.c
 create mode 100644 include/linux/union.h

diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f07..c16b9db 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -59,6 +59,19 @@ source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
 
+config UNION_MOUNT
+       bool "Writable overlays (union mounts) (EXPERIMENTAL)"
+       depends on EXPERIMENTAL
+       help
+         Writable overlays allow you to mount a transparent writable
+	 layer over a read-only file system, for example, an ext3
+	 partition on a hard drive over a CD-ROM root file system
+	 image.
+
+	 See <file:Documentation/filesystems/union-mounts.txt> for details.
+
+	 If unsure, say N.
+
 source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa..9693730 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -52,6 +52,7 @@ ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

Add per mountpoint flag for Union Mount support. You need additional patches
to util-linux for that to work - see:

git://git.kernel.org/pub/scm/utils/util-linux-ng/val/util-linux-ng.git

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 fs/namespace.c        |    5 ++++-
 include/linux/fs.h    |    1 +
 include/linux/mount.h |    4 ++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 9a40282..5e4b27b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -808,6 +808,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 		{ MNT_NODIRATIME, ",nodiratime" },
 		{ MNT_RELATIME, ",relatime" },
 		{ MNT_STRICTATIME, ",strictatime" },
+		{ MNT_UNION, ",union" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
@@ -2018,10 +2019,12 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
 	if (flags & MS_RDONLY)
 		mnt_flags |= MNT_READONLY;
+	if (flags & MS_UNION)
+		mnt_flags |= MNT_UNION;
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
-		   MS_STRICTATIME);
+		   MS_STRICTATIME | MS_UNION);
 
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a5ba718..4dae882 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -192,6 +192,7 @@ struct inodes_stat_t {
 #define MS_REMOUNT	32	/* Alter flags of a mounted FS */
 #define MS_MANDLOCK	64	/* Allow mandatory locks on an FS */
 #define MS_DIRSYNC	128	/* Directory modifications are synchronous */
+#define MS_UNION	256	/* Merge namespace with FS mounted below */
 #define MS_NOATIME	1024	/* Do not update access times. */
 #define MS_NODIRATIME	2048	/* Do not update directory access ...
From: Valerie Aurora
Date: Wednesday, April 28, 2010 - 1:19 pm

I'm sorry I haven't responded sooner; I've been trying to write a
detailed useful message and that turns out to be hard.  I'll just
include a few of the highlights; mainly I want to say that I'd
rather do it the way you describe but when I tried it ended up even
uglier than the VFS implementation.

I went down this road initially (do most of the unioning in a file
system) and spent a couple of months on it.  But I always ended up
having to do some level of copy-around and redirection similar to that
in unionfs.

One of the major difficulties that arises even when doing unioning at
the VFS level is keeping around the parent's path in order to do the
copyup later on.  Take a look at the code pattern in the "union-mount:
Implement union-aware syscall()" series of patches.  That's the
prettiest and most efficient version I could come up with, after two
other implementations, and it's in the VFS, at the vfs_foo_syscall()
level.  I don't even know how I would start if I had to wait until the
file system op is called.

If you have some insights on how to do this, I'd love to hear them.  I
don't enjoy writing VFS code for the fun of it. :)

Thanks,

-VAL


--

From: J. R. Okajima
Date: Thursday, April 29, 2010 - 9:10 am

I agree that is prettiest, and copyup at open for write makes it easier.
But some applications issue mmap(MAP_PRIVATE) after open(O_RDWR), for
example modprobe(8). In this case, every kernel module will be copied-up
and it must be a waste of time and space. And I guess this is one reason
why other implementations took the approach of copyup at write.
At the same time, I guess this issue may be less important since other
parts are pretty enough.


J. R. Okajima
--

From: Valerie Aurora
Date: Thursday, April 29, 2010 - 1:20 pm

Sure.  The short version is that unionfs has to allocate another copy
of each file system structure - inode, etc. - and then keep an array
of the matching structures from each of the file system layers.  Each
unionfs file system op copies data up and down between the unionfs
structures and the underlying structures, and then calls the lower
file system op as necessary.  Often it has to duplicate code from the
VFS before calling the lower file system ops.

Where union mounts has the advantage is that we make zero copies of
file system data structures and therefore don't need copyup or
interposition on as many ops.  But if you wait until the file system
op is called, you have to attach your union-related data to the
associated data structure, and the underlying file system is already
using the private data pointer.  And you have to keep a copy of the
underlying file system ops.  And each data structure can be part of
multiple unions.  So you end up with an effective second copy of the

Unfortunately, dentries aren't unioned - paths (dentry/mnt pairs) are.
So you can get the parent dentry in the file system op, but the dentry
is potentially part of many different mounts.  There's no mapping from
a lower-level read-only dentry to the covering read-write parent
dentry because the read-only dentry could potentially be mounted in 5
different places.  Which union mount is this dentry part of?  You have
to record the parent's path during lookup and carry it around until
you do the copyup - for every syscall that alters a file, not just
open() and write(), but chmod(), etc.  So if you implement it in the
VFS, you don't have to carry that info across the file system op
boundary.

I think the chmod() case really shows the issues well.  user_path_nd()
records the parent's path during lookup (in an inefficient, possibly
racy manner), then union_copyup() does the copy (too early, before a
lot of permission checks).  The underlying file system doesn't get
involved until the ->setattr() call ...
From: Miklos Szeredi
Date: Monday, May 10, 2010 - 5:57 am

Let's not over-generalize the problem.  Current implementation has the
following properties:

 a) one read-only layer and one read-write layer
 b) for each non-directory only one of the layers is relevant
 c) for directories both layers may be relevant

Yep, and that can be fixed by adding better helpers to the VFS which
do all the locking magic, etc, and are supplied with a "struct path"

My proposal a few mails back would eliminate that as well.

Imagine it like this: you have a filesystem full of symlinks.  Special
symlinks, in fact, which are always followed and so are invisible from
userspace.

What this means is that it's not necessary to interposition on any
operation that:

 - doesn't modify the file and is not a lookup, or

For the simplified case all we need in each union fs inode is a
reference to a dentry in either the lower or upper layers, and in the
rare case of a unioned directory a reference to a dentry in both.

And notice, that is exactly the same as what you have in union mounts.
It shouldn't make too much difference if those refs are stored in a

That's simply not true, we are unioning *filesystems* not paths.  And
that's true of union mounts as well.

Unioning of complete namespaces is a completely different, and orders

And that's irrelevant.  The union filesystem can make a pair of
private, kernel-only, mounts for the underlying filesystems and be

If we keep to the simplified rules, then there's no need to map from a

The same can be done with the union fs, except copy up is done by the
union filesystem instead of the VFS.  And that is _after_ the

Not quite, allowing unprivileged users to trigger arbitrary copy-up is
clearly a DoS.

Thanks,
Miklos
--

From: Miklos Szeredi
Date: Thursday, April 29, 2010 - 2:33 am

I haven't looked at unionfs in a long time.  Can you say something

On a high level I don't see a problem, the parent of every dentry can
be found through ->d_parent.

One issue is having to duplicate some locking and other stuff around
vfs_whatever() calls.  But that could be fixed by exporting suitable
helpers from the VFS.

Other than that I don't see any fundamental issues with union
filesystems (except that they seem to grow too many features to be
maintainable).

Thanks,
Miklos
--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Felix Fietkau <nbd@openwrt.org>

Add support for fallthru dentries to jffs2.

Cc: David Woodhouse <dwmw2@infradead.org>
Cc: linux-mtd@lists.infradead.org
Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 fs/jffs2/dir.c        |   36 +++++++++++++++++++++++++++++++++---
 include/linux/jffs2.h |    6 ++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index c259193..98397b3 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -35,6 +35,7 @@ static int jffs2_rename (struct inode *, struct dentry *,
 			 struct inode *, struct dentry *);
 
 static int jffs2_whiteout (struct inode *, struct dentry *, struct dentry *);
+static int jffs2_fallthru (struct inode *, struct dentry *);
 
 const struct file_operations jffs2_dir_operations =
 {
@@ -59,6 +60,7 @@ const struct inode_operations jffs2_dir_inode_operations =
 	.rename =	jffs2_rename,
 	.check_acl =	jffs2_check_acl,
 	.whiteout =     jffs2_whiteout,
+	.fallthru =     jffs2_fallthru,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
@@ -103,10 +105,14 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
 	}
 	if (fd) {
 		spin_lock(&target->d_lock);
-		if (fd->type == DT_WHT)
+		switch (fd->type) {
+		case DT_WHT:
 			target->d_flags |= DCACHE_WHITEOUT;
-		else
+		case JFFS2_DT_FALLTHRU:
+			target->d_flags |= DCACHE_FALLTHRU;
+		default:
 			ino = fd->ino;
+		}
 		spin_unlock(&target->d_lock);
 	}
 	mutex_unlock(&dir_f->sem);
@@ -164,7 +170,10 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				  fd->name, fd->ino, fd->type, curofs, offset));
 			continue;
 		}
-		if (!fd->ino) {
+		if (fd->type == JFFS2_DT_FALLTHRU)
+			/* XXX Should really do a lookup for the real inode number here */
+			fd->ino = 100;
+		else if (!fd->ino && (fd->type != DT_WHT)) {
 			D2(printk(KERN_DEBUG "Skipping deletion ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Felix Fietkau <nbd@openwrt.org>

Add support for whiteout dentries to jffs2.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: linux-mtd@lists.infradead.org
---
 fs/jffs2/dir.c        |   72 +++++++++++++++++++++++++++++++++++++++++++++++-
 fs/jffs2/fs.c         |    4 +++
 fs/jffs2/super.c      |    2 +-
 include/linux/jffs2.h |    2 +
 4 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417..c259193 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -34,6 +34,8 @@ static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t);
 static int jffs2_rename (struct inode *, struct dentry *,
 			 struct inode *, struct dentry *);
 
+static int jffs2_whiteout (struct inode *, struct dentry *, struct dentry *);
+
 const struct file_operations jffs2_dir_operations =
 {
 	.read =		generic_read_dir,
@@ -56,6 +58,7 @@ const struct inode_operations jffs2_dir_inode_operations =
 	.mknod =	jffs2_mknod,
 	.rename =	jffs2_rename,
 	.check_acl =	jffs2_check_acl,
+	.whiteout =     jffs2_whiteout,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
@@ -98,8 +101,14 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
 			fd = fd_list;
 		}
 	}
-	if (fd)
-		ino = fd->ino;
+	if (fd) {
+		spin_lock(&target->d_lock);
+		if (fd->type == DT_WHT)
+			target->d_flags |= DCACHE_WHITEOUT;
+		else
+			ino = fd->ino;
+		spin_unlock(&target->d_lock);
+	}
 	mutex_unlock(&dir_f->sem);
 	if (ino) {
 		inode = jffs2_iget(dir_i->i_sb, ino);
@@ -498,6 +507,11 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 		return PTR_ERR(inode);
 	}
 
+	if (dentry->d_flags & DCACHE_WHITEOUT) {
+		inode->i_flags |= S_OPAQUE;
+		ri->flags = cpu_to_je16(JFFS2_INO_FLAG_OPAQUE);
+	}
+
 	inode->i_op = &jffs2_dir_inode_operations;
 	inode->i_fop = ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Add support for fallthru directory entries to ext2.

XXX - Makes up inode number for fallthru entry
XXX - Might be better implemented as special symlinks

Cc: Theodore Tso <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Signed-off-by: Jan Blunck <jblunck@suse.de>
---
 fs/ext2/dir.c           |   92 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/ext2/ext2.h          |    1 +
 fs/ext2/namei.c         |   22 +++++++++++
 include/linux/ext2_fs.h |    1 +
 4 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 030bd46..f3b4aff 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -219,7 +219,8 @@ static inline int ext2_match (int len, const char * const name,
 {
 	if (len != de->name_len)
 		return 0;
-	if (!de->inode && (de->file_type != EXT2_FT_WHT))
+	if (!de->inode && ((de->file_type != EXT2_FT_WHT) &&
+			   (de->file_type != EXT2_FT_FALLTHRU)))
 		return 0;
 	return !memcmp(name, de->name, len);
 }
@@ -256,6 +257,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
 	[EXT2_FT_SOCK]		= DT_SOCK,
 	[EXT2_FT_SYMLINK]	= DT_LNK,
 	[EXT2_FT_WHT]		= DT_WHT,
+	[EXT2_FT_FALLTHRU]	= DT_UNKNOWN,
 };
 
 #define S_SHIFT 12
@@ -342,6 +344,24 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 					ext2_put_page(page);
 					return 0;
 				}
+			} else if (de->file_type == EXT2_FT_FALLTHRU) {
+				int over;
+				unsigned char d_type = DT_UNKNOWN;
+
+				offset = (char *)de - kaddr;
+				/* XXX We don't know the inode number
+				 * of the directory entry in the
+				 * underlying file system.  Should
+				 * look it up, either on fallthru
+				 * creation at first readdir or now at
+				 * filldir time. */
+				over = filldir(dirent, de->name, de->name_len,
+					       (n<<PAGE_CACHE_SHIFT) | offset,
+					       123 /* Made up ino */, d_type);
+				if (over) {
+					ext2_put_page(page);
+					return 0;
+				}
 			}
 			filp->f_pos += ...
From: David Woodhouse
Date: Monday, April 19, 2010 - 6:02 am

I certainly asked whether you really need a real 'struct inode' for
whiteouts, and suggested that they should be represented _purely_ as a
dentry with type DT_WHT.

I don't much like the manifestation of that in this patch though,
especially with the made-up inode number. (ISTR I had other
jffs2-specific objections too, which I'll dig out and forward).

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation

--

From: Jan Blunck
Date: Monday, April 19, 2010 - 6:23 am

Yes, these patches still have issues that Val and I are aware of. I can't
remember anything jffs2-specific though.

We return that inode number because we don't want to lookup the name on the
other filesystem during readdir. Therefore returning DT_UNKNOWN to let the
userspace decide if it needs to stat the file was the easiest workaround. I
know that POSIX requires d_ino and d_name but on the other hand it does not
require anything more on how long d_ino is valid.

If somebody has an idea how to make this cleaner please speak up.

Regards,
	Jan

-- 
Jan Blunck <jblunck@suse.de>
--

From: Jan Blunck
Date: Monday, April 19, 2010 - 7:12 am

Hmm, why not. Or even the ino of the directory we are reading from ...

Regards,
	Jan

-- 
Jan Blunck <jblunck@suse.de>
--

From: Valerie Aurora
Date: Monday, April 19, 2010 - 7:23 am

I don't recall there being any technical reason not to look up the
real inode number.  I just wrote it that way because I was lazy.  So I
like returning the directory's d_ino better than a single magic
number, but I'd at least like to try returning the real inode number
too.

-VAL
--

From: Jan Blunck
Date: Wednesday, April 21, 2010 - 1:42 am

No, for stat() you do a lookup and that is returning the correct dentry/inode
for the filesystem the name is on.

We just return the fallthru directory entries to give userspace an offset
that they can seekdir() to.

Regards,
	Jan

-- 
Jan Blunck <jblunck@suse.de>
--

From: Jamie Lokier
Date: Wednesday, April 21, 2010 - 2:22 am

Hmm.  I smell potential confusion for some otherwise POSIX-friendly
userspaces.

When I open /path/to/foo, call fstat (st_dev=2, st_ino=5678), and then
keep the file open, then later do a readdir which includes foo
(dir.st_dev=1, d_ino=1234), I'm going to immediately assume a rename
or unlink happened, close the file, abort streaming from it, refresh
the GUI windows, refresh application caches for that name entry, etc.

Because in the POSIX world I think open files have stable inode
numbers (as long as they are open), and I don't think that an open
file can have its name's d_ino not match the inode number unless it's
a mount point, which my program would know about.

This plays into inotify, where you have to know if you are monitoring
every directory that contains a link to a file, to know if you need to
monitor the file itself directly instead.

Now I think it's fair enough that a union mount doesn't play all the
traditional rules :-)  C'est la vie.

This mismatch of (dir.st_dev,d_ino) and st_ino strongly resembles a
file-bind-mount.  Like bind mounts, it's quite annoying for programs
that like to assume they've seen all of a file's links when they've
seen i_nlink of them.

Bind mounts can be detected by looking in /proc/mounts.  st_dev
changing doesn't work because it can be a binding of the same
filesystem.

How would I go about detecting when a union mount's directory entry
has similar behaviour, without calling stat() on each entry?  Is it
just a matter of recognising a particular filesystem name in
/proc/mounts, or something more?

Thanks,
-- Jamie
--

From: Jamie Lokier
Date: Wednesday, April 21, 2010 - 2:52 am

Sorry, no: That does not work for bind mounts.  Both layers can have
the same st_dev.  Nor does O_NOFOLLOW stop traversal in the middle of
a path, there is no handy O_NOCROSSMOUNTS, and no st_mode flag or
d_type to say it's a bind mount.  Bind mounts are really a big pain
for i_nlink+inotify name counting.

Besides, calling stat() on every entry in a large directory to check
st_ino can be orders of magnitude slower than readdir() on a large
directory - especially with a cold cache.  It is quicker, but much
more complicated, to parse /proc/mounts and apply arcane rules to find
the exceptions.


I agree, and union mount is a very useful feature that's worth
breaking a few apps for :-)

I'm curious if there's a clear way to go about it in this case, or
if it'll involve a certain amount of pattern recognition in /proc/mounts.

Basically I'm wondering if it's been thought about already.

-- Jamie
--

From: Miklos Szeredi
Date: Wednesday, April 21, 2010 - 3:17 am

I'm confused.  You are monitoring a specific file and would like to
know if something is happening to any of its links, right?

Why do you need to know about bind mounts for that?

Count the number of times you encounter that d_ino and if that matches
i_nlink then every directory is monitored.  Simple as that, no?

Thanks,
Miklos
--

From: Jamie Lokier
Date: Wednesday, April 21, 2010 - 10:36 am

Not quite. I'm monitoring a million files (say), so I must use
directory watches for most of them.  I need directory watches anyway,
when the semantic is "calling open on /path/to/file and reading would
return the same data", because renames and unlinks are also a way to
invalidate monitored file contents.

At a high level, what we're talking about is the ability to cache and
verify the validity of information derived from reading files in the
filesystem, in a manner which efficiently triggers invalidation only
on changes.  Being able to answer, as quickly as possible, "if I read
this, that and other, will I get the same results as the last time I
did those operations, without having to actually do them to check".

When I see a file has i_nlink > 1, I must watch the file directly
using a file-watch (with inotify; polling with stat() with dnotify),
_unless_ I have seen all the links to that file.

When I've seen all the links to a file, I know that my directory
watches on the directories containing those links are sufficient to
detect changes to the file contents.  That's because every
file change will get notified on at least one of those paths.

I learn that I've seen all the links by seeing d_ino during readdir as
you suggested, or by st_ino in the cases where I've not had reason to
readdir and I have needed to open the file or call stat.

Let's look at some bind mounts.  One where st_ino doesn't work:

    /dirA/file1  [hard link to inode 100, i_nlink = 2]
    /dirA/bound  [bind mount, has /dirA/file1 mounted on it]
    /dirB/file2  [hard link to inode 100, i_nlink = 2]

If the program is asked to open /dirA/file1 and /dirA/bound at various
times, and never asked to readdir /dirA, it will have used fstat not
readdir, seen the same (st_dev,st_ino,i_nlink = 2), and _wrongly_
concluded that it is monitoring all directories containing paths to
the file.

To avoid that problem, it parses /proc/mounts and detects that
/dirA/bound does not contribute to the link count.  ...
From: Valerie Aurora
Date: Wednesday, April 21, 2010 - 2:38 pm

I couldn't have put it better myself.

To expand slightly, if the broken apps are not few and easily fixed,
then we'll go back and make the kernel more complicated.  I'd like to
try the simplest version we think will work, first.

Thanks!

-VAL
--

From: Jamie Lokier
Date: Wednesday, April 21, 2010 - 3:10 pm

Don't worry, I'm not trying to deviate you from that good plan.

Just throwing questions out to find what's a good and simple answer to
these little open questions to minimise trouble.

-- Jamie
--

From: J. R. Okajima
Date: Thursday, April 22, 2010 - 3:30 am

In addition to the inode number of fallthru/readdir, hardlinks in a union
mount may be a problem. If you open a hardlinked file for writing or
try to chmod it, the internal copyup will happen and the hardlink will be
destroyed. For instance, when fileA and fileB are hardlinked on the
lower layer and the contents of fileA are modified (copyup happens), you
will not see the latest contents via fileB.
And the IN_CREATE event may be fired to the parent dir if you monitor
it, I am afraid.

(I have pointed out this issue before, but the posted document didn't
seem to cover it)


J. R. Okajima
--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

Define the fallthru dcache flag and file system op.  Mask out the
DCACHE_FALLTHRU flag on dentry creation.  Actual users and changes to
lookup come in later patches.

Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 Documentation/filesystems/vfs.txt |    6 ++++++
 fs/dcache.c                       |    2 +-
 include/linux/dcache.h            |    6 ++++++
 include/linux/fs.h                |    1 +
 4 files changed, 14 insertions(+), 1 deletions(-)

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 8846b4f..29f3476 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -320,6 +320,7 @@ struct inode_operations {
 	int (*rmdir) (struct inode *,struct dentry *);
 	int (*mknod) (struct inode *,struct dentry *,int,dev_t);
 	int (*whiteout) (struct inode *, struct dentry *, struct dentry *);
+	int (*fallthru) (struct inode *, struct dentry *);
 	int (*rename) (struct inode *, struct dentry *,
 			struct inode *, struct dentry *);
 	int (*readlink) (struct dentry *, char __user *,int);
@@ -390,6 +391,11 @@ otherwise noted.
         second is the dentry for the whiteout itself.  This method
         must unlink() or rmdir() the original entry if it exists.
 
+  fallthru: called by the readdir(2) system call on a layered file
+        system.  Only required if you want to support fallthrus.
+        Fallthrus are place-holders for directory entries visible from
+        a lower level file system.
+
   rename: called by the rename(2) system call to rename the object to
 	have the parent and name given by the second inode and dentry.
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 3b0e525..b76f9e4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -993,7 +993,7 @@ EXPORT_SYMBOL(d_alloc_name);
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	if (inode) {
-		dentry->d_flags &= ~DCACHE_WHITEOUT;
+		dentry->d_flags &= ~(DCACHE_WHITEOUT|DCACHE_FALLTHRU);
 ...
From: David Woodhouse
Date: Monday, April 19, 2010 - 6:03 am

This doesn't seem to have incorporated my feedback from the attached...

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation
From: Valerie Aurora
Date: Monday, April 19, 2010 - 7:26 am

Hm, I'm not sure whether I lost the patch in a rebase or didn't have
time to test it or what.  I was hoping someone who actually knows
JFFS2 like Felix or you would get to it first - in general, I'd like
the underlying file system maintainers to implement whiteouts and
fallthrus since they know them best.  Felix, if you implemented it and
I lost the patch, my apologies to you.

Thanks David,



--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

This patch adds whiteout support to EXT2. A whiteout is an empty directory
entry (inode == 0) with the file type set to EXT2_FT_WHT. Therefore it
allocates space in directories. Due to being implemented as a filetype it is
necessary to have the EXT2_FEATURE_INCOMPAT_FILETYPE flag set.

XXX - Whiteouts could be implemented as special symbolic links

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext2/dir.c           |   96 +++++++++++++++++++++++++++++++++++++++++++++--
 fs/ext2/ext2.h          |    3 +
 fs/ext2/inode.c         |   11 ++++-
 fs/ext2/namei.c         |   67 +++++++++++++++++++++++++++++++-
 fs/ext2/super.c         |    6 +++
 include/linux/ext2_fs.h |    4 ++
 6 files changed, 177 insertions(+), 10 deletions(-)

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 57207a9..030bd46 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -219,7 +219,7 @@ static inline int ext2_match (int len, const char * const name,
 {
 	if (len != de->name_len)
 		return 0;
-	if (!de->inode)
+	if (!de->inode && (de->file_type != EXT2_FT_WHT))
 		return 0;
 	return !memcmp(name, de->name, len);
 }
@@ -255,6 +255,7 @@ static unsigned char ext2_filetype_table[EXT2_FT_MAX] = {
 	[EXT2_FT_FIFO]		= DT_FIFO,
 	[EXT2_FT_SOCK]		= DT_SOCK,
 	[EXT2_FT_SYMLINK]	= DT_LNK,
+	[EXT2_FT_WHT]		= DT_WHT,
 };
 
 #define S_SHIFT 12
@@ -448,6 +449,26 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 	return res;
 }
 
+/* Special version for filetype based whiteout support */
+ino_t ext2_inode_by_dentry(struct inode *dir, struct dentry *dentry)
+{
+	ino_t res = 0;
+	struct ext2_dir_entry_2 *de;
+	struct page *page;
+
+	de = ext2_find_entry (dir, &dentry->d_name, &page);
+	if (de) {
+		res = le32_to_cpu(de->inode);
+		if (!res && de->file_type == EXT2_FT_WHT) {
+			spin_lock(&dentry->d_lock);
+			dentry->d_flags |= ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

Whiteout a given directory entry.  File systems that support whiteouts
must implement the new ->whiteout() directory inode operation.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
---
 Documentation/filesystems/vfs.txt |   10 +++-
 fs/dcache.c                       |    4 +-
 fs/namei.c                        |  133 +++++++++++++++++++++++++++++++++++++
 include/linux/dcache.h            |    6 ++
 include/linux/fs.h                |    2 +
 5 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 3de2f32..8846b4f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -308,7 +308,7 @@ struct inode_operations
 -----------------------
 
 This describes how the VFS can manipulate an inode in your
-filesystem. As of kernel 2.6.22, the following members are defined:
+filesystem. As of kernel 2.6.33, the following members are defined:
 
 struct inode_operations {
 	int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
@@ -319,6 +319,7 @@ struct inode_operations {
 	int (*mkdir) (struct inode *,struct dentry *,int);
 	int (*rmdir) (struct inode *,struct dentry *);
 	int (*mknod) (struct inode *,struct dentry *,int,dev_t);
+	int (*whiteout) (struct inode *, struct dentry *, struct dentry *);
 	int (*rename) (struct inode *, struct dentry *,
 			struct inode *, struct dentry *);
 	int (*readlink) (struct dentry *, char __user *,int);
@@ -382,6 +383,13 @@ otherwise noted.
 	will probably need to call d_instantiate() just as you would
 	in the create() method
 
+  whiteout: called by the rmdir(2) and unlink(2) system calls on a
+        layered file system.  Only required if you want to support
+        whiteouts.  The first dentry passed in is that for the old
+        dentry if it exists, and a negative ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

Userspace isn't ready for handling another file type, so silently drop
whiteout directory entries before they leave the kernel.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: linux-nfs@vger.kernel.org
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Neil Brown <neilb@suse.de>
---
 fs/compat.c       |    9 +++++++++
 fs/nfsd/nfs3xdr.c |    5 +++++
 fs/nfsd/nfs4xdr.c |    5 +++++
 fs/nfsd/nfsxdr.c  |    4 ++++
 fs/readdir.c      |    9 +++++++++
 5 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index 00d90c2..624e1a5 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -838,6 +838,9 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen,
 	struct compat_old_linux_dirent __user *dirent;
 	compat_ulong_t d_ino;
 
+	if (d_type == DT_WHT)
+		return 0;
+
 	if (buf->result)
 		return -EINVAL;
 	d_ino = ino;
@@ -909,6 +912,9 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
 	compat_ulong_t d_ino;
 	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t));
 
+	if (d_type == DT_WHT)
+		return 0;
+
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
 		return -EINVAL;
@@ -998,6 +1004,9 @@ static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t
 	int reclen = ALIGN(jj + namlen + 1, sizeof(u64));
 	u64 off;
 
+	if (d_type == DT_WHT)
+		return 0;
+
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
 		return -EINVAL;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2a533a0..9b96f5a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -885,6 +885,11 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
 	int		elen;		/* estimated entry length in words */
 	int		num_entry_words = 0;	/* actual number of words */
 
+	if (d_type ...
From: J. Bruce Fields
Date: Friday, April 16, 2010 - 8:59 am

Seems OK.  (Though is there any way we could avoid having to add the
check to every filldir callback?  Isn't the default going to be
disinterest in whiteouts?  How are we avoiding all the same checks in
the case of lookup?)

--b.

--

From: Jan Blunck
Date: Monday, April 19, 2010 - 5:37 am

Bruce,

the alternative would be to include the check in the fs readdir()
implementation, and therefore prevent the call of the filler. I think this
patch would be even bigger.

--

From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:45 pm

From: Jan Blunck <jblunck@suse.de>

(Resend due to kernel.org complaining about XXX in original subject)

XXX - This is broken and included just to make union mounts work.  See
discussion at:

http://kerneltrap.org/mailarchive/linux-fsdevel/2010/1/15/6708053/thread

Original commit message:

This is a bugfix/replacement for commit
051d381259eb57d6074d02a6ba6e90e744f1a29f:

    During a path walk if an autofs trigger is mounted on a dentry,
    when the follow_link method is called, the nameidata struct
    contains the vfsmount and mountpoint dentry of the parent mount
    while the dentry that is passed in is the root of the autofs
    trigger mount.  I believe it is impossible to get the vfsmount of
    the trigger mount, within the follow_link method, when only the
    parent vfsmount and the root dentry of the trigger mount are
    known.

The solution in this commit was to replace the path embedded in the
parent's nameidata with the path of the link itself in
__do_follow_link().  This is a relatively harmless misuse of the
field, but union mounts ran into a bug during follow_link() caused by
the nameidata containing the wrong path (we count on it being what it
is in all other places - the path of the parent).

A cleaner and easier to understand solution is to save the necessary
vfsmount in the autofs superblock info when it is mounted.  Then we
can easily update the vfsmount in autofs4_follow_link().

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Acked-by: Ian Kent <raven@themaw.net>
Cc: autofs@linux.kernel.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |    1 +
 fs/autofs4/init.c     |   11 ++++++++++-
 fs/autofs4/root.c     |    6 ++++++
 fs/namei.c            |    7 ++-----
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3d283ab..de3af64 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -133,6 ...
From: Valerie Aurora
Date: Thursday, April 15, 2010 - 4:04 pm

From: Jan Blunck <jblunck@suse.de>

This patch changes lookup_hash() into returning a struct path.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c |  113 ++++++++++++++++++++++++++++++-----------------------------
 1 files changed, 57 insertions(+), 56 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 981f72c..9013d17 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1155,7 +1155,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
-		struct dentry *base, struct nameidata *nd)
+				    struct dentry *base, struct nameidata *nd)
 {
 	struct dentry *dentry;
 	struct inode *inode;
@@ -1212,14 +1212,22 @@ out:
  * needs parent already locked. Doesn't follow mounts.
  * SMP-safe.
  */
-static struct dentry *lookup_hash(struct nameidata *nd)
+static int lookup_hash(struct nameidata *nd, struct qstr *name,
+		       struct path *path)
 {
 	int err;
 
 	err = exec_permission(nd->path.dentry->d_inode);
 	if (err)
-		return ERR_PTR(err);
-	return __lookup_hash(&nd->last, nd->path.dentry, nd);
+		return err;
+	path->mnt = nd->path.mnt;
+	path->dentry =  __lookup_hash(name, nd->path.dentry, nd);
+	if (IS_ERR(path->dentry)) {
+		err = PTR_ERR(path->dentry);
+		path->dentry = NULL;
+		path->mnt = NULL;
+	}
+	return err;
 }
 
 static int __lookup_one_len(const char *name, struct qstr *this,
@@ -1700,12 +1708,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 	/* OK, it's O_CREAT */
 	mutex_lock(&dir->d_inode->i_mutex);
+	error = lookup_hash(nd, &nd->last, path);
 
-	path->dentry = lookup_hash(nd);
-	path->mnt = nd->path.mnt;
-
-	error = PTR_ERR(path->dentry);
-	if (IS_ERR(path->dentry)) {
+	if (error) {
 		mutex_unlock(&dir->d_inode->i_mutex);
 		goto exit;
 	}
@@ -1954,7 +1959,8 @@ EXPORT_SYMBOL(filp_open);
  */
 struct dentry ...
From: Randy Dunlap
Date: Wednesday, April 21, 2010 - 3:06 pm

Hi VAL,

In the future, please make patches 1-N reply/refer to patch 0 instead of to
the preceding numbered patch.


thanks,
---
~Randy
--

From: Valerie Aurora
Date: Wednesday, April 21, 2010 - 4:35 pm

Okay.

-VAL
--

Previous thread: mmotm 2010-04-15-14-42 uploaded by akpm on Thursday, April 15, 2010 - 2:42 pm. (25 messages)

Next thread: lockdep warning on block tree for-2.6.35 branch by Gui Jianfeng on Thursday, April 15, 2010 - 6:40 pm. (3 messages)