Re: [patch 2/5] fs: introduce new aops and infrastructure

Previous thread: Re: [patch 28/34] Xen-pv_ops: Xen SMP guest support by Benjamin LaHaise on Wednesday, March 14, 2007 - 6:28 am. (3 messages)

Next thread: kswapd & 2.4.21-47.0.0.1 by Konstantin Kalin on Wednesday, March 14, 2007 - 6:35 am. (4 messages)
From: Nick Piggin
Date: Wednesday, March 14, 2007 - 6:38 am

Add an iterator data structure to operate over an iovec. Add usercopy
operators needed by generic_file_buffered_write, and convert that function
over.

 include/linux/fs.h |   33 ++++++++++++
 mm/filemap.c       |  144 +++++++++++++++++++++++++++++++++++++++++++----------
 mm/filemap.h       |  103 -------------------------------------
 3 files changed, 150 insertions(+), 130 deletions(-)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -396,6 +396,39 @@ struct page;
 struct address_space;
 struct writeback_control;
 
+struct iov_iter {
+	const struct iovec *iov;
+	unsigned long nr_segs;
+	size_t iov_offset;
+	size_t count;
+};
+
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes);
+size_t iov_iter_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes);
+void iov_iter_advance(struct iov_iter *i, size_t bytes);
+int iov_iter_fault_in_readable(struct iov_iter *i);
+size_t iov_iter_single_seg_count(struct iov_iter *i);
+
+static inline void iov_iter_init(struct iov_iter *i,
+			const struct iovec *iov, unsigned long nr_segs,
+			size_t count, size_t written)
+{
+	i->iov = iov;
+	i->nr_segs = nr_segs;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iov_iter_advance(i, written);
+}
+
+static inline size_t iov_iter_count(struct iov_iter *i)
+{
+	return i->count;
+}
+
+
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -30,7 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
-#include "filemap.h"
+#include ...
From: Nick Piggin
Date: Wednesday, March 14, 2007 - 6:38 am

Introduce write_begin, write_end, and perform_write aops.

These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).

 Documentation/filesystems/Locking |    9 +
 Documentation/filesystems/vfs.txt |   38 ++++++
 drivers/block/loop.c              |   69 ++++-------
 fs/buffer.c                       |  144 ++++++++++++++++++++----
 fs/libfs.c                        |   40 ++++++
 fs/namei.c                        |   47 ++-----
 fs/splice.c                       |  106 +----------------
 include/linux/buffer_head.h       |    7 +
 include/linux/fs.h                |   29 ++++
 include/linux/pagemap.h           |    2 
 mm/filemap.c                      |  228 ++++++++++++++++++++++++++++++++++----
 11 files changed, 494 insertions(+), 225 deletions(-)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -449,6 +449,17 @@ struct address_space_operations {
 	 */
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, int intr,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
+
+	int (*perform_write)(struct file *, struct address_space *,
+				struct iov_iter *, loff_t);
+
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
@@ -463,6 +474,18 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 };
 
+/*
+ * ...
From: Mark Fasheh
Date: Wednesday, March 14, 2007 - 9:13 pm

Hi Nick,


Are we going to get rid of the file and intr arguments btw? I'm not sure
intr is useful, and mapping is probably enough to get whatever we inside
->write_begin / ->write_end.

Also, I noticed that you didn't export block_write_begin(),
simple_write_begin(), block_write_end() and simple_write_end() - I think we
want those for client modules.


Attached is a quick patch to hook up the existing ocfs2 write code. This has
been compile tested only for now - one of my test machines isn't
cooperating, so a runtime test will have to wait until tommorrow.

One interesting side effect is that we no longer pass AOP_TRUNCATE_PAGE up a
level. This gives callers less to deal with. And it means that ocfs2 doesn't
have to use the ocfs2_*_lock_with_page() cluster lock variants in
ocfs2_block_write_begin() because it can order cluster locks outside of the
page lock there.

My ocfs2 write rework will be a more serious user of these stuff, including
the fsdata backpointer. That code will also eliminate the entire
ocfs2_*_lock_with_page() cluster locking workarounds for write (they'll have
to remain for ->readpage()). I'm beginning work on cleaning those ocfs2
patches up and getting them plugged into this stuff. I hope to post them in
the next day or two.
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

ocfs2: Convert to new aops

Turn ocfs2_prepare_write() and ocfs2_commit_write() into ocfs2_write_begin()
and ocfs2_write_end().

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 93628b0..e7bcbbd 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -293,29 +293,30 @@ int ocfs2_prepare_write_nolock(struct in
 }
 
 /*
- * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
- * from loopback.  It must be able to perform its own locking around
- * ocfs2_get_block().
+ * ocfs2_write_begin() can be an outer-most ocfs2 call when it is
+ * called from elsewhere in ...
From: Nick Piggin
Date: Wednesday, March 14, 2007 - 9:36 pm

Yeah, I was going to, but I had this version ready to go so decided
to leave them in at the last minute. We can definitely take them out
if people agree.

However a side note about intr -- I wonder if it might be wise to
include a flags argument, in case we might want to add something like
that later? (definitely if we do keep intr, then it should be done as


OK that's very cool. I was hoping that would be the case. If GFS2 can
avoid that too, then we might be able to get rid of AOP_TRUNCATE_PAGE
handling from the legacy prepare/commit_write paths, which will make

OK, well I'll add this to my queue for now, and post the full patchset
after incorporating feedback I've had so far, and doing more testing,
so people can actually apply them and boot kernels.

Thanks,
Nick
-

From: Mark Fasheh
Date: Wednesday, March 14, 2007 - 11:11 pm

I don't see a problem with having a flags argument. It could give us some
flexibility in the future which would otherwise require a much bigger

Yeah - so long as we're not taking a page fault between write_begin /
write_end, there's no reason for the cluster locks to be taken and dropped
within the individual callbacks, which means we can just take them in
write_begin (where page lock ordering is possible) and hold them until

Great, thanks - it just occured to me that I should be holding the clusters
locks across the entire copy (as I point out above), so I'll have a slightly
updated version of this patch for you soon :)
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com
-

From: Joel Becker
Date: Wednesday, March 14, 2007 - 11:23 pm

You're really going to need the file argument around.  Some
folks care about file->private_data, etc.  A good example is
nfs_updatepage() from nfs_commit_write().  There's a context on the
filp.  Mapping can get back to the inode via ->host, but not to the
struct file.

Joel

-- 

Life's Little Instruction Book #157 

	"Take time to smell the roses."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127
-

From: Nick Piggin
Date: Thursday, March 15, 2007 - 1:04 am

OK, I'll keep the file around unless we see a better alternative.
Thanks for pointing that out.


-

From: Steven Whitehouse
Date: Thursday, March 15, 2007 - 9:24 am

Hi,

Yes, I agree that with the new operations GFS2 should also no longer
need AOP_TRUNCATE_PAGE for writes,

Steve.


-

From: Trond Myklebust
Date: Thursday, March 15, 2007 - 1:06 pm

Hell no! Struct file carries information that is essential for those of
us that use strong authentication. It stays.

Trond

-

From: Mark Fasheh
Date: Thursday, March 15, 2007 - 1:44 pm

Joel pointed this out yesterday for the nfs case. Not to worry, we'll keep
struct file around :)
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com
-

From: Dmitriy Monakhov
Date: Wednesday, March 14, 2007 - 2:28 pm

What about AOP_TRUNCATED_PAGE?  Off corse we can't just "goto retry" here :) ,
if ->commit_write return non negative value we return with sill locked page  look above at [1] 
<<<<OOPS why you new block wasn't fully zeroed? as it was done before.
At least this result in information leak in case of (stat == from)
just imagine fs with blocksize == 1k conains file with i_size == 4096 and 
fist two blocks not mapped (hole), now invoke write op from 1023 to 2048.
For example we succeed in allocating first block, but faile while allocating second
, then we call page_zero_new_buffers(...from == 1023, to == 2048)
  and then zerro only one last byte for first block, and set is uptodate
<<<<<< BTW: do_lo_send_aops() code itself is realy ugly, for example patrial 
Ohhh and now i'm totaly shure what you just forgot unlocking stuff

-

From: Nick Piggin
Date: Wednesday, March 14, 2007 - 8:55 pm

Yeah AOP_TRUNCATED_PAGE... I'm _hoping_ that OCFS2 and GFS2 will be able
to avoid that using write_begin/write_end, so the caller will not have to
know anything about it.

I don't know that commit_write can even return AOP_TRUNCATED_PAGE... we

Yeah, thanks. I think I converted all my filesystems to use write_begin /
write_end, so I probably didn't test this path :P. I do plan to go through
and try to individually test error cases and stress test it over the next

When we first invoke the write op, it should see were doing a partial
write into the first buffer and bring it uptodate first. I don't see the
problem, but again I do need to go through and exercise various cases

Well, the new code just works. The old code tried to support partial writes,
but it ends up just zeroing out the rest of the unwritten part of the page,
so it can actually overwrite real data.

This is one of the problems with prepare_write/commit_write: error handling
is fairly complex, and leaving it up to the callers is just insane because
we can't even get it right in mm/ :P


Anyway, thanks for having a look over it. You definitely spotted a bug
with the page locking.

Nick
-

From: Dmitriy Monakhov
Date: Thursday, March 15, 2007 - 2:44 am

One more note. It's looks like you just disabled all fancy zero copy logic.
Off corse this is just rfc patchset.
But i think where is fundamental problem with it:
 Previous logic was following:
  1)splice code responsible for: stealing(if possible) and loking the page
  2)prepare_write() code responsible for: do fs speciffic stuff

 But with new write_begin() logic  all steps (grubbing, locking, preparing)
 happened internaly inside write_begin() witch doesn't even know about what
 kind of data will be copied between write_begin/write_end.
 So fancy zero copy logic is impossible :(
I think this can be solved somehow, but i dont know yet, how can this be done

-

From: Nick Piggin
Date: Thursday, March 15, 2007 - 3:04 am

Check linux-mm: zero-copy splice is broken anyway, and AFAIKS it cannot

Actually we could do it with begin_write. All we need to do is set a
flag to say that *pagep contains a page that we can use, with a copy of
the write data on it.

The filesystem would then be able to insert that page into the pagecache
*if* it could handle such an operation.

OTOH, is splice stealing support really important? I guess it could be a
nice win for a small niche of workloads, and we probably don't want to
exclude it by design...

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 
-

From: Nick Piggin
Date: Wednesday, March 14, 2007 - 6:38 am

Implement new aops for some of the simpler filesystems.

 fs/configfs/inode.c |    4 ++--
 fs/sysfs/inode.c    |    4 ++--
 mm/shmem.c          |   16 ++++++++++------
 3 files changed, 14 insertions(+), 10 deletions(-)

Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -1419,10 +1419,14 @@ static const struct inode_operations shm
  * lets a tmpfs file be used read-write below the loop driver.
  */
 static int
-shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
-{
-	struct inode *inode = page->mapping->host;
-	return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
+shmem_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, int intr,
+			struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	*pagep = NULL;
+	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 }
 
 static ssize_t
@@ -2319,8 +2323,8 @@ static const struct address_space_operat
 	.writepage	= shmem_writepage,
 	.set_page_dirty	= __set_page_dirty_no_writeback,
 #ifdef CONFIG_TMPFS
-	.prepare_write	= shmem_prepare_write,
-	.commit_write	= simple_commit_write,
+	.write_begin	= shmem_write_begin,
+	.write_end	= simple_write_end,
 #endif
 	.migratepage	= migrate_page,
 };
Index: linux-2.6/fs/configfs/inode.c
===================================================================
--- linux-2.6.orig/fs/configfs/inode.c
+++ linux-2.6/fs/configfs/inode.c
@@ -40,8 +40,8 @@ extern struct super_block * configfs_sb;
 
 static const struct address_space_operations configfs_aops = {
 	.readpage	= simple_readpage,
-	.prepare_write	= simple_prepare_write,
-	.commit_write	= simple_commit_write
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
 };
 
 static struct backing_dev_info configfs_backing_dev_info = {
Index: ...
From: Nick Piggin
Date: Wednesday, March 14, 2007 - 6:38 am

Implement new aops for ext2.

 fs/ext2/inode.c |   12 ++++++++++++
 1 file changed, 12 insertions(+)

Index: linux-2.6/fs/ext2/inode.c
===================================================================
--- linux-2.6.orig/fs/ext2/inode.c
+++ linux-2.6/fs/ext2/inode.c
@@ -643,6 +643,16 @@ ext2_readpages(struct file *file, struct
 }
 
 static int
+ext2_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, int intr,
+		struct page **pagep, void **fsdata)
+{
+	*pagep = NULL;
+	return block_write_begin(file, mapping, pos, len, intr, pagep, fsdata,
+							ext2_get_block);
+}
+
+static int
 ext2_prepare_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
@@ -689,6 +699,8 @@ const struct address_space_operations ex
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
 	.sync_page		= block_sync_page,
+	.write_begin		= ext2_write_begin,
+	.write_end		= block_write_end,
 	.prepare_write		= ext2_prepare_write,
 	.commit_write		= generic_commit_write,
 	.bmap			= ext2_bmap,
-

From: Nick Piggin
Date: Wednesday, March 14, 2007 - 6:38 am

Implement new aops for ext3. Probably has some bugs in interaction with
journalling, and corner cases aren't tested/thought out fully, but it
boots and runs. I don't see a fundamental reason why it can't work...

 fs/ext3/inode.c |  137 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 88 insertions(+), 49 deletions(-)

Index: linux-2.6/fs/ext3/inode.c
===================================================================
--- linux-2.6.orig/fs/ext3/inode.c
+++ linux-2.6/fs/ext3/inode.c
@@ -1155,7 +1155,7 @@ static int do_journal_get_write_access(h
  * This content is expected to be set to zeroes by block_prepare_write().
  * 2006/10/14  SAW
  */
-static int ext3_prepare_failure(struct file *file, struct page *page,
+static int ext3_write_failure(struct file *file, struct page *page,
 				unsigned from, unsigned to)
 {
 	struct address_space *mapping;
@@ -1208,29 +1208,40 @@ skip:
 	return mapping->a_ops->commit_write(file, page, from, block_start);
 }
 
-static int ext3_prepare_write(struct file *file, struct page *page,
-			      unsigned from, unsigned to)
+static int ext3_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, int intr,
+				struct page **pagep, void **fsdata)
 {
-	struct inode *inode = page->mapping->host;
-	int ret, ret2;
+	struct inode *inode = mapping->host;
 	int needed_blocks = ext3_writepage_trans_blocks(inode);
+	int ret, ret2;
 	handle_t *handle;
 	int retries = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned start, end;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	start = pos * (PAGE_CACHE_SIZE - 1);
+	end = start + len;
+
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
 
 retry:
 	handle = ext3_journal_start(inode, needed_blocks);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
-	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
-		ret = nobh_prepare_write(page, from, to, ext3_get_block);
-	else
-		ret ...
From: Nick Piggin
Date: Wednesday, March 14, 2007 - 6:51 am

Just a note to anyone looking at these -- they don't apply to any tree,
and I'm posting them at this stage mainly to get feedback on the a_ops
APIs. Comments from anyone welcome.

-

Previous thread: Re: [patch 28/34] Xen-pv_ops: Xen SMP guest support by Benjamin LaHaise on Wednesday, March 14, 2007 - 6:28 am. (3 messages)

Next thread: kswapd & 2.4.21-47.0.0.1 by Konstantin Kalin on Wednesday, March 14, 2007 - 6:35 am. (4 messages)