Add an iterator data structure to operate over an iovec. Add usercopy
operators needed by generic_file_buffered_write, and convert that function
over.
include/linux/fs.h | 33 ++++++++++++
mm/filemap.c | 144 +++++++++++++++++++++++++++++++++++++++++++----------
mm/filemap.h | 103 -------------------------------------
3 files changed, 150 insertions(+), 130 deletions(-)
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -396,6 +396,39 @@ struct page;
struct address_space;
struct writeback_control;
+struct iov_iter {
+ const struct iovec *iov;
+ unsigned long nr_segs;
+ size_t iov_offset;
+ size_t count;
+};
+
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes);
+size_t iov_iter_copy_from_user(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes);
+void iov_iter_advance(struct iov_iter *i, size_t bytes);
+int iov_iter_fault_in_readable(struct iov_iter *i);
+size_t iov_iter_single_seg_count(struct iov_iter *i);
+
+static inline void iov_iter_init(struct iov_iter *i,
+ const struct iovec *iov, unsigned long nr_segs,
+ size_t count, size_t written)
+{
+ i->iov = iov;
+ i->nr_segs = nr_segs;
+ i->iov_offset = 0;
+ i->count = count + written;
+
+ iov_iter_advance(i, written);
+}
+
+static inline size_t iov_iter_count(struct iov_iter *i)
+{
+ return i->count;
+}
+
+
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -30,7 +30,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
-#include "filemap.h"
+#include ...Introduce write_begin, write_end, and perform_write aops.
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).
Documentation/filesystems/Locking | 9 +
Documentation/filesystems/vfs.txt | 38 ++++++
drivers/block/loop.c | 69 ++++-------
fs/buffer.c | 144 ++++++++++++++++++++----
fs/libfs.c | 40 ++++++
fs/namei.c | 47 ++-----
fs/splice.c | 106 +----------------
include/linux/buffer_head.h | 7 +
include/linux/fs.h | 29 ++++
include/linux/pagemap.h | 2
mm/filemap.c | 228 ++++++++++++++++++++++++++++++++++----
11 files changed, 494 insertions(+), 225 deletions(-)
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -449,6 +449,17 @@ struct address_space_operations {
*/
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+
+ int (*write_begin)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, int intr,
+ struct page **pagep, void **fsdata);
+ int (*write_end)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
+
+ int (*perform_write)(struct file *, struct address_space *,
+ struct iov_iter *, loff_t);
+
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
@@ -463,6 +474,18 @@ struct address_space_operations {
int (*launder_page) (struct page *);
};
+/*
+ * ...Hi Nick, Are we going to get rid of the file and intr arguments btw? I'm not sure intr is useful, and mapping is probably enough to get whatever we inside ->write_begin / ->write_end. Also, I noticed that you didn't export block_write_begin(), simple_write_begin(), block_write_end() and simple_write_end() - I think we want those for client modules. Attached is a quick patch to hook up the existing ocfs2 write code. This has been compile tested only for now - one of my test machines isn't cooperating, so a runtime test will have to wait until tommorrow. One interesting side effect is that we no longer pass AOP_TRUNCATE_PAGE up a level. This gives callers less to deal with. And it means that ocfs2 doesn't have to use the ocfs2_*_lock_with_page() cluster lock variants in ocfs2_block_write_begin() because it can order cluster locks outside of the page lock there. My ocfs2 write rework will be a more serious user of these stuff, including the fsdata backpointer. That code will also eliminate the entire ocfs2_*_lock_with_page() cluster locking workarounds for write (they'll have to remain for ->readpage()). I'm beginning work on cleaning those ocfs2 patches up and getting them plugged into this stuff. I hope to post them in the next day or two. --Mark -- Mark Fasheh Senior Software Developer, Oracle mark.fasheh@oracle.com ocfs2: Convert to new aops Turn ocfs2_prepare_write() and ocfs2_commit_write() into ocfs2_write_begin() and ocfs2_write_end(). Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 93628b0..e7bcbbd 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -293,29 +293,30 @@ int ocfs2_prepare_write_nolock(struct in } /* - * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called - * from loopback. It must be able to perform its own locking around - * ocfs2_get_block(). + * ocfs2_write_begin() can be an outer-most ocfs2 call when it is + * called from elsewhere in ...
Yeah, I was going to, but I had this version ready to go so decided to leave them in at the last minute. We can definitely take them out if people agree. However a side note about intr -- I wonder if it might be wise to include a flags argument, in case we might want to add something like that later? (definitely if we do keep intr, then it should be done as OK that's very cool. I was hoping that would be the case. If GFS2 can avoid that too, then we might be able to get rid of AOP_TRUNCATE_PAGE handling from the legacy prepare/commit_write paths, which will make OK, well I'll add this to my queue for now, and post the full patchset after incorporating feedback I've had so far, and doing more testing, so people can actually apply them and boot kernels. Thanks, Nick -
I don't see a problem with having a flags argument. It could give us some flexibility in the future which would otherwise require a much bigger Yeah - so long as we're not taking a page fault between write_begin / write_end, there's no reason for the cluster locks to be taken and dropped within the individual callbacks, which means we can just take them in write_begin (where page lock ordering is possible) and hold them until Great, thanks - it just occured to me that I should be holding the clusters locks across the entire copy (as I point out above), so I'll have a slightly updated version of this patch for you soon :) --Mark -- Mark Fasheh Senior Software Developer, Oracle mark.fasheh@oracle.com -
You're really going to need the file argument around. Some folks care about file->private_data, etc. A good example is nfs_updatepage() from nfs_commit_write(). There's a context on the filp. Mapping can get back to the inode via ->host, but not to the struct file. Joel -- Life's Little Instruction Book #157 "Take time to smell the roses." Joel Becker Principal Software Developer Oracle E-mail: joel.becker@oracle.com Phone: (650) 506-8127 -
OK, I'll keep the file around unless we see a better alternative. Thanks for pointing that out. -
Hi, Yes, I agree that with the new operations GFS2 should also no longer need AOP_TRUNCATE_PAGE for writes, Steve. -
Hell no! Struct file carries information that is essential for those of us that use strong authentication. It stays. Trond -
Joel pointed this out yesterday for the nfs case. Not to worry, we'll keep struct file around :) --Mark -- Mark Fasheh Senior Software Developer, Oracle mark.fasheh@oracle.com -
What about AOP_TRUNCATED_PAGE? Off corse we can't just "goto retry" here :) , if ->commit_write return non negative value we return with sill locked page look above at [1] <<<<OOPS why you new block wasn't fully zeroed? as it was done before. At least this result in information leak in case of (stat == from) just imagine fs with blocksize == 1k conains file with i_size == 4096 and fist two blocks not mapped (hole), now invoke write op from 1023 to 2048. For example we succeed in allocating first block, but faile while allocating second , then we call page_zero_new_buffers(...from == 1023, to == 2048) and then zerro only one last byte for first block, and set is uptodate <<<<<< BTW: do_lo_send_aops() code itself is realy ugly, for example patrial Ohhh and now i'm totaly shure what you just forgot unlocking stuff -
Yeah AOP_TRUNCATED_PAGE... I'm _hoping_ that OCFS2 and GFS2 will be able to avoid that using write_begin/write_end, so the caller will not have to know anything about it. I don't know that commit_write can even return AOP_TRUNCATED_PAGE... we Yeah, thanks. I think I converted all my filesystems to use write_begin / write_end, so I probably didn't test this path :P. I do plan to go through and try to individually test error cases and stress test it over the next When we first invoke the write op, it should see were doing a partial write into the first buffer and bring it uptodate first. I don't see the problem, but again I do need to go through and exercise various cases Well, the new code just works. The old code tried to support partial writes, but it ends up just zeroing out the rest of the unwritten part of the page, so it can actually overwrite real data. This is one of the problems with prepare_write/commit_write: error handling is fairly complex, and leaving it up to the callers is just insane because we can't even get it right in mm/ :P Anyway, thanks for having a look over it. You definitely spotted a bug with the page locking. Nick -
One more note. It's looks like you just disabled all fancy zero copy logic. Off corse this is just rfc patchset. But i think where is fundamental problem with it: Previous logic was following: 1)splice code responsible for: stealing(if possible) and loking the page 2)prepare_write() code responsible for: do fs speciffic stuff But with new write_begin() logic all steps (grubbing, locking, preparing) happened internaly inside write_begin() witch doesn't even know about what kind of data will be copied between write_begin/write_end. So fancy zero copy logic is impossible :( I think this can be solved somehow, but i dont know yet, how can this be done -
Check linux-mm: zero-copy splice is broken anyway, and AFAIKS it cannot Actually we could do it with begin_write. All we need to do is set a flag to say that *pagep contains a page that we can use, with a copy of the write data on it. The filesystem would then be able to insert that page into the pagecache *if* it could handle such an operation. OTOH, is splice stealing support really important? I guess it could be a nice win for a small niche of workloads, and we probably don't want to exclude it by design... -- SUSE Labs, Novell Inc. Send instant messages to your online friends http://au.messenger.yahoo.com -
Implement new aops for some of the simpler filesystems.
fs/configfs/inode.c | 4 ++--
fs/sysfs/inode.c | 4 ++--
mm/shmem.c | 16 ++++++++++------
3 files changed, 14 insertions(+), 10 deletions(-)
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -1419,10 +1419,14 @@ static const struct inode_operations shm
* lets a tmpfs file be used read-write below the loop driver.
*/
static int
-shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
-{
- struct inode *inode = page->mapping->host;
- return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
+shmem_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, int intr,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ *pagep = NULL;
+ return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}
static ssize_t
@@ -2319,8 +2323,8 @@ static const struct address_space_operat
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
- .prepare_write = shmem_prepare_write,
- .commit_write = simple_commit_write,
+ .write_begin = shmem_write_begin,
+ .write_end = simple_write_end,
#endif
.migratepage = migrate_page,
};
Index: linux-2.6/fs/configfs/inode.c
===================================================================
--- linux-2.6.orig/fs/configfs/inode.c
+++ linux-2.6/fs/configfs/inode.c
@@ -40,8 +40,8 @@ extern struct super_block * configfs_sb;
static const struct address_space_operations configfs_aops = {
.readpage = simple_readpage,
- .prepare_write = simple_prepare_write,
- .commit_write = simple_commit_write
+ .write_begin = simple_write_begin,
+ .write_end = simple_write_end,
};
static struct backing_dev_info configfs_backing_dev_info = {
Index: ...Implement new aops for ext2.
fs/ext2/inode.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
Index: linux-2.6/fs/ext2/inode.c
===================================================================
--- linux-2.6.orig/fs/ext2/inode.c
+++ linux-2.6/fs/ext2/inode.c
@@ -643,6 +643,16 @@ ext2_readpages(struct file *file, struct
}
static int
+ext2_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, int intr,
+ struct page **pagep, void **fsdata)
+{
+ *pagep = NULL;
+ return block_write_begin(file, mapping, pos, len, intr, pagep, fsdata,
+ ext2_get_block);
+}
+
+static int
ext2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
@@ -689,6 +699,8 @@ const struct address_space_operations ex
.readpages = ext2_readpages,
.writepage = ext2_writepage,
.sync_page = block_sync_page,
+ .write_begin = ext2_write_begin,
+ .write_end = block_write_end,
.prepare_write = ext2_prepare_write,
.commit_write = generic_commit_write,
.bmap = ext2_bmap,
-
Implement new aops for ext3. Probably has some bugs in interaction with
journalling, and corner cases aren't tested/thought out fully, but it
boots and runs. I don't see a fundamental reason why it can't work...
fs/ext3/inode.c | 137 +++++++++++++++++++++++++++++++++++---------------------
1 file changed, 88 insertions(+), 49 deletions(-)
Index: linux-2.6/fs/ext3/inode.c
===================================================================
--- linux-2.6.orig/fs/ext3/inode.c
+++ linux-2.6/fs/ext3/inode.c
@@ -1155,7 +1155,7 @@ static int do_journal_get_write_access(h
* This content is expected to be set to zeroes by block_prepare_write().
* 2006/10/14 SAW
*/
-static int ext3_prepare_failure(struct file *file, struct page *page,
+static int ext3_write_failure(struct file *file, struct page *page,
unsigned from, unsigned to)
{
struct address_space *mapping;
@@ -1208,29 +1208,40 @@ skip:
return mapping->a_ops->commit_write(file, page, from, block_start);
}
-static int ext3_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int ext3_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, int intr,
+ struct page **pagep, void **fsdata)
{
- struct inode *inode = page->mapping->host;
- int ret, ret2;
+ struct inode *inode = mapping->host;
int needed_blocks = ext3_writepage_trans_blocks(inode);
+ int ret, ret2;
handle_t *handle;
int retries = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned start, end;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ start = pos * (PAGE_CACHE_SIZE - 1);
+ end = start + len;
+
+ page = __grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
retry:
handle = ext3_journal_start(inode, needed_blocks);
if (IS_ERR(handle))
return PTR_ERR(handle);
- if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
- ret = nobh_prepare_write(page, from, to, ext3_get_block);
- else
- ret ...Just a note to anyone looking at these -- they don't apply to any tree, and I'm posting them at this stage mainly to get feedback on the a_ops APIs. Comments from anyone welcome. -
