[patch 1/3] add the fsblock layer

Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
From: Nick Piggin
Date: Saturday, June 23, 2007 - 6:46 pm

Rewrite the buffer layer.

---
 fs/Makefile                   |    2 
 fs/buffer.c                   |   31 
 fs/fs-writeback.c             |   13 
 fs/fsblock.c                  | 2511 ++++++++++++++++++++++++++++++++++++++++++
 fs/inode.c                    |   37 
 fs/splice.c                   |    3 
 include/linux/buffer_head.h   |    1 
 include/linux/fsblock.h       |  347 +++++
 include/linux/fsblock_types.h |   70 +
 include/linux/page-flags.h    |   15 
 init/main.c                   |    2 
 mm/filemap.c                  |    7 
 mm/page_alloc.c               |    3 
 mm/swap.c                     |    7 
 mm/truncate.c                 |   93 -
 mm/vmscan.c                   |    6 
 16 files changed, 3077 insertions(+), 71 deletions(-)

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -90,6 +90,8 @@
 #define PG_reclaim		17	/* To be reclaimed asap */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
+#define PG_blocks		20	/* Page has block mappings */
+
 /* PG_owner_priv_1 users should have descriptive aliases */
 #define PG_checked		PG_owner_priv_1 /* Used by some filesystems */
 
@@ -134,8 +136,17 @@ static inline void SetPageUptodate(struc
 	if (!test_and_set_bit(PG_uptodate, &page->flags))
 		page_clear_dirty(page);
 }
+static inline void TestSetPageUptodate(struct page *page)
+{
+	if (!test_and_set_bit(PG_uptodate, &page->flags)) {
+		page_clear_dirty(page);
+		return 0;
+	}
+	return 1;
+}
 #else
 #define SetPageUptodate(page)	set_bit(PG_uptodate, &(page)->flags)
+#define TestSetPageUptodate(page) test_and_set_bit(PG_uptodate, &(page)->flags)
 #endif
 #define ClearPageUptodate(page)	clear_bit(PG_uptodate, &(page)->flags)
 
@@ -217,6 +228,10 @@ static inline void SetPageUptodate(struc
 #define __SetPageBuddy(page)	__set_bit(PG_buddy, &(page)->flags)
 #define __ClearPageBuddy(page)	__clear_bit(PG_buddy, &(page)->flags)
 
+#define PageBlocks(page)	test_bit(PG_blocks, &(page)->flags)
+#define SetPageBlocks(page)	set_bit(PG_blocks, &(page)->flags)
+#define ClearPageBlocks(page)	clear_bit(PG_blocks, &(page)->flags)
+
 #define PageMappedToDisk(page)	test_bit(PG_mappedtodisk, &(page)->flags)
 #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
 #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
Index: linux-2.6/fs/Makefile
===================================================================
--- linux-2.6.orig/fs/Makefile
+++ linux-2.6/fs/Makefile
@@ -14,7 +14,7 @@ obj-y :=	open.o read_write.o file_table.
 		stack.o
 
 ifeq ($(CONFIG_BLOCK),y)
-obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+obj-y +=	fsblock.o buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
 else
 obj-y +=	no-block.o
 endif
Index: linux-2.6/fs/fsblock.c
===================================================================
--- /dev/null
+++ linux-2.6/fs/fsblock.c
@@ -0,0 +1,2511 @@
+/*
+ * fs/fsblock.c
+ *
+ * Copyright (C) 2007 Nick Piggin, SuSE Labs, Novell Inc.
+ */
+
+#include <linux/fsblock.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/bitops.h>
+#include <linux/pagevec.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/rcupdate.h> /* XXX: get rid of RCU */
+#include <linux/module.h>
+#include <linux/bit_spinlock.h> /* bit_spin_lock for subpage blocks */
+#include <linux/vmalloc.h> /* vmap for superpage blocks */
+#include <linux/gfp.h>
+//#include <linux/buffer_head.h> /* too much crap in me */
+extern int try_to_free_buffers(struct page *);
+
+/* XXX: add a page / block invariant checker function? */
+
+#include <asm/atomic.h>
+
+#define SECTOR_SHIFT	MIN_SECTOR_SHIFT
+#define NR_SUB_SIZES	(1 << (PAGE_CACHE_SHIFT - MIN_SECTOR_SHIFT))
+
+static struct kmem_cache *block_cache __read_mostly;
+static struct kmem_cache *mblock_cache __read_mostly;
+
+static void block_ctor(void *data, struct kmem_cache *cachep,
+			unsigned long flags)
+{
+	struct fsblock *block = data;
+	atomic_set(&block->count, 0);
+}
+
+void __init fsblock_init(void)
+{
+	block_cache = kmem_cache_create("fsblock-data",
+			sizeof(struct fsblock), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_DESTROY_BY_RCU,
+			block_ctor, NULL);
+
+	mblock_cache = kmem_cache_create("fsblock-metadata",
+			sizeof(struct fsblock_meta), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_DESTROY_BY_RCU,
+			block_ctor, NULL);
+}
+
+static void init_block(struct page *page, struct fsblock *block, unsigned int bits)
+{
+	block->flags = 0;
+	block->block_nr = -1;
+	block->page = page;
+	block->private = NULL;
+	FSB_BUG_ON(atomic_read(&block->count));
+	atomic_inc(&block->count);
+	__set_bit(BL_locked, &block->flags);
+	fsblock_set_bits(block, bits);
+#ifdef FSB_DEBUG
+	atomic_set(&block->vmap_count, 0);
+#endif
+}
+
+static void init_mblock(struct page *page, struct fsblock_meta *mblock, unsigned int bits)
+{
+	init_block(page, &mblock->block, bits);
+	__set_bit(BL_metadata, &mblock->block.flags);
+	INIT_LIST_HEAD(&mblock->assoc_list);
+	mblock->assoc_mapping = NULL;
+}
+
+static struct fsblock *alloc_blocks(struct page *page, unsigned int bits, gfp_t gfp_flags)
+{
+	struct fsblock *block;
+	int nid = page_to_nid(page);
+
+	if (bits >= PAGE_CACHE_SHIFT) { /* !subpage */
+		block = kmem_cache_alloc_node(block_cache, gfp_flags, nid);
+		if (likely(block))
+			init_block(page, block, bits);
+	} else {
+		int nr = PAGE_CACHE_SIZE >> bits;
+		/* XXX: could have a range of cache sizes */
+		block = kmalloc_node(sizeof(struct fsblock)*nr, gfp_flags, nid);
+		if (likely(block)) {
+			int i;
+			for (i = 0; i < nr; i++) {
+				struct fsblock *b = block + i;
+				atomic_set(&b->count, 0);
+				init_block(page, b, bits);
+			}
+		}
+	}
+	return block;
+}
+
+static struct fsblock_meta *alloc_mblocks(struct page *page, unsigned int bits, gfp_t gfp_flags)
+{
+	struct fsblock_meta *mblock;
+	int nid = page_to_nid(page);
+
+	if (bits >= PAGE_CACHE_SHIFT) { /* !subpage */
+		mblock = kmem_cache_alloc_node(mblock_cache, gfp_flags, nid);
+		if (likely(mblock))
+			init_mblock(page, mblock, bits);
+	} else {
+		int nr = PAGE_CACHE_SIZE >> bits;
+		mblock = kmalloc_node(sizeof(struct fsblock_meta)*nr, gfp_flags, nid);
+		if (likely(mblock)) {
+			int i;
+			for (i = 0; i < nr; i++) {
+				struct fsblock_meta *mb = mblock + i;
+				atomic_set(&mb->block.count, 0);
+				init_mblock(page, mb, bits);
+			}
+		}
+	}
+	return mblock;
+}
+
+#ifdef FSB_DEBUG
+static void assert_block(struct fsblock *block)
+{
+	struct page *page = block->page;
+
+	FSB_BUG_ON(!PageLocked(page));
+	FSB_BUG_ON(!PageBlocks(page));
+
+	if (fsblock_superpage(block)) {
+		struct page *p;
+
+		FSB_BUG_ON(page->index != first_page_idx(page->index,
+							fsblock_size(block)));
+
+		for_each_page(page, fsblock_size(block), p) {
+			FSB_BUG_ON(!PageBlocks(p));
+			FSB_BUG_ON(page_blocks(p) != block);
+		} end_for_each_page;
+	} else if (fsblock_subpage(block)) {
+		struct fsblock *b;
+		block = page_blocks(block->page);
+
+		for_each_block(block, b)
+			FSB_BUG_ON(b->page != page);
+	}
+}
+
+static void free_block_check(struct fsblock *block)
+{
+	unsigned long flags = block->flags;
+	unsigned long badflags =
+			(1 << BL_locked	|
+			 1 << BL_dirty	|
+			 /* 1 << BL_error	| */
+			 1 << BL_new	|
+			 1 << BL_writeback |
+			 1 << BL_readin	|
+			 1 << BL_sync_io);
+	unsigned long goodflags = 0;
+	unsigned int size = fsblock_size(block);
+	unsigned int count = atomic_read(&block->count);
+	unsigned int vmap_count = atomic_read(&block->vmap_count);
+	void *private = block->private;
+
+	if ((flags & badflags) || ((flags & goodflags) != goodflags) || count || private || vmap_count) {
+		printk("block flags = %lx\n", flags);
+		printk("block size  = %u\n", size);
+		printk("block count = %u\n", count);
+		printk("block private = %p\n", private);
+		printk("vmap count  = %u\n", vmap_count);
+		BUG();
+	}
+}
+#else
+static inline void assert_block(struct fsblock *block) {}
+#endif
+
+static void rcu_free_block(struct rcu_head *head)
+{
+	struct fsblock *block = container_of(head, struct fsblock, rcu_head);
+	kfree(block);
+}
+
+static void free_block(struct fsblock *block)
+{
+	if (fsblock_subpage(block)) {
+#ifdef FSB_DEBUG
+		unsigned int bits = fsblock_bits(block);
+		int i, nr = PAGE_CACHE_SIZE >> bits;
+
+		for (i = 0; i < nr; i++) {
+			struct fsblock *b;
+			if (test_bit(BL_metadata, &block->flags))
+				b = &(block_mblock(block) + i)->block;
+			else
+				b = block + i;
+			free_block_check(b);
+		}
+#endif
+
+		INIT_RCU_HEAD(&block->rcu_head);
+		call_rcu(&block->rcu_head, rcu_free_block);
+	} else {
+#ifdef VMAP_CACHE
+		if (test_bit(BL_vmapped, &block->flags)) {
+			vunmap(block->vaddr);
+			block->vaddr = NULL;
+			clear_bit(BL_vmapped, &block->flags);
+		}
+#endif
+#ifdef FSB_DEBUG
+		free_block_check(block);
+#endif
+		if (test_bit(BL_metadata, &block->flags))
+			kmem_cache_free(mblock_cache, block);
+		else
+			kmem_cache_free(block_cache, block);
+	}
+}
+
+int block_get_unless_zero(struct fsblock *block)
+{
+	return atomic_inc_not_zero(&block->count);
+}
+
+void block_get(struct fsblock *block)
+{
+	FSB_BUG_ON(atomic_read(&block->count) == 0);
+	atomic_inc(&block->count);
+}
+EXPORT_SYMBOL(block_get);
+
+static int fsblock_noblock = 1 __read_mostly; /* Like nobh mode */
+
+void block_put(struct fsblock *block)
+{
+	int free_it;
+	struct page *page;
+
+	page = block->page;
+	free_it = 0;
+	if (!page->mapping || fsblock_noblock) {
+		free_it = 1;
+		page_cache_get(page);
+	}
+
+#ifdef FSB_DEBUG
+	FSB_BUG_ON(atomic_read(&block->count) == 2 &&
+			atomic_read(&block->vmap_count));
+#endif
+	FSB_BUG_ON(atomic_read(&block->count) <= 1);
+
+	/* dec_return required for the release memory barrier */
+	if (atomic_dec_return(&block->count) == 1) {
+		if (free_it && !test_bit(BL_dirty, &block->flags)) {
+			/*
+			 * At this point we'd like to try stripping the block
+			 * if it is only existing in a self-referential
+			 * relationship with the pagecache (ie. the pagecache
+			 * is truncated as well).
+			 */
+			if (!TestSetPageLocked(page)) {
+				try_to_free_blocks(page);
+				unlock_page(page);
+			}
+		}
+	}
+	if (free_it)
+		page_cache_release(page);
+}
+EXPORT_SYMBOL(block_put);
+
+static int sleep_on_block(void *unused)
+{
+	io_schedule();
+	return 0;
+}
+
+void lock_block(struct fsblock *block)
+{
+	might_sleep();
+
+	if (!trylock_block(block))
+		wait_on_bit_lock(&block->flags, BL_locked, sleep_on_block,
+							TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(lock_block);
+
+void unlock_block(struct fsblock *block)
+{
+	FSB_BUG_ON(!test_bit(BL_locked, &block->flags));
+	smp_mb__before_clear_bit();
+	clear_bit(BL_locked, &block->flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&block->flags, BL_locked);
+}
+EXPORT_SYMBOL(unlock_block);
+
+void wait_on_block_locked(struct fsblock *block)
+{
+	might_sleep();
+
+	if (test_bit(BL_locked, &block->flags))
+		wait_on_bit(&block->flags, BL_locked, sleep_on_block,
+							TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_on_block_locked);
+
+static void set_block_sync_io(struct fsblock *block)
+{
+	FSB_BUG_ON(!PageLocked(block->page));
+	FSB_BUG_ON(test_bit(BL_sync_io, &block->flags));
+#ifdef FSB_DEBUG
+	if (fsblock_superpage(block)) {
+		struct page *page = block->page, *p;
+		for_each_page(page, fsblock_size(block), p) {
+			FSB_BUG_ON(!PageLocked(p));
+			FSB_BUG_ON(PageWriteback(p));
+		} end_for_each_page;
+	} else {
+		FSB_BUG_ON(!PageLocked(block->page));
+		FSB_BUG_ON(PageWriteback(block->page));
+	}
+#endif
+	set_bit(BL_sync_io, &block->flags);
+}
+
+static void end_block_sync_io(struct fsblock *block)
+{
+	FSB_BUG_ON(!PageLocked(block->page));
+	FSB_BUG_ON(!test_bit(BL_sync_io, &block->flags));
+	clear_bit(BL_sync_io, &block->flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&block->flags, BL_sync_io);
+}
+
+static void wait_on_block_sync_io(struct fsblock *block)
+{
+	might_sleep();
+
+	FSB_BUG_ON(!PageLocked(block->page));
+	if (test_bit(BL_sync_io, &block->flags))
+		wait_on_bit(&block->flags, BL_sync_io, sleep_on_block,
+							TASK_UNINTERRUPTIBLE);
+}
+
+static void iolock_block(struct fsblock *block)
+{
+	struct page *page, *p;
+	might_sleep();
+
+	page = block->page;
+	if (!fsblock_superpage(block))
+		lock_page(page);
+	else {
+		for_each_page(page, fsblock_size(block), p) {
+			lock_page(p);
+		} end_for_each_page;
+	}
+}
+
+static void iounlock_block(struct fsblock *block)
+{
+	struct page *page, *p;
+
+	page = block->page;
+	if (!fsblock_superpage(block))
+		unlock_page(page);
+	else {
+		for_each_page(page, fsblock_size(block), p) {
+			unlock_page(p);
+		} end_for_each_page;
+	}
+}
+
+static void wait_on_block_iolock(struct fsblock *block)
+{
+	struct page *page, *p;
+	might_sleep();
+
+	page = block->page;
+	if (!fsblock_superpage(block))
+		wait_on_page_locked(page);
+	else {
+		for_each_page(page, fsblock_size(block), p) {
+			wait_on_page_locked(p);
+		} end_for_each_page;
+	}
+}
+
+static void set_block_writeback(struct fsblock *block)
+{
+	struct page *page, *p;
+	might_sleep();
+
+	page = block->page;
+	if (!fsblock_superpage(block)) {
+		set_page_writeback(page);
+		unlock_page(page);
+	} else {
+		for_each_page(page, fsblock_size(block), p) {
+			set_page_writeback(p);
+			unlock_page(p);
+		} end_for_each_page;
+	}
+}
+
+static void end_block_writeback(struct fsblock *block)
+{
+	struct page *page, *p;
+
+	page = block->page;
+	if (!fsblock_superpage(block))
+		end_page_writeback(page);
+	else {
+		for_each_page(page, fsblock_size(block), p) {
+			end_page_writeback(p);
+		} end_for_each_page;
+	}
+}
+
+static void wait_on_block_writeback(struct fsblock *block)
+{
+	struct page *page, *p;
+	might_sleep();
+
+	page = block->page;
+	if (!fsblock_superpage(block))
+		wait_on_page_writeback(page);
+	else {
+		for_each_page(page, fsblock_size(block), p) {
+			wait_on_page_writeback(p);
+		} end_for_each_page;
+	}
+}
+
+static struct block_device *mapping_data_bdev(struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	if (unlikely(S_ISBLK(inode->i_mode)))
+		return inode->i_bdev;
+	else
+		return inode->i_sb->s_bdev;
+}
+
+static struct fsblock *find_get_page_block(struct page *page)
+{
+	struct fsblock *block;
+
+	rcu_read_lock();
+again:
+	block = page_blocks_rcu(page);
+	if (block) {
+		/*
+		 * Might be better off implementing this as a bit spinlock
+		 * rather than count (which requires tricks with ordering
+		 * eg. release vs set page dirty).
+		 */
+		if (block_get_unless_zero(block)) {
+			if ((page_blocks_rcu(page) != block)) {
+				block_put(block);
+				block = NULL;
+			}
+		} else {
+			cpu_relax();
+			goto again;
+		}
+	}
+	rcu_read_unlock();
+
+	return block;
+}
+
+static int __set_page_dirty_noblocks(struct page *page)
+{
+	FSB_BUG_ON(!PageBlocks(page));
+	FSB_BUG_ON(!fsblock_subpage(page_blocks(page)) && !PageUptodate(page));
+
+	return __set_page_dirty_nobuffers(page);
+}
+
+int fsblock_set_page_dirty(struct page *page)
+{
+	struct fsblock *block;
+	int ret = 0;
+
+	FSB_BUG_ON(!PageUptodate(page));
+	FSB_BUG_ON(!PageBlocks(page));
+//	FSB_BUG_ON(!PageLocked(page)); /* XXX: this can go away when we pin a page's metadata */
+
+	block = page_blocks(page);
+	if (fsblock_subpage(block)) {
+		struct fsblock *b;
+
+		for_each_block(block, b) {
+			FSB_BUG_ON(!test_bit(BL_uptodate, &b->flags));
+			if (!test_bit(BL_dirty, &b->flags)) {
+				set_bit(BL_dirty, &b->flags);
+				ret = 1;
+			}
+		}
+	} else {
+		FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags));
+		if (!test_bit(BL_dirty, &block->flags)) {
+			set_bit(BL_dirty, &block->flags);
+			ret = 1;
+		}
+	}
+	/*
+	 * XXX: this is slightly racy because the above blocks could be
+	 * cleaned in a writeback that's underway, while the page will
+	 * still get marked dirty below. This technically breaks some
+	 * invariants that we check for (that a dirty page must have at
+	 * least 1 dirty buffer). Eventually we could just relax those
+	 * invariants, but keep them in for now to catch bugs.
+	 */
+	return __set_page_dirty_noblocks(page);
+}
+EXPORT_SYMBOL(fsblock_set_page_dirty);
+
+/*
+ * Do we need a fast atomic version for just page sized / aligned maps?
+ */
+void *vmap_block(struct fsblock *block, off_t off, size_t len)
+{
+	struct address_space *mapping = block->page->mapping;
+	unsigned int size = fsblock_size(block);
+
+	FSB_BUG_ON(off < 0);
+	FSB_BUG_ON(off + len > size);
+
+	if (!fsblock_superpage(block)) {
+		unsigned int page_offset = 0;
+		if (fsblock_subpage(block))
+			page_offset = block_page_offset(block, size);
+#ifdef FSB_DEBUG
+		atomic_inc(&block->vmap_count);
+#endif
+		return kmap(block->page) + page_offset + off;
+	} else {
+		pgoff_t pgoff, start, end;
+		unsigned long pos;
+
+#ifdef VMAP_CACHE
+		if (test_bit(BL_vmapped, &block->flags)) {
+			while (test_bit(BL_vmap_lock, &block->flags))
+				cpu_relax();
+			smp_rmb();
+#ifdef FSB_DEBUG
+			atomic_inc(&block->vmap_count);
+#endif
+			return block->vaddr + off;
+		}
+#endif
+
+		pgoff = block->page->index;
+		FSB_BUG_ON(test_bit(BL_metadata, &block->flags) &&
+			pgoff != block->block_nr * (size >> PAGE_CACHE_SHIFT));
+		start = pgoff + (off >> PAGE_CACHE_SHIFT);
+		end = pgoff + ((off + len - 1) >> PAGE_CACHE_SHIFT);
+		pos = off & ~PAGE_CACHE_MASK;
+
+#ifndef VMAP_CACHE
+		if (start == end) {
+			struct page *page;
+
+			page = find_page(mapping, start);
+			FSB_BUG_ON(!page);
+
+#ifdef FSB_DEBUG
+			atomic_inc(&block->vmap_count);
+#endif
+			return kmap(page) + pos;
+		} else
+#endif
+		{
+			int nr;
+			struct page **pages;
+			void *addr;
+#ifndef VMAP_CACHE
+			nr = end - start + 1;
+#else
+			nr = size >> PAGE_CACHE_SHIFT;
+#endif
+			pages = kmalloc(nr * sizeof(struct page *), GFP_NOFS);
+			if (!pages)
+				return ERR_PTR(-ENOMEM);
+#ifndef VMAP_CACHE
+			find_pages(mapping, start, nr, pages);
+#else
+			find_pages(mapping, pgoff, nr, pages);
+#endif
+
+			addr = vmap(pages, nr, VM_MAP, PAGE_KERNEL);
+			kfree(pages);
+			if (!addr)
+				return ERR_PTR(-ENOMEM);
+
+#ifdef FSB_DEBUG
+			atomic_inc(&block->vmap_count);
+#endif
+#ifndef VMAP_CACHE
+			return addr + pos;
+#else
+			bit_spin_lock(BL_vmap_lock, &block->flags);
+			if (!test_bit(BL_vmapped, &block->flags)) {
+				block->vaddr = addr;
+				set_bit(BL_vmapped, &block->flags);
+			}
+			bit_spin_unlock(BL_vmap_lock, &block->flags);
+			if (block->vaddr != addr)
+				vunmap(addr);
+			return block->vaddr + off;
+#endif
+		}
+	}
+}
+EXPORT_SYMBOL(vmap_block);
+
+void vunmap_block(struct fsblock *block, off_t off, size_t len, void *vaddr)
+{
+#ifdef FSB_DEBUG
+	FSB_BUG_ON(atomic_read(&block->vmap_count) <= 0);
+	atomic_dec(&block->vmap_count);
+#endif
+	if (!fsblock_superpage(block))
+		kunmap(block->page);
+#ifndef VMAP_CACHE
+	else {
+		unsigned int size = fsblock_size(block);
+		pgoff_t pgoff, start, end;
+
+		pgoff = block->block_nr * (size >> PAGE_CACHE_SHIFT);
+		FSB_BUG_ON(pgoff != block->page->index);
+		start = pgoff + (off >> PAGE_CACHE_SHIFT);
+		end = pgoff + ((off + len - 1) >> PAGE_CACHE_SHIFT);
+
+		if (start == end) {
+			struct address_space *mapping = block->page->mapping;
+			struct page *page;
+
+			page = find_page(mapping, start);
+			FSB_BUG_ON(!page);
+
+			kunmap(page);
+		} else {
+			unsigned long pos;
+
+			pos = off & ~PAGE_CACHE_MASK;
+			vunmap(vaddr - pos);
+		}
+	}
+#endif
+}
+EXPORT_SYMBOL(vunmap_block);
+
+static struct fsblock *__find_get_block(struct address_space *mapping, sector_t blocknr)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	pgoff_t pgoff;
+
+	pgoff = sector_pgoff(blocknr, inode->i_blkbits);
+
+	page = find_get_page(mapping, pgoff);
+	if (page) {
+		struct fsblock *block;
+		block = find_get_page_block(page);
+		if (block) {
+			if (fsblock_subpage(block)) {
+				struct fsblock *b;
+				for_each_block(block, b) {
+					if (b->block_nr == blocknr) {
+						block_get(b);
+						block_put(block);
+						block = b;
+						goto found;
+					}
+				}
+				FSB_BUG();
+			} else
+				FSB_BUG_ON(block->block_nr != blocknr);
+found:
+			FSB_BUG_ON(!test_bit(BL_mapped, &block->flags));
+		}
+
+		page_cache_release(page);
+		return block;
+	}
+	return NULL;
+}
+
+struct fsblock_meta *find_get_mblock(struct block_device *bdev, sector_t blocknr, unsigned int size)
+{
+	struct fsblock *block;
+
+	block = __find_get_block(bdev->bd_inode->i_mapping, blocknr);
+	if (block) {
+		if (test_bit(BL_metadata, &block->flags)) {
+			/*
+			 * XXX: need a better way than 'size' to tag and
+			 * identify metadata fsblocks?
+			 */
+			if (fsblock_size(block) == size)
+				return block_mblock(block);
+		}
+
+		block_put(block);
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(find_get_mblock);
+
+static void attach_block_page(struct page *page, struct fsblock *block)
+{
+	if (PageUptodate(page))
+		set_bit(BL_uptodate, &block->flags);
+	unlock_block(block); /* XXX: need this? */
+}
+
+/* This goes away when we get rid of buffer.c */
+static int invalidate_aliasing_buffers(struct page *page, unsigned int size)
+{
+	if (!size_is_superpage(size)) {
+		if (PagePrivate(page))
+			return try_to_free_buffers(page);
+	} else {
+		struct page *p;
+
+		for_each_page(page, size, p) {
+			FSB_BUG_ON(!PageLocked(p));
+			FSB_BUG_ON(PageBlocks(p));
+
+			if (PagePrivate(p)) {
+				if (!try_to_free_buffers(p))
+					return 0;
+			}
+		} end_for_each_page;
+	}
+	return 1;
+}
+
+static int __try_to_free_blocks(struct page *page, int all_locked);
+static int invalidate_aliasing_blocks(struct page *page, unsigned int size)
+{
+	if (!size_is_superpage(size)) {
+		if (PageBlocks(page)) {
+			/* could check for compatible blocks here, but meh */
+			return __try_to_free_blocks(page, 1);
+		}
+	} else {
+		struct page *p;
+
+		for_each_page(page, size, p) {
+			FSB_BUG_ON(!PageLocked(p));
+			FSB_BUG_ON(PageBlocks(p));
+
+			if (PageBlocks(p)) {
+				if (!__try_to_free_blocks(p, 1))
+					return 0;
+			}
+		} end_for_each_page;
+	}
+	return 1;
+}
+
+#define CREATE_METADATA	0x01
+#define CREATE_DIRTY	0x02
+static int create_unmapped_blocks(struct page *page, gfp_t gfp_flags, unsigned int size, unsigned int flags)
+{
+	unsigned int bits = ffs(size) - 1;
+	struct fsblock *block;
+
+	FSB_BUG_ON(!PageLocked(page));
+	FSB_BUG_ON(PageDirty(page));
+	FSB_BUG_ON(PageWriteback(page));
+	FSB_BUG_ON(PageBlocks(page));
+	FSB_BUG_ON(flags & CREATE_DIRTY);
+
+	if (!invalidate_aliasing_buffers(page, size))
+		return -EBUSY;
+
+	/*
+	 * XXX: maybe use private alloc funcions so fses can embed block into
+	 * their fs-private block rather than using ->private? Maybe ->private
+	 * is easier though...
+	 */
+	if (!(flags & CREATE_METADATA)) {
+		block = alloc_blocks(page, bits, gfp_flags);
+		if (!block)
+			return -ENOMEM;
+	} else {
+		struct fsblock_meta *mblock;
+		mblock = alloc_mblocks(page, bits, gfp_flags);
+		if (!mblock)
+			return -ENOMEM;
+		block = mblock_block(mblock);
+	}
+
+	if (!fsblock_superpage(block)) {
+		attach_page_blocks(page, block);
+		/*
+		 * Ensure ordering between setting page->block ptr and reading
+		 * PageDirty, thus giving synchronisation between this and
+		 * fsblock_set_page_dirty()
+		 */
+		smp_mb();
+		if (fsblock_subpage(block)) {
+			struct fsblock *b;
+			for_each_block(block, b)
+				attach_block_page(page, b);
+		} else
+			attach_block_page(page, block);
+	} else {
+		struct page *p;
+		int uptodate = 1;
+		FSB_BUG_ON(page->index != first_page_idx(page->index, size));
+
+		for_each_page(page, size, p) {
+			FSB_BUG_ON(!PageLocked(p));
+			FSB_BUG_ON(PageDirty(p));
+			FSB_BUG_ON(PageWriteback(p));
+			FSB_BUG_ON(PageBlocks(p));
+			attach_page_blocks(p, block);
+		} end_for_each_page;
+		smp_mb();
+		for_each_page(page, size, p) {
+			if (!PageUptodate(p))
+				uptodate = 0;
+		} end_for_each_page;
+		if (uptodate)
+			set_bit(BL_uptodate, &block->flags);
+		unlock_block(block);
+	}
+
+	assert_block(block);
+
+	return 0;
+}
+
+static struct page *create_lock_page_range(struct address_space *mapping,
+					pgoff_t pgoff, unsigned int size)
+{
+	struct page *page;
+	gfp_t gfp;
+
+	gfp = mapping_gfp_mask(mapping) & ~__GFP_FS;
+	page = find_or_create_page(mapping, pgoff, gfp);
+	if (!page)
+		return NULL;
+
+	FSB_BUG_ON(!page->mapping);
+	page_cache_release(page);
+
+	if (size_is_superpage(size)) {
+		int i, nr = size >> PAGE_CACHE_SHIFT;
+
+		FSB_BUG_ON(pgoff != first_page_idx(pgoff, size));
+
+		for (i = 1; i < nr; i++) {
+			struct page *p;
+
+			p = find_or_create_page(mapping, pgoff + i, gfp);
+			if (!p) {
+				nr = i;
+				for (i = 0; i < nr; i++) {
+					p = find_page(mapping, pgoff + i);
+					FSB_BUG_ON(!p);
+					unlock_page(p);
+				}
+				return NULL;
+			}
+			FSB_BUG_ON(!p->mapping);
+			page_cache_release(p);
+			/*
+			 * don't want a ref hanging around (see end io handlers
+			 * for pagecache). Page lock pins the pcache ref.
+			 * XXX: this is a little unclean.
+			 */
+		}
+	}
+	FSB_BUG_ON(page->index != pgoff);
+	return page;
+}
+
+static void unlock_page_range(struct page *page, unsigned int size)
+{
+	if (!size_is_superpage(size))
+		unlock_page(page);
+	else {
+		struct page *p;
+
+		FSB_BUG_ON(page->index != first_page_idx(page->index, size));
+		for_each_page(page, size, p) {
+			FSB_BUG_ON(!p);
+			unlock_page(p);
+		} end_for_each_page;
+	}
+}
+
+struct fsblock_meta *find_or_create_mblock(struct block_device *bdev, sector_t blocknr, unsigned int size)
+{
+	struct inode *bd_inode = bdev->bd_inode;
+	struct address_space *bd_mapping = bd_inode->i_mapping;
+	struct page *page;
+	struct fsblock_meta *mblock;
+	pgoff_t pgoff;
+	int ret;
+
+	pgoff = sector_pgoff(blocknr, bd_inode->i_blkbits);
+
+	mblock = find_get_mblock(bdev, blocknr, size);
+	if (mblock)
+		return mblock;
+
+	page = create_lock_page_range(bd_mapping, pgoff, size);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	if (!invalidate_aliasing_blocks(page, size)) {
+		mblock = ERR_PTR(-EBUSY);
+		goto failed;
+	}
+	ret = create_unmapped_blocks(page, GFP_NOFS, size, CREATE_METADATA);
+	if (ret) {
+		mblock = ERR_PTR(ret);
+		goto failed;
+	}
+
+	mblock = page_mblocks(page);
+	/*
+	 * XXX: technically this is just the block_dev.c direct
+	 * mapping. So maybe logically in that file? (OTOH it *is*
+	 * "metadata")
+	 */
+	if (fsblock_subpage(&mblock->block)) {
+		struct fsblock_meta *ret = NULL, *mb;
+		sector_t base_block;
+		base_block = pgoff << (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+		__for_each_mblock(mblock, size, mb) {
+			mb->block.block_nr = base_block;
+			set_bit(BL_mapped, &mb->block.flags);
+			if (mb->block.block_nr == blocknr) {
+				FSB_BUG_ON(ret);
+				ret = mb;
+			}
+			base_block++;
+		}
+		FSB_BUG_ON(!ret);
+		mblock = ret;
+	} else {
+		mblock->block.block_nr = blocknr;
+		set_bit(BL_mapped, &mblock->block.flags);
+	}
+	mblock_get(mblock);
+failed:
+	unlock_page_range(page, size);
+	return mblock;
+}
+EXPORT_SYMBOL(find_or_create_mblock);
+
+static void block_end_read(struct fsblock *block, int uptodate)
+{
+	int sync_io;
+	int finished_readin = 1;
+	struct page *page = block->page;
+
+	FSB_BUG_ON(test_bit(BL_uptodate, &block->flags));
+	FSB_BUG_ON(test_bit(BL_error, &block->flags));
+
+	sync_io = test_bit(BL_sync_io, &block->flags);
+
+	if (unlikely(!uptodate)) {
+		set_bit(BL_error, &block->flags);
+		if (!fsblock_superpage(block))
+			SetPageError(page);
+		else {
+			struct page *p;
+			for_each_page(page, fsblock_size(block), p) {
+				SetPageError(p);
+			} end_for_each_page;
+		}
+	} else
+		set_bit(BL_uptodate, &block->flags);
+
+	if (fsblock_subpage(block)) {
+		unsigned long flags;
+		struct fsblock *b, *first = page_blocks(block->page);
+
+		local_irq_save(flags);
+		bit_spin_lock(BL_rd_lock, &first->flags);
+		clear_bit(BL_readin, &block->flags);
+		for_each_block(page_blocks(page), b) {
+			if (test_bit(BL_readin, &b->flags)) {
+				finished_readin = 0;
+				break;
+			}
+			if (!test_bit(BL_uptodate, &b->flags))
+				uptodate = 0;
+		}
+		bit_spin_unlock(BL_rd_lock, &first->flags);
+		local_irq_restore(flags);
+	} else
+		clear_bit(BL_readin, &block->flags);
+
+	if (sync_io)
+		finished_readin = 0; /* don't unlock */
+	if (!fsblock_superpage(block)) {
+		FSB_BUG_ON(PageWriteback(page));
+		if (uptodate)
+			SetPageUptodate(page);
+		if (finished_readin)
+			unlock_page(page);
+		/*
+		 * XXX: don't know whether or not to keep the page
+		 * refcount elevated or simply rely on the page lock...
+		 */
+	} else {
+		struct page *p;
+
+		for_each_page(page, fsblock_size(block), p) {
+			FSB_BUG_ON(PageDirty(p));
+			FSB_BUG_ON(PageWriteback(p));
+			if (uptodate)
+				SetPageUptodate(p);
+			if (finished_readin)
+				unlock_page(p);
+		} end_for_each_page;
+	}
+
+	if (sync_io)
+		end_block_sync_io(block);
+
+	block_put(block);
+}
+
+static void block_end_write(struct fsblock *block, int uptodate)
+{
+	int sync_io;
+	int finished_writeback = 1;
+	struct page *page = block->page;
+
+	FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags));
+	FSB_BUG_ON(test_bit(BL_error, &block->flags));
+
+	sync_io = test_bit(BL_sync_io, &block->flags);
+
+	if (unlikely(!uptodate)) {
+		set_bit(BL_error, &block->flags);
+		if (!fsblock_superpage(block))
+			SetPageError(page);
+		else {
+			struct page *p;
+			for_each_page(page, fsblock_size(block), p) {
+				SetPageError(p);
+			} end_for_each_page;
+		}
+		set_bit(AS_EIO, &page->mapping->flags);
+	}
+
+	if (fsblock_subpage(block)) {
+		unsigned long flags;
+		struct fsblock *b, *first = page_blocks(block->page);
+
+		local_irq_save(flags);
+		bit_spin_lock(BL_wb_lock, &first->flags);
+		clear_bit(BL_writeback, &block->flags);
+		for_each_block(first, b) {
+			if (test_bit(BL_writeback, &b->flags)) {
+				finished_writeback = 0;
+				break;
+			}
+		}
+		bit_spin_unlock(BL_wb_lock, &first->flags);
+		local_irq_restore(flags);
+	} else
+		clear_bit(BL_writeback, &block->flags);
+
+	if (!sync_io) {
+		if (finished_writeback) {
+			if (!fsblock_superpage(block)) {
+				end_page_writeback(page);
+			} else {
+				struct page *p;
+				for_each_page(page, fsblock_size(block), p) {
+					FSB_BUG_ON(!p->mapping);
+					end_page_writeback(p);
+				} end_for_each_page;
+			}
+		}
+	} else
+		end_block_sync_io(block);
+
+	block_put(block);
+}
+
+int fsblock_strip = 1;
+
+static int block_end_bio_io(struct bio *bio, unsigned int bytes_done, int err)
+{
+	struct fsblock *block = bio->bi_private;
+	int uptodate;
+
+	if (bio->bi_size)
+		return 1;
+
+	uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	if (err == -EOPNOTSUPP) {
+		printk(KERN_WARNING "block_end_bio_io: op not supported!\n");
+		WARN_ON(uptodate);
+	}
+
+	FSB_BUG_ON(!(test_bit(BL_readin, &block->flags) ^
+		test_bit(BL_writeback, &block->flags)));
+
+	if (test_bit(BL_readin, &block->flags))
+		block_end_read(block, uptodate);
+	else
+		block_end_write(block, uptodate);
+
+	bio_put(bio);
+
+	return 0;
+}
+
+static int submit_block(struct fsblock *block, int rw)
+{
+	struct page *page = block->page;
+	struct address_space *mapping = page->mapping;
+	struct bio *bio;
+	int ret = 0;
+	unsigned int bits = fsblock_bits(block);
+	unsigned int size = 1 << bits;
+	int nr = (size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+
+#if 0
+	printk("submit_block for %s [blocknr=%lu, sector=%lu, size=%u]\n",
+		(test_bit(BL_readin, &block->flags) ? "read" : "write"),
+		(unsigned long)block->block_nr,
+		(unsigned long)block->block_nr * (size >> SECTOR_SHIFT), size);
+#endif
+
+	FSB_BUG_ON(!PageLocked(page) && !PageWriteback(page));
+	FSB_BUG_ON(!mapping);
+	FSB_BUG_ON(!test_bit(BL_mapped, &block->flags));
+
+	clear_bit(BL_error, &block->flags);
+
+	bio = bio_alloc(GFP_NOIO, nr);
+	bio->bi_sector = block->block_nr << (bits - SECTOR_SHIFT);
+	bio->bi_bdev = mapping_data_bdev(mapping);
+	bio->bi_end_io = block_end_bio_io;
+	bio->bi_private = block;
+
+	if (!fsblock_superpage(block)) {
+		unsigned int offset = 0;
+
+		if (fsblock_subpage(block))
+			offset = block_page_offset(block, size);
+		if (bio_add_page(bio, page, size, offset) != size)
+			FSB_BUG();
+	} else {
+		struct page *p;
+		int i;
+
+		i = 0;
+		for_each_page(page, size, p) {
+			if (bio_add_page(bio, p, PAGE_CACHE_SIZE, 0) != PAGE_CACHE_SIZE)
+				FSB_BUG();
+			i++;
+		} end_for_each_page;
+		FSB_BUG_ON(i != nr);
+	}
+
+	block_get(block);
+	bio_get(bio);
+	submit_bio(rw, bio);
+
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+
+	bio_put(bio);
+	return ret;
+}
+
+static int read_block(struct fsblock *block)
+{
+	FSB_BUG_ON(PageWriteback(block->page));
+	FSB_BUG_ON(test_bit(BL_readin, &block->flags));
+	FSB_BUG_ON(test_bit(BL_writeback, &block->flags));
+	FSB_BUG_ON(test_bit(BL_dirty, &block->flags));
+	set_bit(BL_readin, &block->flags);
+	return submit_block(block, READ);
+}
+
+static int write_block(struct fsblock *block)
+{
+	FSB_BUG_ON(!PageWriteback(block->page));
+	FSB_BUG_ON(test_bit(BL_readin, &block->flags));
+	FSB_BUG_ON(test_bit(BL_writeback, &block->flags));
+	FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags));
+	set_bit(BL_writeback, &block->flags);
+	return submit_block(block, WRITE);
+}
+
+int sync_block(struct fsblock *block)
+{
+	int ret = 0;
+
+	if (test_bit(BL_dirty, &block->flags)) {
+		struct page *page = block->page;
+
+		iolock_block(block);
+		wait_on_block_writeback(block);
+		FSB_BUG_ON(PageWriteback(page)); /* because block is locked */
+		if (test_bit(BL_dirty, &block->flags)) {
+			FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags));
+			clear_bit(BL_dirty, &block->flags);
+
+			if (fsblock_subpage(block)) {
+				struct fsblock *b;
+				for_each_block(page_blocks(page), b) {
+					if (test_bit(BL_dirty, &b->flags))
+						goto page_dirty;
+				}
+			}
+			if (!fsblock_superpage(block)) {
+				ret = clear_page_dirty_for_io(page);
+				FSB_BUG_ON(!ret);
+			} else {
+				struct page *p;
+				for_each_page(page, fsblock_size(block), p) {
+					clear_page_dirty_for_io(p);
+				} end_for_each_page;
+			}
+page_dirty:
+			set_block_writeback(block);
+
+			ret = write_block(block);
+			if (!ret) {
+				wait_on_block_writeback(block);
+				if (test_bit(BL_error, &block->flags))
+					ret = -EIO;
+			}
+		} else
+			iounlock_block(block);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(sync_block);
+
+void mark_mblock_uptodate(struct fsblock_meta *mblock)
+{
+	struct fsblock *block = mblock_block(mblock);
+	struct page *page = block->page;
+
+	if (fsblock_superpage(block)) {
+		struct page *p;
+		for_each_page(page, fsblock_size(block), p) {
+			SetPageUptodate(p);
+		} end_for_each_page;
+	} else if (fsblock_midpage(block)) {
+		SetPageUptodate(page);
+	} /* XXX: could check for all subblocks uptodate */
+	set_bit(BL_uptodate, &block->flags);
+}
+
+int mark_mblock_dirty(struct fsblock_meta *mblock)
+{
+	struct page *page;
+	FSB_BUG_ON(!fsblock_superpage(&mblock->block) &&
+		!test_bit(BL_uptodate, &mblock->block.flags));
+
+	if (test_and_set_bit(BL_dirty, &mblock->block.flags))
+		return 0;
+
+	page = mblock_block(mblock)->page;
+	if (!fsblock_superpage(mblock_block(mblock))) {
+		__set_page_dirty_noblocks(page);
+	} else {
+		struct page *p;
+		for_each_page(page, fsblock_size(mblock_block(mblock)), p) {
+			__set_page_dirty_noblocks(p);
+		} end_for_each_page;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(mark_mblock_dirty);
+
+/*
+ * XXX: this is good, but is complex and inhibits block reclaim for now.
+ * Reworking so that it gets removed if the block is cleaned might be a
+ * good option? (would require a block flag)
+ */
+struct mb_assoc {
+	struct list_head mlist;
+	struct address_space *mapping;
+
+	struct list_head blist;
+	struct fsblock_meta *mblock;
+};
+
+int mark_mblock_dirty_inode(struct fsblock_meta *mblock, struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct fsblock *block = mblock_block(mblock);
+	struct mb_assoc *mba;
+	int ret;
+
+	ret = mark_mblock_dirty(mblock);
+
+	bit_spin_lock(BL_assoc_lock, &block->flags);
+	if (block->private) {
+		mba = (struct mb_assoc *)block->private;
+		do {
+			FSB_BUG_ON(mba->mblock != mblock);
+			if (mba->mapping == inode->i_mapping)
+				goto out;
+			mba = list_entry(mba->blist.next,struct mb_assoc,blist);
+		} while (mba != block->private);
+	}
+	mba = kmalloc(sizeof(struct mb_assoc), GFP_ATOMIC);
+	if (unlikely(!mba)) {
+		bit_spin_unlock(BL_assoc_lock, &block->flags);
+		sync_block(block);
+		return ret;
+	}
+	INIT_LIST_HEAD(&mba->mlist);
+	mba->mapping = mapping;
+	INIT_LIST_HEAD(&mba->blist);
+	mba->mblock = mblock;
+	if (block->private)
+		list_add(&mba->blist, ((struct mb_assoc *)block->private)->blist.prev);
+	block->private = mba;
+	spin_lock(&mapping->private_lock);
+	list_add_tail(&mba->mlist, &mapping->private_list);
+	spin_unlock(&mapping->private_lock);
+
+out:
+	bit_spin_unlock(BL_assoc_lock, &block->flags);
+	return ret;
+}
+EXPORT_SYMBOL(mark_mblock_dirty_inode);
+
+int fsblock_sync(struct address_space *mapping)
+{
+	int err, ret;
+	LIST_HEAD(list);
+	struct mb_assoc *mba, *tmp;
+
+	spin_lock(&mapping->private_lock);
+	list_splice_init(&mapping->private_list, &list);
+	spin_unlock(&mapping->private_lock);
+
+	err = 0;
+	list_for_each_entry_safe(mba, tmp, &list, mlist) {
+		struct fsblock *block = mblock_block(mba->mblock);
+
+		FSB_BUG_ON(mba->mapping != mapping);
+
+		bit_spin_lock(BL_assoc_lock, &block->flags);
+		if (list_empty(&mba->blist))
+			block->private = NULL;
+		else {
+			if (block->private == mba)
+				block->private = list_entry(mba->blist.next,struct mb_assoc,blist);
+			list_del(&mba->blist);
+		}
+		bit_spin_unlock(BL_assoc_lock, &block->flags);
+
+		iolock_block(block);
+		wait_on_block_writeback(block);
+		if (test_bit(BL_dirty, &block->flags)) {
+			FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags));
+			clear_bit(BL_dirty, &block->flags);
+			ret = write_block(block);
+			if (ret && !err)
+				err = ret;
+		} else
+			iounlock_block(block);
+	}
+
+	while (!list_empty(&list)) {
+		struct fsblock *block;
+
+		/* Go in reverse order to reduce context switching */
+		mba = list_entry(list.prev, struct mb_assoc, mlist);
+		list_del(&mba->mlist);
+
+		block = mblock_block(mba->mblock);
+		wait_on_block_writeback(block);
+		if (test_bit(BL_error, &block->flags)) {
+			if (!err)
+				err = -EIO;
+			set_bit(AS_EIO, &mba->mapping->flags);
+		}
+		kfree(mba);
+	}
+	return err;
+}
+EXPORT_SYMBOL(fsblock_sync);
+
+int fsblock_release(struct address_space *mapping, int force)
+{
+	struct mb_assoc *mba;
+	LIST_HEAD(list);
+
+	if (!mapping_has_private(mapping))
+		return 1;
+
+	spin_lock(&mapping->private_lock);
+	if (!force) {
+		list_for_each_entry(mba, &mapping->private_list, mlist) {
+			struct fsblock *block = mblock_block(mba->mblock);
+			if (test_bit(BL_dirty, &block->flags)) {
+				spin_unlock(&mapping->private_lock);
+				return 0;
+			}
+		}
+	}
+	list_splice_init(&mapping->private_list, &list);
+	spin_unlock(&mapping->private_lock);
+
+	while (!list_empty(&list)) {
+		struct fsblock *block;
+
+		mba = list_entry(list.prev, struct mb_assoc, mlist);
+		list_del(&mba->mlist);
+
+		block = mblock_block(mba->mblock);
+		bit_spin_lock(BL_assoc_lock, &block->flags);
+		if (list_empty(&mba->blist))
+			block->private = NULL;
+		else {
+			if (block->private == mba)
+				block->private = list_entry(mba->blist.next,struct mb_assoc,blist);
+			list_del(&mba->blist);
+		}
+		bit_spin_unlock(BL_assoc_lock, &block->flags);
+
+		if (test_bit(BL_error, &block->flags))
+			set_bit(AS_EIO, &mba->mapping->flags);
+		kfree(mba);
+	}
+	return 1;
+}
+EXPORT_SYMBOL(fsblock_release);
+
+static void sync_underlying_metadata(struct fsblock *block)
+{
+	struct address_space *mapping = block->page->mapping;
+	struct block_device *bdev = mapping_data_bdev(mapping);
+	struct fsblock *meta_block;
+	sector_t blocknr = block->block_nr;
+
+	/* XXX: should this just invalidate rather than write back? */
+
+	FSB_BUG_ON(test_bit(BL_metadata, &block->flags));
+
+	meta_block = __find_get_block(bdev->bd_inode->i_mapping, blocknr);
+	if (meta_block) {
+		int err;
+
+		FSB_BUG_ON(!test_bit(BL_metadata, &meta_block->flags));
+		/*
+		 * Could actually do a memory copy here to bring
+		 * the block uptodate. Probably not worthwhile.
+		 */
+		FSB_BUG_ON(block == meta_block);
+		err = sync_block(meta_block);
+		if (!err)
+			FSB_BUG_ON(test_bit(BL_dirty, &meta_block->flags));
+		else {
+			clear_bit(BL_dirty, &meta_block->flags);
+			wait_on_block_iolock(meta_block);
+		}
+	}
+}
+
+struct fsblock_meta *mbread(struct block_device *bdev, sector_t blocknr, unsigned int size)
+{
+	struct fsblock_meta *mblock;
+
+	mblock = find_or_create_mblock(bdev, blocknr, size);
+	if (!IS_ERR(mblock)) {
+		struct fsblock *block = &mblock->block;
+
+		if (!test_bit(BL_uptodate, &block->flags)) {
+			iolock_block(block);
+			if (!test_bit(BL_uptodate, &block->flags)) {
+				int ret;
+				FSB_BUG_ON(PageWriteback(block->page));
+				FSB_BUG_ON(test_bit(BL_dirty, &block->flags));
+				set_block_sync_io(block);
+				ret = read_block(block);
+				if (ret) {
+					/* XXX: handle errors properly */
+					block_put(block);
+					mblock = ERR_PTR(ret);
+				} else {
+					wait_on_block_sync_io(block);
+					if (!test_bit(BL_uptodate, &block->flags))
+						mblock = ERR_PTR(-EIO);
+					FSB_BUG_ON(size >= PAGE_CACHE_SIZE && !PageUptodate(block->page));
+				}
+			}
+			iounlock_block(block);
+		}
+	}
+
+	return mblock;
+}
+EXPORT_SYMBOL(mbread);
+
+/*
+ * XXX: maybe either don't have a generic version, or change the
+ * insert_mapping scheme so that it fills fsblocks rather than inserts them
+ * live into pages?
+ */
+sector_t fsblock_bmap(struct address_space *mapping, sector_t blocknr, insert_mapping_fn *insert_mapping)
+{
+	struct fsblock *block;
+	struct inode *inode = mapping->host;
+	sector_t ret;
+
+	block = __find_get_block(mapping, blocknr);
+	if (!block) {
+		pgoff_t pgoff = sector_pgoff(blocknr, inode->i_blkbits);
+		unsigned int size = 1 << inode->i_blkbits;
+		struct page *page;
+
+		page = create_lock_page_range(mapping, pgoff, size);
+		if (!page)
+			return 0;
+
+		if (create_unmapped_blocks(page, GFP_NOFS, size, CREATE_METADATA))
+			return 0;
+
+		ret = insert_mapping(mapping, pgoff, PAGE_CACHE_SIZE, 0);
+
+		block = __find_get_block(mapping, blocknr);
+		FSB_BUG_ON(!block);
+
+		unlock_page_range(page, size);
+	}
+
+	FSB_BUG_ON(test_bit(BL_new, &block->flags));
+	ret = 0;
+	if (test_bit(BL_mapped, &block->flags))
+		ret = block->block_nr;
+
+	return ret;
+}
+EXPORT_SYMBOL(fsblock_bmap);
+
+static int relock_superpage_block(struct page **pagep, unsigned int size)
+{
+	struct page *page = *pagep;
+	pgoff_t index = page->index;
+	pgoff_t first = first_page_idx(page->index, size);
+	struct address_space *mapping = page->mapping;
+
+	/*
+	 * XXX: this is a bit of a hack because the ->readpage and other
+	 * aops APIs are not so nice. Should convert over to a ->read_range
+	 * API that does the offset, length thing and allows caller locking?
+	 * (also getting rid of ->readpages).
+	 */
+	unlock_page(page);
+	*pagep = create_lock_page_range(mapping, first, size);
+	if (!*pagep) {
+		lock_page(page);
+		return -ENOMEM;
+	}
+	if (page != find_page(mapping, index)) {
+		unlock_page_range(*pagep, size);
+		return AOP_TRUNCATED_PAGE;
+	}
+	return 0;
+}
+
+static int block_read_helper(struct page *page, struct fsblock *block)
+{
+	FSB_BUG_ON(test_bit(BL_new, &block->flags));
+
+	if (test_bit(BL_uptodate, &block->flags))
+		return 0;
+
+	FSB_BUG_ON(PageUptodate(page));
+
+	if (!test_bit(BL_mapped, &block->flags)) {
+		unsigned int size = fsblock_size(block);
+		unsigned int offset = block_page_offset(block, size);
+		zero_user_page(page, offset, size, KM_USER0);
+		set_bit(BL_uptodate, &block->flags);
+		return 0;
+	}
+
+	if (!test_bit(BL_uptodate, &block->flags)) {
+		FSB_BUG_ON(test_bit(BL_readin, &block->flags));
+		FSB_BUG_ON(test_bit(BL_writeback, &block->flags));
+		set_bit(BL_readin, &block->flags);
+		return 1;
+	}
+	return 0;
+}
+
+int fsblock_read_page(struct page *page, insert_mapping_fn *insert_mapping)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	unsigned int size = 1 << inode->i_blkbits;
+	struct fsblock *block;
+	int ret;
+
+	FSB_BUG_ON(!PageLocked(page));
+	FSB_BUG_ON(PageUptodate(page));
+	FSB_BUG_ON(PageWriteback(page));
+
+	if (size_is_superpage(size)) {
+		struct page *orig_page = page;
+
+		ret = relock_superpage_block(&page, size);
+		if (ret)
+			return ret;
+		if (PageUptodate(orig_page))
+			goto out_unlock;
+	}
+
+	if (!PageBlocks(page)) {
+		ret = create_unmapped_blocks(page, GFP_NOFS, size, 0);
+		if (ret)
+			goto out_unlock;
+	}
+
+	/* XXX: optimise away if page is mapped to disk */
+	ret = insert_mapping(mapping, page->index << PAGE_CACHE_SHIFT,
+						PAGE_CACHE_SIZE, 0);
+	/* XXX: SetPageError on failure? */
+	if (ret)
+		goto out_unlock;
+
+	block = page_blocks(page);
+
+	if (!fsblock_superpage(block)) {
+
+		if (fsblock_subpage(block)) {
+			int nr = 0;
+			struct fsblock *b;
+			for_each_block(block, b)
+				nr += block_read_helper(page, b);
+			if (nr == 0) {
+				/* Hole? */
+				SetPageUptodate(page);
+				goto out_unlock;
+			}
+			for_each_block(block, b) {
+				if (!test_bit(BL_readin, &b->flags))
+					continue;
+
+				ret = submit_block(b, READ);
+				if (ret)
+					goto out_unlock;
+				/*
+				 * XXX: must handle errors properly (eg. wait
+				 * for outstanding reads before unlocking the
+				 * page?
+				 */
+			}
+		} else {
+			if (block_read_helper(page, block)) {
+				ret = submit_block(block, READ);
+				if (ret)
+					goto out_unlock;
+			} else {
+				SetPageUptodate(page);
+				goto out_unlock;
+			}
+		}
+	} else {
+		struct page *p;
+
+		ret = 0;
+
+		FSB_BUG_ON(test_bit(BL_new, &block->flags));
+		FSB_BUG_ON(test_bit(BL_uptodate, &block->flags));
+		FSB_BUG_ON(test_bit(BL_dirty, &block->flags));
+
+		if (!test_bit(BL_mapped, &block->flags)) {
+			for_each_page(page, size, p) {
+				FSB_BUG_ON(PageUptodate(p));
+				zero_user_page(p, 0, PAGE_CACHE_SIZE, KM_USER0);
+				SetPageUptodate(p);
+				unlock_page(p);
+			} end_for_each_page;
+			set_bit(BL_uptodate, &block->flags);
+		} else {
+			ret = read_block(block);
+			if (ret)
+				goto out_unlock;
+		}
+	}
+	FSB_BUG_ON(ret);
+	return 0;
+
+out_unlock:
+	unlock_page_range(page, size);
+	return ret;
+}
+EXPORT_SYMBOL(fsblock_read_page);
+
+static int block_write_helper(struct page *page, struct fsblock *block)
+{
+	FSB_BUG_ON(!test_bit(BL_mapped, &block->flags));
+
+	if (test_bit(BL_new, &block->flags)) {
+		sync_underlying_metadata(block);
+		clear_bit(BL_new, &block->flags);
+		set_bit(BL_dirty, &block->flags);
+	}
+
+	if (test_bit(BL_dirty, &block->flags)) {
+		FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags));
+		clear_bit(BL_dirty, &block->flags);
+		FSB_BUG_ON(test_bit(BL_readin, &block->flags));
+		FSB_BUG_ON(test_bit(BL_writeback, &block->flags));
+		set_bit(BL_writeback, &block->flags);
+		return 1;
+		/*
+		 * XXX: Careful of ordering between clear buffer / page dirty
+		 * and set buffer / page dirty
+		 */
+	}
+	return 0;
+}
+
+/* XXX: must obey non-blocking writeout! */
+int fsblock_write_page(struct page *page, insert_mapping_fn *insert_mapping,
+				struct writeback_control *wbc)
+{
+	struct address_space *mapping = page->mapping;
+	unsigned int size = 1 << mapping->host->i_blkbits;
+	struct fsblock *block;
+	int ret;
+
+	FSB_BUG_ON(!PageLocked(page));
+	FSB_BUG_ON(PageWriteback(page));
+
+	if (size_is_superpage(size)) {
+		struct page *orig_page = page;
+
+		redirty_page_for_writepage(wbc, orig_page);
+		ret = relock_superpage_block(&page, size);
+		if (ret)
+			return ret;
+		if (!clear_page_dirty_for_io(orig_page))
+			goto out_unlock;
+	}
+
+	if (!PageBlocks(page)) {
+		FSB_BUG(); /* XXX: should always have blocks here */
+		FSB_BUG_ON(!PageUptodate(page));
+		/* XXX: should rework (eg use page_mkwrite) so as to always
+		 * have blocks by this stage!!! */
+		ret = create_unmapped_blocks(page, GFP_NOFS, size, CREATE_DIRTY);
+		if (ret)
+			goto out_unlock;
+	}
+
+	ret = insert_mapping(mapping, page->index << PAGE_CACHE_SHIFT,
+						PAGE_CACHE_SIZE, 1);
+	if (ret)
+		goto out_unlock;
+
+	/*
+	 * XXX: todo - i_size handling ... should it be here?!?
+	 * No - I would prefer partial page zeroing to go in filemap_nopage
+	 * and tolerate writing of crap past EOF in filesystems -- no
+	 * other sane way to do it other than invalidating a partial page
+	 * before zeroing before writing it out in order that we can
+	 * guarantee it isn't touched after zeroing.
+	 */
+
+	block = page_blocks(page);
+
+	if (!fsblock_superpage(block)) {
+
+		if (fsblock_subpage(block)) {
+			int nr = 0;
+			struct fsblock *b;
+			for_each_block(block, b)
+				nr += block_write_helper(page, b);
+			/* XXX: technically could happen (see set_page_dirty_blocks) */
+			FSB_BUG_ON(nr == 0);
+			if (nr == 0)
+				goto out_unlock;
+
+			FSB_BUG_ON(PageWriteback(page));
+			set_page_writeback(page);
+			unlock_page(page);
+			for_each_block(block, b) {
+				if (!test_bit(BL_writeback, &b->flags))
+					continue;
+				ret = submit_block(b, WRITE);
+				if (ret)
+					goto out_unlock;
+				/* XXX: error handling */
+			}
+		} else {
+			if (block_write_helper(page, block)) {
+				FSB_BUG_ON(PageWriteback(page));
+				set_page_writeback(page);
+				unlock_page(page);
+				ret = submit_block(block, WRITE);
+				if (ret)
+					goto out_unlock;
+			} else {
+				FSB_BUG(); /* XXX: see above */
+				goto out_unlock;
+			}
+		}
+	} else {
+		struct page *p;
+
+		FSB_BUG_ON(!test_bit(BL_mapped, &block->flags));
+		FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags));
+		FSB_BUG_ON(!test_bit(BL_dirty, &block->flags));
+		FSB_BUG_ON(test_bit(BL_new, &block->flags));
+
+		for_each_page(page, size, p) {
+			FSB_BUG_ON(page_blocks(p) != block);
+			FSB_BUG_ON(!PageUptodate(p));
+		} end_for_each_page;
+
+		for_each_page(page, size, p) {
+			clear_page_dirty_for_io(p);
+			FSB_BUG_ON(PageWriteback(p));
+			FSB_BUG_ON(!PageUptodate(p));
+			set_page_writeback(p);
+			unlock_page(p);
+		} end_for_each_page;
+
+		/* XXX: recheck ordering here! don't want to lose dirty bits */
+
+		clear_bit(BL_dirty, &block->flags);
+		ret = write_block(block);
+		if (ret)
+			goto out_unlock;
+	}
+	FSB_BUG_ON(ret);
+	return 0;
+
+out_unlock:
+	unlock_page_range(page, size);
+	return ret;
+}
+EXPORT_SYMBOL(fsblock_write_page);
+
+static int block_dirty_helper(struct page *page, struct fsblock *block)
+{
+	FSB_BUG_ON(!test_bit(BL_mapped, &block->flags));
+
+	if (test_bit(BL_uptodate, &block->flags))
+		return 0;
+
+	FSB_BUG_ON(PageUptodate(page));
+
+	if (test_bit(BL_new, &block->flags)) {
+		unsigned int size = fsblock_size(block);
+		unsigned int offset = block_page_offset(block, size);
+		zero_user_page(page, offset, size, KM_USER0);
+		set_bit(BL_uptodate, &block->flags);
+		sync_underlying_metadata(block);
+		clear_bit(BL_new, &block->flags);
+		set_bit(BL_dirty, &block->flags);
+		__set_page_dirty_noblocks(page);
+		return 0;
+	}
+	return 1;
+}
+
+static int fsblock_prepare_write_super(struct page *orig_page,
+				unsigned int size, unsigned from, unsigned to,
+				insert_mapping_fn *insert_mapping)
+{
+	struct address_space *mapping = orig_page->mapping;
+	struct fsblock *block;
+	struct page *page = orig_page, *p;
+	int ret;
+
+	ret = relock_superpage_block(&page, size);
+	if (ret)
+		return ret;
+
+	FSB_BUG_ON(PageBlocks(page) != PageBlocks(orig_page));
+	if (!PageBlocks(page)) {
+		FSB_BUG_ON(PageDirty(orig_page));
+		FSB_BUG_ON(PageDirty(page));
+		ret = create_unmapped_blocks(page, GFP_NOFS, size, 0);
+		if (ret)
+			goto out_unlock;
+	}
+	FSB_BUG_ON(!PageBlocks(page));
+
+	ret = insert_mapping(mapping, page->index << PAGE_CACHE_SHIFT,
+						PAGE_CACHE_SIZE, 1);
+	if (ret)
+		goto out_unlock;
+
+	block = page_blocks(page);
+
+	if (test_bit(BL_new, &block->flags)) {
+		for_each_page(page, size, p) {
+			if (!PageUptodate(p)) {
+				FSB_BUG_ON(PageDirty(p));
+				zero_user_page(p, 0, PAGE_CACHE_SIZE, KM_USER0);
+				SetPageUptodate(p);
+			}
+			__set_page_dirty_noblocks(p);
+		} end_for_each_page;
+
+		set_bit(BL_uptodate, &block->flags);
+		sync_underlying_metadata(block);
+		clear_bit(BL_new, &block->flags);
+		set_bit(BL_dirty, &block->flags);
+	} else if (!test_bit(BL_uptodate, &block->flags)) {
+		FSB_BUG_ON(test_bit(BL_dirty, &block->flags));
+
+		set_block_sync_io(block);
+		ret = read_block(block);
+		if (ret)
+			goto out_unlock;
+		wait_on_block_sync_io(block);
+		if (!test_bit(BL_uptodate, &block->flags)) {
+			ret = -EIO;
+			goto out_unlock;
+		}
+	}
+
+	return 0;
+
+out_unlock:
+	unlock_page_range(page, size);
+	lock_page(orig_page);
+	FSB_BUG_ON(!ret);
+	return ret;
+}
+
+int fsblock_prepare_write(struct page *page, unsigned from, unsigned to,
+					insert_mapping_fn *insert_mapping)
+{
+	struct address_space *mapping = page->mapping;
+	unsigned int size = 1 << mapping->host->i_blkbits;
+	struct fsblock *block;
+	int ret, nr;
+
+	FSB_BUG_ON(!PageLocked(page));
+	FSB_BUG_ON(from > PAGE_CACHE_SIZE);
+	FSB_BUG_ON(to > PAGE_CACHE_SIZE);
+	FSB_BUG_ON(from > to);
+
+	if (size_is_superpage(size))
+		return fsblock_prepare_write_super(page, size, from, to, insert_mapping);
+
+	if (!PageBlocks(page)) {
+		ret = create_unmapped_blocks(page, GFP_NOFS, size, 0);
+		if (ret)
+			return ret;
+	}
+
+	ret = insert_mapping(mapping, page->index << PAGE_CACHE_SHIFT,
+						PAGE_CACHE_SIZE, 1);
+	if (ret)
+		return ret;
+
+	block = page_blocks(page);
+
+	nr = 0;
+	if (fsblock_subpage(block)) {
+		struct fsblock *b;
+		for_each_block(block, b)
+			nr += block_dirty_helper(page, b);
+	} else
+		nr += block_dirty_helper(page, block);
+	if (nr == 0)
+		SetPageUptodate(page);
+
+	if (PageUptodate(page))
+		return 0;
+
+	if (to - from == PAGE_CACHE_SIZE)
+		return 0;
+
+	/*
+	 * XXX: this is stupid, could do better with write_begin aops, or
+	 * just zero out unwritten partial blocks.
+	 */
+	if (fsblock_subpage(block)) {
+		struct fsblock *b;
+		for_each_block(block, b) {
+			if (test_bit(BL_uptodate, &b->flags))
+				continue;
+			set_block_sync_io(b);
+			ret = read_block(b);
+			if (ret)
+				break;
+		}
+
+		for_each_block(block, b) {
+			wait_on_block_sync_io(b);
+			if (!ret && !test_bit(BL_uptodate, &b->flags))
+				ret = -EIO;
+		}
+		if (ret)
+			return ret;
+	} else {
+
+		FSB_BUG_ON(test_bit(BL_uptodate, &block->flags));
+		set_block_sync_io(block);
+		ret = read_block(block);
+		if (ret)
+			return ret;
+		wait_on_block_sync_io(block);
+		if (test_bit(BL_error, &block->flags))
+			SetPageError(page);
+		if (!test_bit(BL_uptodate, &block->flags))
+			return -EIO;
+	}
+	SetPageUptodate(page);
+
+	return 0;
+}
+EXPORT_SYMBOL(fsblock_prepare_write);
+
+static int __fsblock_commit_write_super(struct page *orig_page,
+			struct fsblock *block, unsigned from, unsigned to)
+{
+	unsigned int size = fsblock_size(block);
+	struct page *page, *p;
+
+	FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags));
+	set_bit(BL_dirty, &block->flags);
+	page = block->page;
+	for_each_page(page, size, p) {
+		FSB_BUG_ON(!PageUptodate(p));
+		__set_page_dirty_noblocks(p);
+	} end_for_each_page;
+	unlock_page_range(page, size);
+	lock_page(orig_page);
+
+	return 0;
+}
+
+static int __fsblock_commit_write_sub(struct page *page,
+			struct fsblock *block, unsigned from, unsigned to)
+{
+	struct fsblock *b;
+
+	for_each_block(block, b) {
+		if (to - from < PAGE_CACHE_SIZE)
+			FSB_BUG_ON(!test_bit(BL_uptodate, &b->flags));
+		else
+			set_bit(BL_uptodate, &block->flags);
+		if (!test_bit(BL_dirty, &b->flags))
+			set_bit(BL_dirty, &b->flags);
+	}
+	if (to - from < PAGE_CACHE_SIZE)
+		FSB_BUG_ON(!PageUptodate(page));
+	else
+		SetPageUptodate(page);
+	__set_page_dirty_noblocks(page);
+
+	return 0;
+}
+
+int __fsblock_commit_write(struct page *page, unsigned from, unsigned to)
+{
+	struct fsblock *block;
+
+	FSB_BUG_ON(!PageLocked(page));
+	FSB_BUG_ON(from > PAGE_CACHE_SIZE);
+	FSB_BUG_ON(to > PAGE_CACHE_SIZE);
+	FSB_BUG_ON(from > to);
+	FSB_BUG_ON(!PageBlocks(page));
+
+	block = page_blocks(page);
+	FSB_BUG_ON(!test_bit(BL_mapped, &block->flags));
+
+	if (fsblock_superpage(block))
+		return __fsblock_commit_write_super(page, block, from, to);
+	if (fsblock_subpage(block))
+		return __fsblock_commit_write_sub(page, block, from, to);
+
+	if (to - from < PAGE_CACHE_SIZE) {
+		FSB_BUG_ON(!PageUptodate(page));
+		FSB_BUG_ON(!test_bit(BL_uptodate, &block->flags));
+	} else {
+		set_bit(BL_uptodate, &block->flags);
+		SetPageUptodate(page);
+	}
+
+	if (!test_bit(BL_dirty, &block->flags))
+		set_bit(BL_dirty, &block->flags);
+	__set_page_dirty_noblocks(page);
+
+	return 0;
+}
+EXPORT_SYMBOL(__fsblock_commit_write);
+
+int fsblock_commit_write(struct file *file, struct page *page, unsigned from, unsigned to)
+{
+	struct inode *inode;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	int ret;
+
+	inode = page->mapping->host;
+	ret = __fsblock_commit_write(page, from, to);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 */
+	if (!ret && pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+        return ret;
+
+}
+EXPORT_SYMBOL(fsblock_commit_write);
+
+/* XXX: this is racy I think (must verify versus page_mkclean). Must have
+ * some operation to pin a page's metadata while dirtying it. (this will
+ * fix get_user_pages for dirty as well once callers are converted).
+ */
+int fsblock_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct address_space *mapping;
+	const struct address_space_operations *a_ops;
+	int ret = 0;
+
+	lock_page(page);
+	mapping = page->mapping;
+	if (!mapping) {
+		/* Caller will take care of it */
+		goto out;
+	}
+	a_ops = mapping->a_ops;
+
+	/* XXX: don't instantiate blocks past isize! (same for truncate?) */
+	ret = a_ops->prepare_write(NULL, page, 0, PAGE_CACHE_SIZE);
+	if (ret == 0)
+		ret = __fsblock_commit_write(page, 0, PAGE_CACHE_SIZE);
+out:
+	unlock_page(page);
+
+	return ret;
+}
+EXPORT_SYMBOL(fsblock_page_mkwrite);
+
+static int fsblock_truncate_page_super(struct address_space *mapping, loff_t from)
+{
+	pgoff_t index;
+	unsigned offset;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	unsigned int size = 1 << mapping->host->i_blkbits;
+	unsigned int nr_pages;
+	unsigned int length;
+	int i, err;
+
+	length = from & (size - 1);
+	if (length == 0)
+		return 0;
+
+	nr_pages = ((size - length + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT);
+	index = from >> PAGE_CACHE_SHIFT;
+	offset = from & (PAGE_CACHE_SIZE-1);
+
+	err = 0;
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+
+		page = grab_cache_page(mapping, index + i);
+		if (!page) {
+			err = -ENOMEM;
+			break;
+		}
+
+		err = a_ops->prepare_write(NULL, page, offset, PAGE_CACHE_SIZE);
+		if (!err) {
+			FSB_BUG_ON(!PageBlocks(page));
+			zero_user_page(page, offset, PAGE_CACHE_SIZE-offset, KM_USER0);
+			err = __fsblock_commit_write(page, offset, PAGE_CACHE_SIZE);
+		}
+
+		unlock_page(page);
+		page_cache_release(page);
+		if (err)
+			break;
+		offset = 0;
+	}
+	return err;
+}
+
+int fsblock_truncate_page(struct address_space *mapping, loff_t from)
+{
+	pgoff_t index;
+	unsigned offset;
+	struct page *page;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	unsigned int size = 1 << mapping->host->i_blkbits;
+	unsigned int length;
+	int ret;
+
+	if (size_is_superpage(size))
+		return fsblock_truncate_page_super(mapping, from);
+
+	length = from & (size - 1);
+	if (length == 0)
+		return 0;
+
+	index = from >> PAGE_CACHE_SHIFT;
+	offset = from & (PAGE_CACHE_SIZE-1);
+
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+
+	ret = a_ops->prepare_write(NULL, page, offset, PAGE_CACHE_SIZE);
+	if (ret == 0) {
+		zero_user_page(page, offset, PAGE_CACHE_SIZE-offset, KM_USER0);
+		/*
+		 * a_ops->commit_write would extend i_size :( Have to assume
+		 * caller uses fsblock_prepare_write.
+		 */
+		ret = __fsblock_commit_write(page, offset, PAGE_CACHE_SIZE);
+	}
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+EXPORT_SYMBOL(fsblock_truncate_page);
+
+static int can_free_block(struct fsblock *block)
+{
+	return atomic_read(&block->count) == 1 &&
+		!test_bit(BL_dirty, &block->flags) &&
+		!block->private;
+}
+
+static int __try_to_free_block(struct fsblock *block)
+{
+	int ret = 0;
+	if (can_free_block(block)) {
+		if (atomic_dec_and_test(&block->count)) {
+			if (!test_bit(BL_dirty, &block->flags)) {
+				ret = 1;
+				goto out;
+			}
+		}
+		atomic_inc(&block->count);
+	}
+out:
+	unlock_block(block);
+
+	return ret;
+}
+
+static int try_to_free_block(struct fsblock *block)
+{
+	/*
+	 * XXX: get rid of block locking from here and invalidate_block --
+	 * use page lock instead?
+	 */
+	if (trylock_block(block))
+		return __try_to_free_block(block);
+	return 0;
+}
+
+static int try_to_free_blocks_super(struct page *orig_page, int all_locked)
+{
+	unsigned int size;
+	struct fsblock *block;
+	struct page *page, *p;
+	int i;
+	int ret = 0;
+
+	FSB_BUG_ON(!PageLocked(orig_page));
+	FSB_BUG_ON(!PageBlocks(orig_page));
+
+	if (PageDirty(orig_page) || PageWriteback(orig_page))
+		return ret;
+
+	block = page_blocks(orig_page);
+	page = block->page;
+	size = fsblock_size(block);
+
+	i = 0;
+	if (!all_locked) {
+		for_each_page(page, size, p) {
+			if (p != orig_page) {
+				if (TestSetPageLocked(p))
+					goto out;
+				i++;
+				if (PageWriteback(p))
+					goto out;
+			}
+		} end_for_each_page;
+	}
+
+	assert_block(block);
+
+	if (!can_free_block(block))
+		goto out;
+	if (!try_to_free_block(block))
+		goto out;
+
+	for_each_page(page, size, p) {
+		FSB_BUG_ON(!PageLocked(p));
+		FSB_BUG_ON(!PageBlocks(p));
+		FSB_BUG_ON(PageWriteback(p));
+		clear_page_blocks(p);
+	} end_for_each_page;
+
+	free_block(block);
+
+	ret = 1;
+
+out:
+	if (i > 0) {
+		for_each_page(page, size, p) {
+			FSB_BUG_ON(PageDirty(p)); /* XXX: racy? */
+			if (p != orig_page) {
+				unlock_page(p);
+				i--;
+				if (i == 0)
+					break;
+			}
+		} end_for_each_page;
+	}
+	return ret;
+}
+
+static int __try_to_free_blocks(struct page *page, int all_locked)
+{
+	unsigned int size;
+	struct fsblock *block;
+
+	FSB_BUG_ON(!PageLocked(page));
+	FSB_BUG_ON(!PageBlocks(page));
+
+	if (PageDirty(page) || PageWriteback(page))
+		return 0;
+
+	block = page_blocks(page);
+	if (fsblock_superpage(block))
+		return try_to_free_blocks_super(page, all_locked);
+
+	assert_block(block);
+	if (fsblock_subpage(block)) {
+		struct fsblock *b;
+
+		for_each_block(block, b) {
+			if (!can_free_block(b))
+				return 0;
+		}
+
+		for_each_block(block, b) {
+			/*
+			 * must decrement head block last, so that if the
+			 * find_get_page_block fails, then the blocks will
+			 * really be freed.
+			 */
+			if (b == block)
+				continue;
+			if (!try_to_free_block(b))
+				goto error;
+		}
+		if (!try_to_free_block(block))
+			goto error;
+
+		size = fsblock_size(block);
+		FSB_BUG_ON(block != page_blocks(page));
+		goto success;
+error:
+		fo
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
[RFC] fsblock, Nick Piggin, (Sat Jun 23, 6:45 pm)
[patch 1/3] add the fsblock layer, Nick Piggin, (Sat Jun 23, 6:46 pm)
[patch 2/3] block_dev: convert to fsblock, Nick Piggin, (Sat Jun 23, 6:46 pm)
[patch 3/3] minix: convert to fsblock, Nick Piggin, (Sat Jun 23, 6:47 pm)
Re: [RFC] fsblock, Nick Piggin, (Sat Jun 23, 6:53 pm)
Re: [RFC] fsblock, Jeff Garzik, (Sat Jun 23, 8:07 pm)
Re: [RFC] fsblock, Nick Piggin, (Sat Jun 23, 8:47 pm)
Re: [RFC] fsblock, William Lee Irwin III, (Sat Jun 23, 9:19 pm)
Re: [RFC] fsblock, Chris Mason, (Sun Jun 24, 6:51 am)
Re: [RFC] fsblock, Andi Kleen, (Sun Jun 24, 7:16 am)
Re: [patch 1/3] add the fsblock layer, Andi Kleen, (Sun Jun 24, 8:28 am)
Re: [patch 1/3] add the fsblock layer, Arjan van de Ven, (Sun Jun 24, 1:18 pm)
Re: [RFC] fsblock, Nick Piggin, (Sun Jun 24, 11:58 pm)
Re: [RFC] fsblock, Nick Piggin, (Mon Jun 25, 12:16 am)
Re: [patch 1/3] add the fsblock layer, Nick Piggin, (Mon Jun 25, 12:19 am)
Re: [patch 1/3] add the fsblock layer, Andi Kleen, (Mon Jun 25, 1:58 am)
Re: [RFC] fsblock, Chris Mason, (Mon Jun 25, 5:25 am)
Re: [patch 1/3] add the fsblock layer, Chris Mason, (Mon Jun 25, 6:19 am)
Re: [patch 1/3] add the fsblock layer, Nick Piggin, (Mon Jun 25, 7:42 pm)
Re: [RFC] fsblock, David Chinner, (Mon Jun 25, 8:06 pm)
Re: [RFC] fsblock, Nick Piggin, (Mon Jun 25, 8:55 pm)
Re: [RFC] fsblock, David Chinner, (Tue Jun 26, 2:23 am)
Re: [RFC] fsblock, Nick Piggin, (Tue Jun 26, 4:14 am)
Re: [RFC] fsblock, Chris Mason, (Tue Jun 26, 5:34 am)
Re: [RFC] fsblock, Nick Piggin, (Tue Jun 26, 10:32 pm)
Re: [RFC] fsblock, David Chinner, (Tue Jun 26, 11:05 pm)
Re: [RFC] fsblock, Chris Mason, (Wed Jun 27, 4:50 am)
Re: [RFC] fsblock, Kyle Moffett, (Wed Jun 27, 5:39 am)
Re: [RFC] fsblock, Anton Altaparmakov, (Wed Jun 27, 8:18 am)
Re: [RFC] fsblock, David Chinner, (Wed Jun 27, 3:35 pm)
Re: [RFC] fsblock, Nick Piggin, (Wed Jun 27, 7:44 pm)
Re: [RFC] fsblock, Chris Mason, (Thu Jun 28, 5:20 am)
Re: [RFC] fsblock, David Chinner, (Thu Jun 28, 7:08 pm)
Re: [RFC] fsblock, Nick Piggin, (Thu Jun 28, 7:33 pm)
Re: [RFC] fsblock, Christoph Hellwig, (Sat Jun 30, 3:42 am)
Re: [RFC] fsblock, Christoph Hellwig, (Sat Jun 30, 3:44 am)
Re: [RFC] fsblock, Christoph Hellwig, (Sat Jun 30, 4:05 am)
Re: [RFC] fsblock, Jeff Garzik, (Sat Jun 30, 4:10 am)
Re: [RFC] fsblock, Christoph Hellwig, (Sat Jun 30, 4:13 am)
Re: [RFC] fsblock, Christoph Lameter, (Mon Jul 9, 10:14 am)
Re: [RFC] fsblock, Nick Piggin, (Mon Jul 9, 5:54 pm)
Re: [RFC] fsblock, Christoph Lameter, (Mon Jul 9, 5:59 pm)
Re: [RFC] fsblock, Nick Piggin, (Mon Jul 9, 6:07 pm)
Re: [RFC] fsblock, Dave McCracken, (Mon Jul 9, 6:37 pm)