This is second version of generic discard optimizations first was submitted here: http://lwn.net/Articles/373994 Currently there are many file-systems which has implemented discard support, but SSD discs not widely used yet. This patch-set introduce compat helpers which simulate discard requests with zeroing semantics. __blkdev_issue_zeroout: explicitly zeroout given range via write request. blkdev_issue_clear: zeroout given range, use discard request if possible. Later filesystem admin may select which behavior is suitable for his needs discard without zeroing or explicit zeroing even if discard is not supported. Advantages: - Hope that this helps in real filesystem testing. - People who are crazy about data security would be really happy. - Virtual machine developers also would like this feature. Other optimization: - Convert all blkdev_issue_xxx function to common set of flags - Optimize generic discard submitting procedure. --
In some places caller don't want to wait a request to complete. Flags make blkdev_issue_flush() more flexible. This patch just convert existing callers to new interface without chaining existing allocation/wait behavior. Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> --- block/blk-barrier.c | 43 +++++++++++++++++++++--------------- drivers/block/drbd/drbd_int.h | 3 +- drivers/block/drbd/drbd_receiver.c | 3 +- fs/block_dev.c | 3 +- fs/ext3/fsync.c | 3 +- fs/ext4/fsync.c | 6 +++- fs/jbd2/checkpoint.c | 3 +- fs/jbd2/commit.c | 6 +++- fs/reiserfs/file.c | 3 +- fs/xfs/linux-2.6/xfs_super.c | 3 +- include/linux/blkdev.h | 10 ++++++- 11 files changed, 55 insertions(+), 31 deletions(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8618d89..7e6e810 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -285,26 +285,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err) set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); clear_bit(BIO_UPTODATE, &bio->bi_flags); } - - complete(bio->bi_private); + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) * @error_sector: error sector + * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they - * wish to. + * wish to. If WAIT flag is not passed then caller may check only what + * request was pushed in some internal queue for later handling. */ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +int blkdev_issue_flush(struct block_device *bdev, gfp_t ...
Not all implementations require discard bio to be filled with one-sector
sized payload. Let's allocate payload only when necessary.
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
block/blk-lib.c | 24 +++++++++++++-----------
drivers/scsi/sd.c | 1 +
include/linux/blkdev.h | 5 ++++-
3 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 67b641e..321d150 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -184,17 +184,19 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
if (flags & BLKDEV_IFL_BARRIER)
bio->bi_private = &wait;
- /*
- * Add a zeroed one-sector payload as that's what
- * our current implementations need. If we'll ever need
- * more the interface will need revisiting.
- */
- page = alloc_page(gfp_mask | __GFP_ZERO);
- if (!page)
- goto out_free_bio;
- if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
- goto out_free_page;
-
+ if (blk_queue_discard_mem(q)) {
+ /*
+ * Add a zeroed one-sector payload as that's what
+ * our current implementations need. If we'll ever
+ * need more the interface will need revisiting.
+ */
+ page = alloc_page(gfp_mask | __GFP_ZERO);
+ if (!page)
+ goto out_free_bio;
+ if (bio_add_pc_page(q, bio, page, sector_size, 0) <
+ sector_size)
+ goto out_free_page;
+ }
/*
* And override the bio size - the way discard works we
* touch many more blocks on disk than the actual payload
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 1dd4d84..9994977 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1513,6 +1513,7 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
q->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD_MEM, q);
}
sdkp->capacity = lba + 1;
diff --git a/include/linux/blkdev.h ...Allow several discard request to share payload page because each
discard bio require only one sector.
Current submit procedure is suboptimal. Each bio is flagged with
barrier flag, so we will end up with following trace:
for_each_bio(discar_bios) {
q->pre_flush
handle(bio);
disk->flush_cache
q->post_flush
}
But in fact user want following semantics:
q->pre_flush()
for_each_bio(discar_bios)
handle(bio)
q->pre_flush()
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
block/blk-lib.c | 174 ++++++++++++++++++++++++++++++++++++-------------------
1 files changed, 115 insertions(+), 59 deletions(-)
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 321d150..be04a4c 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,6 +9,8 @@
#include "blk.h"
+static DEFINE_SPINLOCK(alloc_lock);
+
struct bio_batch
{
atomic_t done;
@@ -129,17 +131,55 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
static void blkdev_discard_end_io(struct bio *bio, int err)
{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ if (bio_page(bio))
+ put_page(bio_page(bio));
+}
+
+static int add_discard_payload(struct bio *bio, gfp_t gfp_mask, int len)
+{
+ static struct page *cur_page = NULL;
+ static int cur_offset = 0;
+ struct page *page;
+ int offset;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+again:
+ spin_lock(&alloc_lock);
+ if (cur_offset + len > PAGE_SIZE)
+ /* There is no more space in the page */
+ if (cur_page) {
+ put_page(cur_page);
+ cur_page = NULL;
+ cur_offset = 0;
+ }
+ if (!cur_page) {
+ spin_unlock(&alloc_lock);
+ page = alloc_page(gfp_mask | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ spin_lock(&alloc_lock);
+ /*
+ * Check cur_page one more time after we reacquired the lock.
+ * Because it may be changed by other task.
+ */
+ if (cur_page) ...