md: enable suspend/resume of md devices.

Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
From: Linux Kernel Mailing List
Date: Friday, April 3, 2009 - 11:00 am

Gitweb:     http://git.kernel.org/linus/409c57f3801701dfee27a28103dda4831306cb20
Commit:     409c57f3801701dfee27a28103dda4831306cb20
Parent:     e0cf8f045b2023b0b3f919ee93eb94345f648434
Author:     NeilBrown <neilb@suse.de>
AuthorDate: Tue Mar 31 14:39:39 2009 +1100
Committer:  NeilBrown <neilb@suse.de>
CommitDate: Tue Mar 31 14:39:39 2009 +1100

    md: enable suspend/resume of md devices.
    
    To be able to change the 'level' of an md/raid array, we need to
    suspend the device so that no requests are active - then move some
    pointers around etc.
    
    The code already keeps counts of active requests and the ->quiesce
    function can be used to wait until those counts hit zero.
    However the quiesce function blocks new requests once they are
    already 'inside' the personality module, and that is too late if we want
    to replace the personality modules.
    
    So make all md requests come in through a common md_make_request
    function that keeps track of how many requests have entered the
    modules but may not yet be on the internal reference counts.
    Allow md_make_request to be blocked when we want to suspend the
    device, and make it possible to wait for all those in-transit requests
    to be added to internal lists so that ->quiesce can wait for them.
    
    There is still a problem that when a request completes, we drop the
    ref count inside the personality code so there is a short time between
    when the refcount hits zero, and when the personality code is no
    longer being used.
    The personality code never blocks (schedule or spinlock) between
    dropping the refcount and exiting the routine, so this should be safe
    (as put_module calls synchronize_sched() before unmapping the module
    code).
    
    Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c     |   79 +++++++++++++++++++++++++++++++++++++++++---------
 drivers/md/md.h     |    2 +
 drivers/md/raid1.c  |    3 ++
 drivers/md/raid10.c |    3 ++
 4 files changed, 72 insertions(+), 15 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f30f09c..6cb31f8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -201,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
 		)
 
 
-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request.  By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
 {
-	bio_io_error(bio);
-	return 0;
+	mddev_t *mddev = q->queuedata;
+	int rv;
+	if (mddev == NULL || mddev->pers == NULL) {
+		bio_io_error(bio);
+		return 0;
+	}
+	rcu_read_lock();
+	if (mddev->suspended) {
+		DEFINE_WAIT(__wait);
+		for (;;) {
+			prepare_to_wait(&mddev->sb_wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!mddev->suspended)
+				break;
+			rcu_read_unlock();
+			schedule();
+			rcu_read_lock();
+		}
+		finish_wait(&mddev->sb_wait, &__wait);
+	}
+	atomic_inc(&mddev->active_io);
+	rcu_read_unlock();
+	rv = mddev->pers->make_request(q, bio);
+	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+		wake_up(&mddev->sb_wait);
+
+	return rv;
 }
 
+static void mddev_suspend(mddev_t *mddev)
+{
+	BUG_ON(mddev->suspended);
+	mddev->suspended = 1;
+	synchronize_rcu();
+	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+	mddev->pers->quiesce(mddev, 1);
+	md_unregister_thread(mddev->thread);
+	mddev->thread = NULL;
+	/* we now know that no code is executing in the personality module,
+	 * except possibly the tail end of a ->bi_end_io function, but that
+	 * is certain to complete before the module has a chance to get
+	 * unloaded
+	 */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+	mddev->suspended = 0;
+	wake_up(&mddev->sb_wait);
+	mddev->pers->quiesce(mddev, 0);
+}
+
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -314,6 +370,7 @@ static mddev_t * mddev_find(dev_t unit)
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
 	atomic_set(&new->openers, 0);
+	atomic_set(&new->active_io, 0);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
@@ -3632,10 +3689,12 @@ static int md_alloc(dev_t dev, char *name)
 		mddev_put(mddev);
 		return -ENOMEM;
 	}
+	mddev->queue->queuedata = mddev;
+
 	/* Can be unlocked because the queue is new: no concurrency */
 	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
 
-	blk_queue_make_request(mddev->queue, md_fail_request);
+	blk_queue_make_request(mddev->queue, md_make_request);
 
 	disk = alloc_disk(1 << shift);
 	if (!disk) {
@@ -3938,16 +3997,6 @@ static int do_md_run(mddev_t * mddev)
 
 	set_capacity(disk, mddev->array_sectors);
 
-	/* If we call blk_queue_make_request here, it will
-	 * re-initialise max_sectors etc which may have been
-	 * refined inside -> run.  So just set the bits we need to set.
-	 * Most initialisation happended when we called
-	 * blk_queue_make_request(..., md_fail_request)
-	 * earlier.
-	 */
-	mddev->queue->queuedata = mddev;
-	mddev->queue->make_request_fn = mddev->pers->make_request;
-
 	/* If there is a partially-recovered drive we need to
 	 * start recovery here.  If we leave it to md_check_recovery,
 	 * it will remove the drives and not do the right thing
@@ -4077,7 +4126,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 			md_super_wait(mddev);
 			if (mddev->ro)
 				set_disk_ro(disk, 0);
-			blk_queue_make_request(mddev->queue, md_fail_request);
+
 			mddev->pers->stop(mddev);
 			mddev->queue->merge_bvec_fn = NULL;
 			mddev->queue->unplug_fn = NULL;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index c07ea91..84b22d6 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -119,6 +119,8 @@ struct mddev_s
 #define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
 #define MD_CHANGE_PENDING 2	/* superblock update in progress */
 
+	int				suspended;
+	atomic_t			active_io;
 	int				ro;
 
 	struct gendisk			*gendisk;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7799587..7eaca32 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2092,6 +2092,9 @@ static int stop(mddev_t *mddev)
 		/* need to kick something here to make sure I/O goes? */
 	}
 
+	raise_barrier(conf);
+	lower_barrier(conf);
+
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d56cb2a..c2059e2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2211,6 +2211,9 @@ static int stop(mddev_t *mddev)
 {
 	conf_t *conf = mddev_to_conf(mddev);
 
+	raise_barrier(conf, 0);
+	lower_barrier(conf);
+
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
--
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
md: enable suspend/resume of md devices., Linux Kernel Mailing ..., (Fri Apr 3, 11:00 am)