From 5ef56c8fecedf403a346d02140e52a072d693d6b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Aug 2011 14:42:51 +1000
Subject: md: report failure if a 'set faulty' request doesn't.

Sometimes a device will refuse to be set faulty.  e.g. RAID1 will
never let the last working device become faulty.

So check if "md_error()" did manage to set the faulty flag and fail
with EBUSY if it didn't.

Resolves-Debian-Bug: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=601198
Reported-by: Mike Hommey <mh+reportbug@glandium.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e221a20f5d9..1cd9bfb45e9a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2561,7 +2561,10 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	int err = -EINVAL;
 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
 		md_error(rdev->mddev, rdev);
-		err = 0;
+		if (test_bit(Faulty, &rdev->flags))
+			err = 0;
+		else
+			err = -EBUSY;
 	} else if (cmd_match(buf, "remove")) {
 		if (rdev->raid_disk >= 0)
 			err = -EBUSY;
@@ -5983,6 +5986,8 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
 		return -ENODEV;
 
 	md_error(mddev, rdev);
+	if (!test_bit(Faulty, &rdev->flags))
+		return -EBUSY;
 	return 0;
 }
 
-- 
cgit v1.2.3-58-ga151


From aeb9b211849621f592288ed5ad694de9eeaae87a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Aug 2011 14:43:08 +1000
Subject: md: ensure changes to 'write-mostly' are reflected in metadata.

The 'write-mostly' flag can be changed through sysfs.
With 0.90 metadata, those changes are reflected in the metadata.
For 1.x metadata, they aren't.

So fix super_1_sync to record 'write-mostly' status.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1cd9bfb45e9a..9a880239219d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1738,6 +1738,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
 
+	if (test_bit(WriteMostly, &rdev->flags))
+		sb->devflags |= WriteMostly1;
+	else
+		sb->devflags &= ~WriteMostly1;
+
 	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
-- 
cgit v1.2.3-58-ga151


From a5bf4df0c88b88d34b6f0e3bc8a402dac7d14611 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Thu, 25 Aug 2011 14:43:34 +1000
Subject: md: use REQ_NOIDLE flag in md_super_write()

Queue idling is used for the anticipation of immediate
sequencial I/O's but md_super_write() is a kind of one-
shot operation, coupled with md_super_wait(), so the
idling in this case will be just a waste of time.

Specifying REQ_NOIDLE prevents it. Instead of adding
the flag to submit_bio() directly, use pre-defined
macro WRITE_FLUSH_FUA.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9a880239219d..aca611711264 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -848,7 +848,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	bio->bi_end_io = super_written;
 
 	atomic_inc(&mddev->pending_writes);
-	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
+	submit_bio(WRITE_FLUSH_FUA, bio);
 }
 
 void md_super_wait(mddev_t *mddev)
-- 
cgit v1.2.3-58-ga151


From 7da64a0abc3b2c6cbd3521672e9bb74dd560bb89 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Aug 2011 16:20:17 +1000
Subject: md: fix clearing of 'blocked' flag in the presence of bad blocks.

When the 'blocked' flag on a device is cleared while there are
unacknowledged bad blocks we must fail the device.  This is needed for
backwards compatability of the interface.

The code currently uses the wrong test for "unacknowledged bad blocks
exist".  Change it to the right test.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index aca611711264..3742ce8b0acf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2592,7 +2592,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		err = 0;
 	} else if (cmd_match(buf, "-blocked")) {
 		if (!test_bit(Faulty, &rdev->flags) &&
-		    test_bit(BlockedBadBlocks, &rdev->flags)) {
+		    rdev->badblocks.unacked_exist) {
 			/* metadata handler doesn't understand badblocks,
 			 * so we need to fail the device
 			 */
-- 
cgit v1.2.3-58-ga151


From 27a7b260f71439c40546b43588448faac01adb93 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sat, 10 Sep 2011 17:21:28 +1000
Subject: md: Fix handling for devices from 2TB to 4TB in 0.90 metadata.

0.90 metadata uses an unsigned 32bit number to count the number of
kilobytes used from each device.
This should allow up to 4TB per device.
However we multiply this by 2 (to get sectors) before casting to a
larger type, so sizes above 2TB get truncated.

Also we allow rdev->sectors to be larger than 4TB, so it is possible
for the array to be resized larger than the metadata can handle.
So make sure rdev->sectors never exceeds 4TB when 0.90 metadata is in
used.

Also the sanity check at the end of super_90_load should include level
1 as it used ->size too. (RAID0 and Linear don't use ->size at all).

Reported-by: Pim Zandbergen <P.Zandbergen@macroscoop.nl>
Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3742ce8b0acf..5404b2295820 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1138,8 +1138,11 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 			ret = 0;
 	}
 	rdev->sectors = rdev->sb_start;
+	/* Limit to 4TB as metadata cannot record more than that */
+	if (rdev->sectors >= (2ULL << 32))
+		rdev->sectors = (2ULL << 32) - 2;
 
-	if (rdev->sectors < sb->size * 2 && sb->level > 1)
+	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
 		/* "this cannot possibly happen" ... */
 		ret = -EINVAL;
 
@@ -1173,7 +1176,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
-		mddev->dev_sectors = sb->size * 2;
+		mddev->dev_sectors = ((sector_t)sb->size) * 2;
 		mddev->events = ev1;
 		mddev->bitmap_info.offset = 0;
 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
@@ -1415,6 +1418,11 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 	rdev->sb_start = calc_dev_sboffset(rdev);
 	if (!num_sectors || num_sectors > rdev->sb_start)
 		num_sectors = rdev->sb_start;
+	/* Limit to 4TB as metadata cannot record more than that.
+	 * 4TB == 2^32 KB, or 2*2^32 sectors.
+	 */
+	if (num_sectors >= (2ULL << 32))
+		num_sectors = (2ULL << 32) - 2;
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-- 
cgit v1.2.3-58-ga151


From 01f96c0a9922cd9919baf9d16febdf7016177a12 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 21 Sep 2011 15:30:20 +1000
Subject: md: Avoid waking up a thread after it has been freed.

Two related problems:

1/ some error paths call "md_unregister_thread(mddev->thread)"
   without subsequently clearing ->thread.  A subsequent call
   to mddev_unlock will try to wake the thread, and crash.

2/ Most calls to md_wakeup_thread are protected against the thread
   disappeared either by:
      - holding the ->mutex
      - having an active request, so something else must be keeping
        the array active.
   However mddev_unlock calls md_wakeup_thread after dropping the
   mutex and without any certainty of an active request, so the
   ->thread could theoretically disappear.
   So we need a spinlock to provide some protections.

So change md_unregister_thread to take a pointer to the thread
pointer, and ensure that it always does the required locking, and
clears the pointer properly.

Reported-by: "Moshe Melnikov" <moshe@zadarastorage.com>
Signed-off-by: NeilBrown <neilb@suse.de>
cc: stable@kernel.org
---
 drivers/md/md.c        | 22 +++++++++++++++++++---
 drivers/md/md.h        |  2 +-
 drivers/md/multipath.c |  3 +--
 drivers/md/raid1.c     |  3 +--
 drivers/md/raid10.c    |  5 ++---
 drivers/md/raid5.c     |  6 ++----
 6 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5404b2295820..5c95ccb59500 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -61,6 +61,11 @@
 static void autostart_arrays(int part);
 #endif
 
+/* pers_list is a list of registered personalities protected
+ * by pers_lock.
+ * pers_lock does extra service to protect accesses to
+ * mddev->thread when the mutex cannot be held.
+ */
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
@@ -739,7 +744,12 @@ static void mddev_unlock(mddev_t * mddev)
 	} else
 		mutex_unlock(&mddev->reconfig_mutex);
 
+	/* was we've dropped the mutex we need a spinlock to
+	 * make sur the thread doesn't disappear
+	 */
+	spin_lock(&pers_lock);
 	md_wakeup_thread(mddev->thread);
+	spin_unlock(&pers_lock);
 }
 
 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
@@ -6429,11 +6439,18 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 	return thread;
 }
 
-void md_unregister_thread(mdk_thread_t *thread)
+void md_unregister_thread(mdk_thread_t **threadp)
 {
+	mdk_thread_t *thread = *threadp;
 	if (!thread)
 		return;
 	dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
+	/* Locking ensures that mddev_unlock does not wake_up a
+	 * non-existent thread
+	 */
+	spin_lock(&pers_lock);
+	*threadp = NULL;
+	spin_unlock(&pers_lock);
 
 	kthread_stop(thread->tsk);
 	kfree(thread);
@@ -7340,8 +7357,7 @@ static void reap_sync_thread(mddev_t *mddev)
 	mdk_rdev_t *rdev;
 
 	/* resync has finished, collect result */
-	md_unregister_thread(mddev->sync_thread);
-	mddev->sync_thread = NULL;
+	md_unregister_thread(&mddev->sync_thread);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		/* success...*/
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1e586bb4452e..0a309dc29b45 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -560,7 +560,7 @@ extern int register_md_personality(struct mdk_personality *p);
 extern int unregister_md_personality(struct mdk_personality *p);
 extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
 				mddev_t *mddev, const char *name);
-extern void md_unregister_thread(mdk_thread_t *thread);
+extern void md_unregister_thread(mdk_thread_t **threadp);
 extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_check_recovery(mddev_t *mddev);
 extern void md_write_start(mddev_t *mddev, struct bio *bi);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 3535c23af288..d5b5fb300171 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -514,8 +514,7 @@ static int multipath_stop (mddev_t *mddev)
 {
 	multipath_conf_t *conf = mddev->private;
 
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	mempool_destroy(conf->pool);
 	kfree(conf->multipaths);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f4622dd8fc59..d9587dffe533 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2562,8 +2562,7 @@ static int stop(mddev_t *mddev)
 	raise_barrier(conf);
 	lower_barrier(conf);
 
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
 	kfree(conf->mirrors);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d7a8468ddeab..0cd9672cf9cb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2955,7 +2955,7 @@ static int run(mddev_t *mddev)
 	return 0;
 
 out_free_conf:
-	md_unregister_thread(mddev->thread);
+	md_unregister_thread(&mddev->thread);
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	safe_put_page(conf->tmppage);
@@ -2973,8 +2973,7 @@ static int stop(mddev_t *mddev)
 	raise_barrier(conf, 0);
 	lower_barrier(conf);
 
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 43709fa6b6df..ac5e8b57e50f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4941,8 +4941,7 @@ static int run(mddev_t *mddev)
 
 	return 0;
 abort:
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	if (conf) {
 		print_raid5_conf(conf);
 		free_conf(conf);
@@ -4956,8 +4955,7 @@ static int stop(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev->private;
 
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	if (mddev->queue)
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 	free_conf(conf);
-- 
cgit v1.2.3-58-ga151