From b46020aa3a8a0f9c7324fe0af4aec4227f947a10 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <klamm@yandex-team.ru>
Date: Mon, 21 Dec 2015 10:50:59 +1100
Subject: md/raid5: remove redundant check in stripe_add_to_batch_list()

The stripe_add_to_batch_list() function is called only if
stripe_can_batch() returned true, so there is no need for double check.

Signed-off-by: Roman Gushchin <klamm@yandex-team.ru>
Cc: Neil Brown <neilb@suse.com>
Cc: linux-raid@vger.kernel.org
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 704ef7fcfbf8..22362505f810 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -772,8 +772,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
 	int hash;
 	int dd_idx;
 
-	if (!stripe_can_batch(sh))
-		return;
 	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
 	tmp_sec = sh->sector;
 	if (!sector_div(tmp_sec, conf->chunk_sectors))
-- 
cgit v1.2.3-58-ga151


From ac277c6a8a39bc50f891a3477625330c276bd7f5 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Mon, 21 Dec 2015 10:50:59 +1100
Subject: md-cluster: Avoid the resync ping-pong

If a RESYNCING message with (0,0) has been sent before, do not send it
again. This avoids a resync ping pong between the nodes. We read
the bitmap lockresource's LVB to figure out the previous value
of the RESYNCING message.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index d6a1126d85ce..e57bbfed1638 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -882,8 +882,16 @@ static int resync_start(struct mddev *mddev)
 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct resync_info ri;
 	struct cluster_msg cmsg = {0};
 
+	/* do not send zero again, if we have sent before */
+	if (hi == 0) {
+		memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
+		if (le64_to_cpu(ri.hi) == 0)
+			return 0;
+	}
+
 	add_resync_info(cinfo->bitmap_lockres, lo, hi);
 	/* Re-acquire the lock to refresh LVB */
 	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
-- 
cgit v1.2.3-58-ga151


From 659b254fa7392e32b59a30d4b61fb12c4cd440ff Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 21 Dec 2015 10:50:59 +1100
Subject: md-cluster: remove a disk asynchronously from cluster environment

For cluster raid, if one disk couldn't be reach in one node, then
other nodes would receive the REMOVE message for the disk.

In receiving node, we can't call md_kick_rdev_from_array to remove
the disk from array synchronously since the disk might still be busy
in this node. So let's set a ClusterRemove flag on the disk, then
let the thread to do the removal job eventually.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c |  7 +++++--
 drivers/md/md.c         | 12 ++++++++++++
 drivers/md/md.h         |  1 +
 3 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index e57bbfed1638..3fd7301fd7af 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -440,8 +440,11 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
 						   le32_to_cpu(msg->raid_slot));
 
-	if (rdev)
-		md_kick_rdev_from_array(rdev);
+	if (rdev) {
+		set_bit(ClusterRemove, &rdev->flags);
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
 	else
 		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
 			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 61aacab424cf..198e29dffb98 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8318,6 +8318,18 @@ void md_check_recovery(struct mddev *mddev)
 			goto unlock;
 		}
 
+		if (mddev_is_clustered(mddev)) {
+			struct md_rdev *rdev;
+			/* kick the device if another node issued a
+			 * remove disk.
+			 */
+			rdev_for_each(rdev, mddev) {
+				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
+						rdev->raid_disk < 0)
+					md_kick_rdev_from_array(rdev);
+			}
+		}
+
 		if (!mddev->external) {
 			int did_change = 0;
 			spin_lock(&mddev->lock);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ca0b643fe3c1..f7b17aef837d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -183,6 +183,7 @@ enum flag_bits {
 				 * Usually, this device should be faster
 				 * than other devices in the array
 				 */
+	ClusterRemove,
 };
 
 #define BB_LEN_MASK	(0x00000000000001FFULL)
-- 
cgit v1.2.3-58-ga151


From 54a88392cdd84b4a739ce3a986bfabfaff67d9d2 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Mon, 21 Dec 2015 10:51:00 +1100
Subject: md-cluster: Fix the remove sequence with the new MD reload code

The remove disk message does not need metadata_update_start(), but
can be an independent message.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 2 +-
 drivers/md/md.c         | 9 +--------
 2 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 3fd7301fd7af..b58374daff32 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -997,7 +997,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	cmsg.type = cpu_to_le32(REMOVE);
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
-	return __sendmsg(cinfo, &cmsg);
+	return sendmsg(cinfo, &cmsg);
 }
 
 static int gather_bitmaps(struct md_rdev *rdev)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 198e29dffb98..ab3995de0418 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6134,15 +6134,11 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 {
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev;
-	int ret = -1;
 
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;
 
-	if (mddev_is_clustered(mddev))
-		ret = md_cluster_ops->metadata_update_start(mddev);
-
 	if (rdev->raid_disk < 0)
 		goto kick_rdev;
 
@@ -6153,7 +6149,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 		goto busy;
 
 kick_rdev:
-	if (mddev_is_clustered(mddev) && ret == 0)
+	if (mddev_is_clustered(mddev))
 		md_cluster_ops->remove_disk(mddev, rdev);
 
 	md_kick_rdev_from_array(rdev);
@@ -6162,9 +6158,6 @@ kick_rdev:
 
 	return 0;
 busy:
-	if (mddev_is_clustered(mddev) && ret == 0)
-		md_cluster_ops->metadata_update_cancel(mddev);
-
 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
 		bdevname(rdev->bdev,b), mdname(mddev));
 	return -EBUSY;
-- 
cgit v1.2.3-58-ga151


From 09afd2a8d6ad2c40f3c1ae0b3f83784864cf4c15 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Mon, 21 Dec 2015 10:51:00 +1100
Subject: md-cluster: Allow spare devices to be marked as faulty

If a spare device was marked faulty, it would not be reflected
in receiving nodes because it would mark it as activated and continue.
Continue the operation, so it may be set as faulty.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ab3995de0418..f2f855c203e5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9106,7 +9106,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 				ret = remove_and_add_spares(mddev, rdev2);
 				pr_info("Activated spare: %s\n",
 						bdevname(rdev2->bdev,b));
-				continue;
 			}
 			/* device faulty
 			 * We just want to do the minimum to mark the disk
-- 
cgit v1.2.3-58-ga151


From f6a2dc64ee74477c966f5220b1f560ed6308d010 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 21 Dec 2015 10:51:00 +1100
Subject: md-cluster: append some actions when change bitmap from clustered to
 none

For clustered raid, we need to do extra actions when change
bitmap to none.

1. check if all the bitmap lock could be get or not, if yes then
   we can continue the change since cluster raid is only active
   in current node. Otherwise return fail and unlock the related
   bitmap locks
2. set nodes to 0 and then leave cluster environment.
3. release other nodes's bitmap lock.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md-cluster.h |  2 ++
 drivers/md/md.c         | 13 +++++++++++
 3 files changed, 72 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index b58374daff32..db9375f501ab 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -55,6 +55,7 @@ struct md_cluster_info {
 	int slot_number;
 	struct completion completion;
 	struct dlm_lock_resource *bitmap_lockres;
+	struct dlm_lock_resource **other_bitmap_lockres;
 	struct dlm_lock_resource *resync_lockres;
 	struct list_head suspend_list;
 	spinlock_t suspend_lock;
@@ -803,6 +804,7 @@ static void resync_bitmap(struct mddev *mddev)
 			__func__, __LINE__, err);
 }
 
+static void unlock_all_bitmaps(struct mddev *mddev);
 static int leave(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -823,6 +825,7 @@ static int leave(struct mddev *mddev)
 	lockres_free(cinfo->ack_lockres);
 	lockres_free(cinfo->no_new_dev_lockres);
 	lockres_free(cinfo->bitmap_lockres);
+	unlock_all_bitmaps(mddev);
 	dlm_release_lockspace(cinfo->lockspace, 2);
 	return 0;
 }
@@ -1000,6 +1003,58 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	return sendmsg(cinfo, &cmsg);
 }
 
+static int lock_all_bitmaps(struct mddev *mddev)
+{
+	int slot, my_slot, ret, held = 1, i = 0;
+	char str[64];
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	cinfo->other_bitmap_lockres = kzalloc((mddev->bitmap_info.nodes - 1) *
+					     sizeof(struct dlm_lock_resource *),
+					     GFP_KERNEL);
+	if (!cinfo->other_bitmap_lockres) {
+		pr_err("md: can't alloc mem for other bitmap locks\n");
+		return 0;
+	}
+
+	my_slot = slot_number(mddev);
+	for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) {
+		if (slot == my_slot)
+			continue;
+
+		memset(str, '\0', 64);
+		snprintf(str, 64, "bitmap%04d", slot);
+		cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1);
+		if (!cinfo->other_bitmap_lockres[i])
+			return -ENOMEM;
+
+		cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE;
+		ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW);
+		if (ret)
+			held = -1;
+		i++;
+	}
+
+	return held;
+}
+
+static void unlock_all_bitmaps(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	int i;
+
+	/* release other node's bitmap lock if they are existed */
+	if (cinfo->other_bitmap_lockres) {
+		for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) {
+			if (cinfo->other_bitmap_lockres[i]) {
+				dlm_unlock_sync(cinfo->other_bitmap_lockres[i]);
+				lockres_free(cinfo->other_bitmap_lockres[i]);
+			}
+		}
+		kfree(cinfo->other_bitmap_lockres);
+	}
+}
+
 static int gather_bitmaps(struct md_rdev *rdev)
 {
 	int sn, err;
@@ -1045,6 +1100,8 @@ static struct md_cluster_operations cluster_ops = {
 	.new_disk_ack = new_disk_ack,
 	.remove_disk = remove_disk,
 	.gather_bitmaps = gather_bitmaps,
+	.lock_all_bitmaps = lock_all_bitmaps,
+	.unlock_all_bitmaps = unlock_all_bitmaps,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index e75ea2613184..45ce6c97d8bd 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -24,6 +24,8 @@ struct md_cluster_operations {
 	int (*new_disk_ack)(struct mddev *mddev, bool ack);
 	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
 	int (*gather_bitmaps)(struct md_rdev *rdev);
+	int (*lock_all_bitmaps)(struct mddev *mddev);
+	void (*unlock_all_bitmaps)(struct mddev *mddev);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f2f855c203e5..495d8aa0a0d2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6599,6 +6599,19 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 				rv = -EINVAL;
 				goto err;
 			}
+			if (mddev->bitmap_info.nodes) {
+				/* hold PW on all the bitmap lock */
+				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
+					printk("md: can't change bitmap to none since the"
+					       " array is in use by more than one node\n");
+					rv = -EPERM;
+					md_cluster_ops->unlock_all_bitmaps(mddev);
+					goto err;
+				}
+
+				mddev->bitmap_info.nodes = 0;
+				md_cluster_ops->leave(mddev);
+			}
 			mddev->pers->quiesce(mddev, 1);
 			bitmap_destroy(mddev);
 			mddev->pers->quiesce(mddev, 0);
-- 
cgit v1.2.3-58-ga151


From 15858fa5b00c1067a8a8e53ea32f4a65f8bebbb8 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 21 Dec 2015 10:51:00 +1100
Subject: md-cluster: Defer MD reloading to mddev->thread

Reloading of superblock must be performed under reconfig_mutex. However,
this cannot be done with md_reload_sb because it would deadlock with
the message DLM lock. So, we defer it in md_check_recovery() which is
executed by mddev->thread.

This introduces a new flag, MD_RELOAD_SB, which if set, will reload the
superblock. And good_device_nr is also added to 'struct mddev' which is
used to get the num of the good device within cluster raid.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 4 +++-
 drivers/md/md.c         | 4 ++++
 drivers/md/md.h         | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index db9375f501ab..b659ef7b8daf 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -432,8 +432,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
+	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
+	set_bit(MD_RELOAD_SB, &mddev->flags);
 	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+	md_wakeup_thread(mddev->thread);
 }
 
 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 495d8aa0a0d2..504ce5d068ce 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8286,6 +8286,7 @@ void md_check_recovery(struct mddev *mddev)
 		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+		test_bit(MD_RELOAD_SB, &mddev->flags) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -8334,6 +8335,9 @@ void md_check_recovery(struct mddev *mddev)
 						rdev->raid_disk < 0)
 					md_kick_rdev_from_array(rdev);
 			}
+
+			if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
+				md_reload_sb(mddev, mddev->good_device_nr);
 		}
 
 		if (!mddev->external) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f7b17aef837d..8817e623258a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -235,6 +235,9 @@ struct mddev {
 				 */
 #define MD_JOURNAL_CLEAN 5	/* A raid with journal is already clean */
 #define MD_HAS_JOURNAL	6	/* The raid array has journal feature set */
+#define MD_RELOAD_SB	7	/* Reload the superblock because another node
+				 * updated it.
+				 */
 
 	int				suspended;
 	atomic_t			active_io;
@@ -465,6 +468,7 @@ struct mddev {
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
+	unsigned int			good_device_nr;	/* good device num within cluster raid */
 };
 
 static inline int __must_check mddev_lock(struct mddev *mddev)
-- 
cgit v1.2.3-58-ga151


From 8b9277c81450de9d8081ff6571ac5986e6c83f49 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 21 Dec 2015 10:51:00 +1100
Subject: md-cluster: Protect communication with mutexes

Communication can happen through multiple threads. It is possible that
one thread steps over another threads sequence. So, we use mutexes to
protect both the send and receive sequences.

Send communication is locked through state bit, MD_CLUSTER_SEND_LOCK.
Communication is locked with bit manipulation in order to allow
"lock and hold" for the add operation. In case of an add operation,
if the lock is held, MD_CLUSTER_SEND_LOCKED_ALREADY is set.
When md_update_sb() calls metadata_update_start(), it checks
(in a single statement to avoid races), if the communication
is already locked. If yes, it merely returns zero, else it
locks the token lockresource.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 73 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index b659ef7b8daf..ad3ec7df1547 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -48,12 +48,26 @@ struct resync_info {
 #define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
 #define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3
 
+/* Lock the send communication. This is done through
+ * bit manipulation as opposed to a mutex in order to
+ * accomodate lock and hold. See next comment.
+ */
+#define		MD_CLUSTER_SEND_LOCK			4
+/* If cluster operations must lock the communication channel,
+ * so as to perform extra operations (and no other operation
+ * is allowed on the MD, such as adding a disk. Token needs
+ * to be locked and held until the operation completes with
+ * a md_update_sb(), which would eventually release the lock.
+ */
+#define		MD_CLUSTER_SEND_LOCKED_ALREADY		5
+
 
 struct md_cluster_info {
 	/* dlm lock space and resources for clustered raid. */
 	dlm_lockspace_t *lockspace;
 	int slot_number;
 	struct completion completion;
+	struct mutex recv_mutex;
 	struct dlm_lock_resource *bitmap_lockres;
 	struct dlm_lock_resource **other_bitmap_lockres;
 	struct dlm_lock_resource *resync_lockres;
@@ -68,6 +82,7 @@ struct md_cluster_info {
 	struct dlm_lock_resource *no_new_dev_lockres;
 	struct md_thread *recv_thread;
 	struct completion newdisk_completion;
+	wait_queue_head_t wait;
 	unsigned long state;
 };
 
@@ -508,9 +523,11 @@ static void recv_daemon(struct md_thread *thread)
 	struct cluster_msg msg;
 	int ret;
 
+	mutex_lock(&cinfo->recv_mutex);
 	/*get CR on Message*/
 	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
 		pr_err("md/raid1:failed to get CR on MESSAGE\n");
+		mutex_unlock(&cinfo->recv_mutex);
 		return;
 	}
 
@@ -534,33 +551,45 @@ static void recv_daemon(struct md_thread *thread)
 	ret = dlm_unlock_sync(message_lockres);
 	if (unlikely(ret != 0))
 		pr_info("unlock msg failed return %d\n", ret);
+	mutex_unlock(&cinfo->recv_mutex);
 }
 
-/* lock_comm()
+/* lock_token()
  * Takes the lock on the TOKEN lock resource so no other
  * node can communicate while the operation is underway.
- * If called again, and the TOKEN lock is alread in EX mode
- * return success. However, care must be taken that unlock_comm()
- * is called only once.
  */
-static int lock_comm(struct md_cluster_info *cinfo)
+static int lock_token(struct md_cluster_info *cinfo)
 {
 	int error;
 
-	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
-		return 0;
-
 	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
 	if (error)
 		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
 				__func__, __LINE__, error);
+
+	/* Lock the receive sequence */
+	mutex_lock(&cinfo->recv_mutex);
 	return error;
 }
 
+/* lock_comm()
+ * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
+ */
+static int lock_comm(struct md_cluster_info *cinfo)
+{
+	wait_event(cinfo->wait,
+		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
+
+	return lock_token(cinfo);
+}
+
 static void unlock_comm(struct md_cluster_info *cinfo)
 {
 	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
+	mutex_unlock(&cinfo->recv_mutex);
 	dlm_unlock_sync(cinfo->token_lockres);
+	clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state);
+	wake_up(&cinfo->wait);
 }
 
 /* __sendmsg()
@@ -713,6 +742,8 @@ static int join(struct mddev *mddev, int nodes)
 	spin_lock_init(&cinfo->suspend_lock);
 	init_completion(&cinfo->completion);
 	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
+	init_waitqueue_head(&cinfo->wait);
+	mutex_init(&cinfo->recv_mutex);
 
 	mddev->cluster_info = cinfo;
 
@@ -843,9 +874,25 @@ static int slot_number(struct mddev *mddev)
 	return cinfo->slot_number - 1;
 }
 
+/*
+ * Check if the communication is already locked, else lock the communication
+ * channel.
+ * If it is already locked, token is in EX mode, and hence lock_token()
+ * should not be called.
+ */
 static int metadata_update_start(struct mddev *mddev)
 {
-	return lock_comm(mddev->cluster_info);
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	wait_event(cinfo->wait,
+		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
+		   test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
+
+	/* If token is already locked, return 0 */
+	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+		return 0;
+
+	return lock_token(cinfo);
 }
 
 static int metadata_update_finish(struct mddev *mddev)
@@ -870,6 +917,7 @@ static int metadata_update_finish(struct mddev *mddev)
 		ret = __sendmsg(cinfo, &cmsg);
 	} else
 		pr_warn("md-cluster: No good device id found to send\n");
+	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 	unlock_comm(cinfo);
 	return ret;
 }
@@ -877,6 +925,7 @@ static int metadata_update_finish(struct mddev *mddev)
 static void metadata_update_cancel(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 	unlock_comm(cinfo);
 }
 
@@ -970,14 +1019,18 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
 		ret = -ENOENT;
 	if (ret)
 		unlock_comm(cinfo);
-	else
+	else {
 		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+		set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
+		wake_up(&cinfo->wait);
+	}
 	return ret;
 }
 
 static void add_new_disk_cancel(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 	unlock_comm(cinfo);
 }
 
-- 
cgit v1.2.3-58-ga151


From e19508fa4df896b115f5321c21ce7669559b0863 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 21 Dec 2015 10:51:01 +1100
Subject: md-cluster: update comments for MD_CLUSTER_SEND_LOCKED_ALREADY

1. fix unbalanced parentheses.
2. add more description about that MD_CLUSTER_SEND_LOCKED_ALREADY
   will be cleared after set it in add_new_disk.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index ad3ec7df1547..0ded8e97751d 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -53,11 +53,12 @@ struct resync_info {
  * accomodate lock and hold. See next comment.
  */
 #define		MD_CLUSTER_SEND_LOCK			4
-/* If cluster operations must lock the communication channel,
- * so as to perform extra operations (and no other operation
- * is allowed on the MD, such as adding a disk. Token needs
- * to be locked and held until the operation completes with
- * a md_update_sb(), which would eventually release the lock.
+/* If cluster operations (such as adding a disk) must lock the
+ * communication channel, so as to perform extra operations
+ * (update metadata) and no other operation is allowed on the
+ * MD. Token needs to be locked and held until the operation
+ * completes witha md_update_sb(), which would eventually release
+ * the lock.
  */
 #define		MD_CLUSTER_SEND_LOCKED_ALREADY		5
 
@@ -1021,6 +1022,18 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
 		unlock_comm(cinfo);
 	else {
 		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+		/* Since MD_CHANGE_DEVS will be set in add_bound_rdev which
+		 * will run soon after add_new_disk, the below path will be
+		 * invoked:
+		 *   md_wakeup_thread(mddev->thread)
+		 *	-> conf->thread (raid1d)
+		 *	-> md_check_recovery -> md_update_sb
+		 *	-> metadata_update_start/finish
+		 * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually.
+		 *
+		 * For other failure cases, metadata_update_cancel and
+		 * add_new_disk_cancel also clear below bit as well.
+		 * */
 		set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 		wake_up(&cinfo->wait);
 	}
-- 
cgit v1.2.3-58-ga151


From abf3508d8faa281e01a780e022a6f43d1731fe0b Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 21 Dec 2015 10:51:01 +1100
Subject: md: update comment for md_allow_write

MD_CHANGE_CLEAN had been replaced with MD_CHANGE_PENDING after
commit 070dc6 ("md: resolve confusion of MD_CHANGE_CLEAN"),
so make the change accordingly.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 504ce5d068ce..f71a81b37d08 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7714,7 +7714,7 @@ EXPORT_SYMBOL(md_write_end);
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
  *
- * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock
  * is dropped, so return -EAGAIN after notifying userspace.
  */
 int md_allow_write(struct mddev *mddev)
-- 
cgit v1.2.3-58-ga151


From 3848c0bcb09c7b78e6f4ae9f8fc8d6d9aecbd35a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 21 Dec 2015 10:51:01 +1100
Subject: raid5-cache: simplify r5l_move_io_unit_list

It's only used for one kind of move, so make that explicit.  Also clean
up the code a bit by using list_for_each_safe.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index b887e04d7e5c..3699c4704ba8 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -156,21 +156,6 @@ static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
 	kmem_cache_free(log->io_kc, io);
 }
 
-static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
-				  enum r5l_io_unit_state state)
-{
-	struct r5l_io_unit *io;
-
-	while (!list_empty(from)) {
-		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
-		/* don't change list order */
-		if (io->state >= state)
-			list_move_tail(&io->log_sibling, to);
-		else
-			break;
-	}
-}
-
 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 				    enum r5l_io_unit_state state)
 {
@@ -206,6 +191,20 @@ static void r5l_log_run_stripes(struct r5l_log *log)
 	}
 }
 
+static void r5l_move_to_end_ios(struct r5l_log *log)
+{
+	struct r5l_io_unit *io, *next;
+
+	assert_spin_locked(&log->io_list_lock);
+
+	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
+		/* don't change list order */
+		if (io->state < IO_UNIT_IO_END)
+			break;
+		list_move_tail(&io->log_sibling, &log->io_end_ios);
+	}
+}
+
 static void r5l_log_endio(struct bio *bio)
 {
 	struct r5l_io_unit *io = bio->bi_private;
@@ -220,8 +219,7 @@ static void r5l_log_endio(struct bio *bio)
 	spin_lock_irqsave(&log->io_list_lock, flags);
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
 	if (log->need_cache_flush)
-		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
-				      IO_UNIT_IO_END);
+		r5l_move_to_end_ios(log);
 	else
 		r5l_log_run_stripes(log);
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
-- 
cgit v1.2.3-58-ga151


From ad66d445ee5a5f548142b880e1642c711fbcacd1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 21 Dec 2015 10:51:01 +1100
Subject: raid5-cache: free meta_page earlier

Once the I/O completed we don't need the meta page anymore.  As the iounits
can live on for a long time this reduces memory pressure a bit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3699c4704ba8..668e973f07e6 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -150,12 +150,6 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
 	return log->device_size > used_size + size;
 }
 
-static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
-{
-	__free_page(io->meta_page);
-	kmem_cache_free(log->io_kc, io);
-}
-
 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 				    enum r5l_io_unit_state state)
 {
@@ -215,6 +209,7 @@ static void r5l_log_endio(struct bio *bio)
 		md_error(log->rdev->mddev, log->rdev);
 
 	bio_put(bio);
+	__free_page(io->meta_page);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
@@ -552,7 +547,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 		log->next_cp_seq = io->seq;
 
 		list_del(&io->log_sibling);
-		r5l_free_io_unit(log, io);
+		kmem_cache_free(log->io_kc, io);
 
 		found = true;
 	}
-- 
cgit v1.2.3-58-ga151


From 3312c951efaba55080958974047414576b9e5d63 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 21 Dec 2015 10:51:01 +1100
Subject: md: avoid warning for 32-bit sector_t

When CONFIG_LBDAF is not set, sector_t is only 32-bits wide, which
means we cannot have devices with more than 2TB, and the code that
is trying to handle compatibility support for large devices in
md version 0.90 is meaningless but also causes a compile-time warning:

drivers/md/md.c: In function 'super_90_load':
drivers/md/md.c:1029:19: warning: large integer implicitly truncated to unsigned type [-Woverflow]
drivers/md/md.c: In function 'super_90_rdev_size_change':
drivers/md/md.c:1323:17: warning: large integer implicitly truncated to unsigned type [-Woverflow]

This adds a check for CONFIG_LBDAF to avoid even getting into this
code path, and also adds an explicit cast to let the compiler know
it doesn't have to warn about the truncation.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f71a81b37d08..3d70d0d11b95 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1026,8 +1026,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 	 * (not needed for Linear and RAID0 as metadata doesn't
 	 * record this size)
 	 */
-	if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
-		rdev->sectors = (2ULL << 32) - 2;
+	if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
+	    sb->level >= 1)
+		rdev->sectors = (sector_t)(2ULL << 32) - 2;
 
 	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
 		/* "this cannot possibly happen" ... */
@@ -1320,8 +1321,9 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	/* Limit to 4TB as metadata cannot record more than that.
 	 * 4TB == 2^32 KB, or 2*2^32 sectors.
 	 */
-	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
-		num_sectors = (2ULL << 32) - 2;
+	if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
+	    rdev->mddev->level >= 1)
+		num_sectors = (sector_t)(2ULL << 32) - 2;
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-- 
cgit v1.2.3-58-ga151


From 9ebc6ef188a0656f3620835f9be7fe22c1644c1c Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Mon, 21 Dec 2015 10:51:01 +1100
Subject: drivers: md: use ktime_get_real_seconds()

get_seconds() API is not y2038 safe on 32 bit systems and the API
is deprecated. Replace it with calls to ktime_get_real_seconds()
API instead. Change mddev structure types to time64_t accordingly.

32 bit signed timestamps will overflow in the year 2038.

Change the user interface mdu_array_info_s structure timestamps:
ctime and utime values used in ioctls GET_ARRAY_INFO and
SET_ARRAY_INFO to unsigned int. This will extend the field to last
until the year 2106.
The long term plan is to get rid of ctime and utime values in
this structure as this information can be read from the on-disk
meta data directly.

Clamp the tim64_t timestamps to positive values with a max of U32_MAX
when returning from GET_ARRAY_INFO ioctl to accommodate above changes
in the data type of timestamps to unsigned int.

v0.90 on disk meta data uses u32 for maintaining time stamps.
So this will also last until year 2106.
Assumption is that the usage of v0.90 will be deprecated by
year 2106.

Timestamp fields in the on disk meta data for v1.0 version already
use 64 bit data types. Remove the truncation of the bits while
writing to or reading from these from the disk.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c                | 18 +++++++++---------
 drivers/md/md.h                |  2 +-
 include/uapi/linux/raid/md_u.h |  4 ++--
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3d70d0d11b95..d0f0621bf9b0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1200,13 +1200,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
 	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
 	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
 
-	sb->ctime = mddev->ctime;
+	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
 	sb->level = mddev->level;
 	sb->size = mddev->dev_sectors / 2;
 	sb->raid_disks = mddev->raid_disks;
 	sb->md_minor = mddev->md_minor;
 	sb->not_persistent = 0;
-	sb->utime = mddev->utime;
+	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
 	sb->state = 0;
 	sb->events_hi = (mddev->events>>32);
 	sb->events_lo = (u32)mddev->events;
@@ -1547,8 +1547,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		mddev->patch_version = 0;
 		mddev->external = 0;
 		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
-		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
-		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+		mddev->ctime = le64_to_cpu(sb->ctime);
+		mddev->utime = le64_to_cpu(sb->utime);
 		mddev->level = le32_to_cpu(sb->level);
 		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
@@ -2336,7 +2336,7 @@ repeat:
 
 	spin_lock(&mddev->lock);
 
-	mddev->utime = get_seconds();
+	mddev->utime = ktime_get_real_seconds();
 
 	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
 		force_change = 1;
@@ -5843,7 +5843,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 	info.major_version = mddev->major_version;
 	info.minor_version = mddev->minor_version;
 	info.patch_version = MD_PATCHLEVEL_VERSION;
-	info.ctime         = mddev->ctime;
+	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
 	info.level         = mddev->level;
 	info.size          = mddev->dev_sectors / 2;
 	if (info.size != mddev->dev_sectors / 2) /* overflow */
@@ -5853,7 +5853,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 	info.md_minor      = mddev->md_minor;
 	info.not_persistent= !mddev->persistent;
 
-	info.utime         = mddev->utime;
+	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
 	info.state         = 0;
 	if (mddev->in_sync)
 		info.state = (1<<MD_SB_CLEAN);
@@ -6353,13 +6353,13 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		/* ensure mddev_put doesn't delete this now that there
 		 * is some minimal configuration.
 		 */
-		mddev->ctime         = get_seconds();
+		mddev->ctime         = ktime_get_real_seconds();
 		return 0;
 	}
 	mddev->major_version = MD_MAJOR_VERSION;
 	mddev->minor_version = MD_MINOR_VERSION;
 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
-	mddev->ctime         = get_seconds();
+	mddev->ctime         = ktime_get_real_seconds();
 
 	mddev->level         = info->level;
 	mddev->clevel[0]     = 0;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8817e623258a..e16a17c37418 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -264,7 +264,7 @@ struct mddev {
 							 * managed externally */
 	char				metadata_type[17]; /* externally set*/
 	int				chunk_sectors;
-	time_t				ctime, utime;
+	time64_t			ctime, utime;
 	int				level, layout;
 	char				clevel[16];
 	int				raid_disks;
diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h
index 1cb8aa6850b5..36cd8210a5d1 100644
--- a/include/uapi/linux/raid/md_u.h
+++ b/include/uapi/linux/raid/md_u.h
@@ -80,7 +80,7 @@ typedef struct mdu_array_info_s {
 	int major_version;
 	int minor_version;
 	int patch_version;
-	int ctime;
+	unsigned int ctime;
 	int level;
 	int size;
 	int nr_disks;
@@ -91,7 +91,7 @@ typedef struct mdu_array_info_s {
 	/*
 	 * Generic state information
 	 */
-	int utime;		/*  0 Superblock update time		      */
+	unsigned int utime;	/*  0 Superblock update time		      */
 	int state;		/*  1 State bits (clean, ...)		      */
 	int active_disks;	/*  2 Number of currently active disks	      */
 	int working_disks;	/*  3 Number of working disks		      */
-- 
cgit v1.2.3-58-ga151


From f6b6ec5cfac306c1eea66f074050864efcb11851 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 21 Dec 2015 10:51:02 +1100
Subject: raid5-cache: add journal hot add/remove support

Add support for journal disk hot add/remove. Mostly trival checks in md
part. The raid5 part is a little tricky. For hot-remove, we can't wait
pending write as it's called from raid5d. The wait will cause deadlock.
We simplily fail the hot-remove. A hot-remove retry can success
eventually since if journal disk is faulty all pending write will be
failed and finish. For hot-add, since an array supporting journal but
without journal disk will be marked read-only, we are safe to hot add
journal without stopping IO (should be read IO, while journal only
handles write IO).

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c          | 42 ++++++++++++++++++++++++++++++------------
 drivers/md/raid5-cache.c | 16 ++++++++++++----
 drivers/md/raid5.c       | 34 ++++++++++++++++++++++++++--------
 3 files changed, 68 insertions(+), 24 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d0f0621bf9b0..c0c3e6dec248 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2055,8 +2055,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 		return -EEXIST;
 
 	/* make sure rdev->sectors exceeds mddev->dev_sectors */
-	if (rdev->sectors && (mddev->dev_sectors == 0 ||
-			rdev->sectors < mddev->dev_sectors)) {
+	if (!test_bit(Journal, &rdev->flags) &&
+	    rdev->sectors &&
+	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
 		if (mddev->pers) {
 			/* Cannot change size, so fail
 			 * If mddev->level <= 0, then we don't care
@@ -2087,7 +2088,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 		}
 	}
 	rcu_read_unlock();
-	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
+	if (!test_bit(Journal, &rdev->flags) &&
+	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
 		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
 		       mdname(mddev), mddev->max_disks);
 		return -EBUSY;
@@ -6044,8 +6046,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		else
 			clear_bit(WriteMostly, &rdev->flags);
 
-		if (info->state & (1<<MD_DISK_JOURNAL))
+		if (info->state & (1<<MD_DISK_JOURNAL)) {
+			struct md_rdev *rdev2;
+			bool has_journal = false;
+
+			/* make sure no existing journal disk */
+			rdev_for_each(rdev2, mddev) {
+				if (test_bit(Journal, &rdev2->flags)) {
+					has_journal = true;
+					break;
+				}
+			}
+			if (has_journal) {
+				export_rdev(rdev);
+				return -EBUSY;
+			}
 			set_bit(Journal, &rdev->flags);
+		}
 		/*
 		 * check whether the device shows up in other nodes
 		 */
@@ -8181,19 +8198,20 @@ static int remove_and_add_spares(struct mddev *mddev,
 			continue;
 		if (test_bit(Faulty, &rdev->flags))
 			continue;
-		if (test_bit(Journal, &rdev->flags))
-			continue;
-		if (mddev->ro &&
-		    ! (rdev->saved_raid_disk >= 0 &&
-		       !test_bit(Bitmap_sync, &rdev->flags)))
-			continue;
+		if (!test_bit(Journal, &rdev->flags)) {
+			if (mddev->ro &&
+			    ! (rdev->saved_raid_disk >= 0 &&
+			       !test_bit(Bitmap_sync, &rdev->flags)))
+				continue;
 
-		rdev->recovery_offset = 0;
+			rdev->recovery_offset = 0;
+		}
 		if (mddev->pers->
 		    hot_add_disk(mddev, rdev) == 0) {
 			if (sysfs_link_rdev(mddev, rdev))
 				/* failure here is OK */;
-			spares++;
+			if (!test_bit(Journal, &rdev->flags))
+				spares++;
 			md_new_event(mddev);
 			set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 668e973f07e6..c1c4d213a2c2 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -799,10 +799,18 @@ void r5l_quiesce(struct r5l_log *log, int state)
 
 bool r5l_log_disk_error(struct r5conf *conf)
 {
+	struct r5l_log *log;
+	bool ret;
 	/* don't allow write if journal disk is missing */
-	if (!conf->log)
-		return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
-	return test_bit(Faulty, &conf->log->rdev->flags);
+	rcu_read_lock();
+	log = rcu_dereference(conf->log);
+
+	if (!log)
+		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
+	else
+		ret = test_bit(Faulty, &log->rdev->flags);
+	rcu_read_unlock();
+	return ret;
 }
 
 struct r5l_recovery_ctx {
@@ -1165,7 +1173,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (r5l_load_log(log))
 		goto error;
 
-	conf->log = log;
+	rcu_assign_pointer(conf->log, log);
 	return 0;
 error:
 	md_unregister_thread(&log->reclaim_thread);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 22362505f810..a086014dcd49 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7139,14 +7139,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct disk_info *p = conf->disks + number;
 
 	print_raid5_conf(conf);
-	if (test_bit(Journal, &rdev->flags)) {
+	if (test_bit(Journal, &rdev->flags) && conf->log) {
+		struct r5l_log *log;
 		/*
-		 * journal disk is not removable, but we need give a chance to
-		 * update superblock of other disks. Otherwise journal disk
-		 * will be considered as 'fresh'
+		 * we can't wait pending write here, as this is called in
+		 * raid5d, wait will deadlock.
 		 */
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		return -EINVAL;
+		if (atomic_read(&mddev->writes_pending))
+			return -EBUSY;
+		log = conf->log;
+		conf->log = NULL;
+		synchronize_rcu();
+		r5l_exit_log(log);
+		return 0;
 	}
 	if (rdev == p->rdev)
 		rdevp = &p->rdev;
@@ -7210,8 +7215,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (test_bit(Journal, &rdev->flags))
-		return -EINVAL;
+	if (test_bit(Journal, &rdev->flags)) {
+		char b[BDEVNAME_SIZE];
+		if (conf->log)
+			return -EBUSY;
+
+		rdev->raid_disk = 0;
+		/*
+		 * The array is in readonly mode if journal is missing, so no
+		 * write requests running. We should be safe
+		 */
+		r5l_init_log(conf, rdev);
+		printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
+		       mdname(mddev), bdevname(rdev->bdev, b));
+		return 0;
+	}
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
 
-- 
cgit v1.2.3-58-ga151


From c38d29b33bb3b3c792f3cca8a973422bb1897ebf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 21 Dec 2015 10:51:02 +1100
Subject: raid5-cache: use a bio_set

This allows us to make guaranteed forward progress.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index c1c4d213a2c2..2a644977d90c 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -34,6 +34,12 @@
 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
 
+/*
+ * We only need 2 bios per I/O unit to make progress, but ensure we
+ * have a few more available to not get too tight.
+ */
+#define R5L_POOL_SIZE	4
+
 struct r5l_log {
 	struct md_rdev *rdev;
 
@@ -70,6 +76,7 @@ struct r5l_log {
 	struct bio flush_bio;
 
 	struct kmem_cache *io_kc;
+	struct bio_set *bs;
 
 	struct md_thread *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
@@ -248,7 +255,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
 {
-	struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
+	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
 
 	bio->bi_rw = WRITE;
 	bio->bi_bdev = log->rdev->bdev;
@@ -1161,6 +1168,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_kc)
 		goto io_kc;
 
+	log->bs = bioset_create(R5L_POOL_SIZE, 0);
+	if (!log->bs)
+		goto io_bs;
+
 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
@@ -1178,6 +1189,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 error:
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
+	bioset_free(log->bs);
+io_bs:
 	kmem_cache_destroy(log->io_kc);
 io_kc:
 	kfree(log);
@@ -1187,6 +1200,7 @@ io_kc:
 void r5l_exit_log(struct r5l_log *log)
 {
 	md_unregister_thread(&log->reclaim_thread);
+	bioset_free(log->bs);
 	kmem_cache_destroy(log->io_kc);
 	kfree(log);
 }
-- 
cgit v1.2.3-58-ga151


From e8deb6381051bf3ce9d817020e8ba972b405a070 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 21 Dec 2015 10:51:02 +1100
Subject: raid5-cache: use a mempool for the metadata block

We only have a limited number in flight, so use a page based mempool.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2a644977d90c..fa2d6321f3a4 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -77,6 +77,7 @@ struct r5l_log {
 
 	struct kmem_cache *io_kc;
 	struct bio_set *bs;
+	mempool_t *meta_pool;
 
 	struct md_thread *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
@@ -216,7 +217,7 @@ static void r5l_log_endio(struct bio *bio)
 		md_error(log->rdev->mddev, log->rdev);
 
 	bio_put(bio);
-	__free_page(io->meta_page);
+	mempool_free(io->meta_page, log->meta_pool);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
@@ -293,8 +294,9 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	INIT_LIST_HEAD(&io->stripe_list);
 	io->state = IO_UNIT_RUNNING;
 
-	io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
+	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
 	block = page_address(io->meta_page);
+	clear_page(block);
 	block->magic = cpu_to_le32(R5LOG_MAGIC);
 	block->version = R5LOG_VERSION;
 	block->seq = cpu_to_le64(log->seq);
@@ -1172,6 +1174,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->bs)
 		goto io_bs;
 
+	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
+	if (!log->meta_pool)
+		goto out_mempool;
+
 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
@@ -1186,9 +1192,12 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	rcu_assign_pointer(conf->log, log);
 	return 0;
+
 error:
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
+	mempool_destroy(log->meta_pool);
+out_mempool:
 	bioset_free(log->bs);
 io_bs:
 	kmem_cache_destroy(log->io_kc);
@@ -1200,6 +1209,7 @@ io_kc:
 void r5l_exit_log(struct r5l_log *log)
 {
 	md_unregister_thread(&log->reclaim_thread);
+	mempool_destroy(log->meta_pool);
 	bioset_free(log->bs);
 	kmem_cache_destroy(log->io_kc);
 	kfree(log);
-- 
cgit v1.2.3-58-ga151


From 5036c3902054358ee293b8cecfea13342d8019e8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 21 Dec 2015 10:51:02 +1100
Subject: raid5: allow r5l_io_unit allocations to fail

And propagate the error up the stack so we can add the stripe
to no_stripes_list and retry our log operation later.  This avoids
blocking raid5d due to reclaim, an it allows to get rid of the
deadlock-prone GFP_NOFAIL allocation.

shli: add missing mempool_destroy()

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c | 67 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 57 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index fa2d6321f3a4..6d2b4789a928 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -75,7 +75,10 @@ struct r5l_log {
 	struct list_head finished_ios;	/* io_units which settle down in log disk */
 	struct bio flush_bio;
 
+	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
+
 	struct kmem_cache *io_kc;
+	mempool_t *io_pool;
 	struct bio_set *bs;
 	mempool_t *meta_pool;
 
@@ -287,8 +290,11 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	struct r5l_io_unit *io;
 	struct r5l_meta_block *block;
 
-	/* We can't handle memory allocate failure so far */
-	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
+	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
+	if (!io)
+		return NULL;
+	memset(io, 0, sizeof(*io));
+
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
@@ -326,8 +332,12 @@ static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
 	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
 		r5l_submit_current_io(log);
 
-	if (!log->current_io)
+	if (!log->current_io) {
 		log->current_io = r5l_new_meta(log);
+		if (!log->current_io)
+			return -ENOMEM;
+	}
+
 	return 0;
 }
 
@@ -372,11 +382,12 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 	r5_reserve_log_entry(log, io);
 }
 
-static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
+static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 			   int data_pages, int parity_pages)
 {
 	int i;
 	int meta_size;
+	int ret;
 	struct r5l_io_unit *io;
 
 	meta_size =
@@ -385,7 +396,10 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 		sizeof(struct r5l_payload_data_parity) +
 		sizeof(__le32) * parity_pages;
 
-	r5l_get_meta(log, meta_size);
+	ret = r5l_get_meta(log, meta_size);
+	if (ret)
+		return ret;
+
 	io = log->current_io;
 
 	for (i = 0; i < sh->disks; i++) {
@@ -415,6 +429,8 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	list_add_tail(&sh->log_list, &io->stripe_list);
 	atomic_inc(&io->pending_stripe);
 	sh->log_io = io;
+
+	return 0;
 }
 
 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
@@ -429,6 +445,7 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	int meta_size;
 	int reserve;
 	int i;
+	int ret = 0;
 
 	if (!log)
 		return -EAGAIN;
@@ -477,17 +494,22 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	mutex_lock(&log->io_mutex);
 	/* meta + data */
 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
-	if (r5l_has_free_space(log, reserve))
-		r5l_log_stripe(log, sh, data_pages, parity_pages);
-	else {
+	if (!r5l_has_free_space(log, reserve)) {
 		spin_lock(&log->no_space_stripes_lock);
 		list_add_tail(&sh->log_list, &log->no_space_stripes);
 		spin_unlock(&log->no_space_stripes_lock);
 
 		r5l_wake_reclaim(log, reserve);
+	} else {
+		ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
+		if (ret) {
+			spin_lock_irq(&log->io_list_lock);
+			list_add_tail(&sh->log_list, &log->no_mem_stripes);
+			spin_unlock_irq(&log->io_list_lock);
+		}
 	}
-	mutex_unlock(&log->io_mutex);
 
+	mutex_unlock(&log->io_mutex);
 	return 0;
 }
 
@@ -540,6 +562,21 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log)
 				 log->next_checkpoint);
 }
 
+static void r5l_run_no_mem_stripe(struct r5l_log *log)
+{
+	struct stripe_head *sh;
+
+	assert_spin_locked(&log->io_list_lock);
+
+	if (!list_empty(&log->no_mem_stripes)) {
+		sh = list_first_entry(&log->no_mem_stripes,
+				      struct stripe_head, log_list);
+		list_del_init(&sh->log_list);
+		set_bit(STRIPE_HANDLE, &sh->state);
+		raid5_release_stripe(sh);
+	}
+}
+
 static bool r5l_complete_finished_ios(struct r5l_log *log)
 {
 	struct r5l_io_unit *io, *next;
@@ -556,7 +593,8 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 		log->next_cp_seq = io->seq;
 
 		list_del(&io->log_sibling);
-		kmem_cache_free(log->io_kc, io);
+		mempool_free(io, log->io_pool);
+		r5l_run_no_mem_stripe(log);
 
 		found = true;
 	}
@@ -1170,6 +1208,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_kc)
 		goto io_kc;
 
+	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
+	if (!log->io_pool)
+		goto io_pool;
+
 	log->bs = bioset_create(R5L_POOL_SIZE, 0);
 	if (!log->bs)
 		goto io_bs;
@@ -1184,6 +1226,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 		goto reclaim_thread;
 	init_waitqueue_head(&log->iounit_wait);
 
+	INIT_LIST_HEAD(&log->no_mem_stripes);
+
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
@@ -1200,6 +1244,8 @@ reclaim_thread:
 out_mempool:
 	bioset_free(log->bs);
 io_bs:
+	mempool_destroy(log->io_pool);
+io_pool:
 	kmem_cache_destroy(log->io_kc);
 io_kc:
 	kfree(log);
@@ -1211,6 +1257,7 @@ void r5l_exit_log(struct r5l_log *log)
 	md_unregister_thread(&log->reclaim_thread);
 	mempool_destroy(log->meta_pool);
 	bioset_free(log->bs);
+	mempool_destroy(log->io_pool);
 	kmem_cache_destroy(log->io_kc);
 	kfree(log);
 }
-- 
cgit v1.2.3-58-ga151


From bb9ef71646606e51adfebdc94231fbbc862dbe28 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 28 Dec 2015 10:46:38 +0800
Subject: md: remove unnecesary md_new_event_inintr

md_new_event had removed sysfs_notify since 'commit 72a23c211e45
("Make sure all changes to md/sync_action are notified.")', so we
can use md_new_event and delete md_new_event_inintr.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c0c3e6dec248..43a140457e0c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -205,15 +205,6 @@ void md_new_event(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(md_new_event);
 
-/* Alternate version that can be called from interrupts
- * when calling sysfs_notify isn't needed.
- */
-static void md_new_event_inintr(struct mddev *mddev)
-{
-	atomic_inc(&md_event_count);
-	wake_up(&md_event_waiters);
-}
-
 /*
  * Enables to iterate over all existing md arrays
  * all_mddevs_lock protects this list.
@@ -7209,7 +7200,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
 	md_wakeup_thread(mddev->thread);
 	if (mddev->event_work.func)
 		queue_work(md_misc_wq, &mddev->event_work);
-	md_new_event_inintr(mddev);
+	md_new_event(mddev);
 }
 EXPORT_SYMBOL(md_error);
 
-- 
cgit v1.2.3-58-ga151


From 274d8cbde1bc3bdfb31c5d6a58113dff5cee4f87 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 4 Jan 2016 16:16:58 +1100
Subject: md: Remove 'ready' field from mddev.

This field is always set in tandem with ->pers, and when it is tested
->pers is also tested.  So ->ready is not needed.

It was needed once, but code rearrangement and locking changes have
removed that needed.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 5 +----
 drivers/md/md.h | 2 --
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 43a140457e0c..0d1d822eeda5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -250,8 +250,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_queue_split(q, &bio, q->bio_split);
 
-	if (mddev == NULL || mddev->pers == NULL
-	    || !mddev->ready) {
+	if (mddev == NULL || mddev->pers == NULL) {
 		bio_io_error(bio);
 		return BLK_QC_T_NONE;
 	}
@@ -5298,7 +5297,6 @@ int md_run(struct mddev *mddev)
 	smp_wmb();
 	spin_lock(&mddev->lock);
 	mddev->pers = pers;
-	mddev->ready = 1;
 	spin_unlock(&mddev->lock);
 	rdev_for_each(rdev, mddev)
 		if (rdev->raid_disk >= 0)
@@ -5498,7 +5496,6 @@ static void __md_stop(struct mddev *mddev)
 	/* Ensure ->event_work is done */
 	flush_workqueue(md_misc_wq);
 	spin_lock(&mddev->lock);
-	mddev->ready = 0;
 	mddev->pers = NULL;
 	spin_unlock(&mddev->lock);
 	pers->free(mddev, mddev->private);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e16a17c37418..fc6f7bbc9544 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -246,8 +246,6 @@ struct mddev {
 						       * are happening, so run/
 						       * takeover/stop are not safe
 						       */
-	int				ready; /* See when safe to pass
-						* IO requests down */
 	struct gendisk			*gendisk;
 
 	struct kobject			kobj;
-- 
cgit v1.2.3-58-ga151


From a62ab49eb502a07814f9942770893118c6281223 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 6 Jan 2016 14:37:13 -0800
Subject: md: set MD_HAS_JOURNAL in correct places

Set MD_HAS_JOURNAL when a array is loaded or journal is initialized.
This is to avoid the flags set too early in journal disk hotadd.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c          | 9 +++++----
 drivers/md/raid5-cache.c | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0d1d822eeda5..29a4bbf62be5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1597,6 +1597,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 			mddev->new_chunk_sectors = mddev->chunk_sectors;
 		}
 
+		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) {
+			set_bit(MD_HAS_JOURNAL, &mddev->flags);
+			if (mddev->recovery_cp == MaxSector)
+				set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
+		}
 	} else if (mddev->pers == NULL) {
 		/* Insist of good event counter while assembling, except for
 		 * spares (which don't need an event count) */
@@ -1643,8 +1648,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 			}
 			set_bit(Journal, &rdev->flags);
 			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
-			if (mddev->recovery_cp == MaxSector)
-				set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
 			rdev->raid_disk = 0;
 			break;
 		default:
@@ -1664,8 +1667,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 			set_bit(WriteMostly, &rdev->flags);
 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
 			set_bit(Replacement, &rdev->flags);
-		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
-			set_bit(MD_HAS_JOURNAL, &mddev->flags);
 	} else /* MULTIPATH are always insync */
 		set_bit(In_sync, &rdev->flags);
 
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 6d2b4789a928..7ac035a73281 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1235,6 +1235,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 		goto error;
 
 	rcu_assign_pointer(conf->log, log);
+	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	return 0;
 
 error:
-- 
cgit v1.2.3-58-ga151


From 87d4d91616e4db9b8293ba9d9e5a2f3f0d0c8aa6 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 6 Jan 2016 14:37:14 -0800
Subject: MD: add journal with array suspended

Hot add journal disk in recovery thread context brings a lot of trouble
as IO could be running. Unlike spare disk hot add, adding journal disk
with array suspended makes more sense and implmentation is much easier.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 29a4bbf62be5..8753dee3983b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2455,15 +2455,20 @@ static int add_bound_rdev(struct md_rdev *rdev)
 {
 	struct mddev *mddev = rdev->mddev;
 	int err = 0;
+	bool add_journal = test_bit(Journal, &rdev->flags);
 
-	if (!mddev->pers->hot_remove_disk) {
+	if (!mddev->pers->hot_remove_disk || add_journal) {
 		/* If there is hot_add_disk but no hot_remove_disk
 		 * then added disks for geometry changes,
 		 * and should be added immediately.
 		 */
 		super_types[mddev->major_version].
 			validate_super(mddev, rdev);
+		if (add_journal)
+			mddev_suspend(mddev);
 		err = mddev->pers->hot_add_disk(mddev, rdev);
+		if (add_journal)
+			mddev_resume(mddev);
 		if (err) {
 			unbind_rdev_from_array(rdev);
 			export_rdev(rdev);
-- 
cgit v1.2.3-58-ga151


From 16a43f6a65002ba9a5b063764b4ad5d288a1c15e Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 6 Jan 2016 14:37:15 -0800
Subject: raid5-cache: handle journal hotadd in quiesce

Handle journal hotadd in quiesce to avoid creating duplicated threads.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 7ac035a73281..9531f5f05b93 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -827,6 +827,13 @@ void r5l_quiesce(struct r5l_log *log, int state)
 		return;
 	if (state == 0) {
 		log->in_teardown = 0;
+		/*
+		 * This is a special case for hotadd. In suspend, the array has
+		 * no journal. In resume, journal is initialized as well as the
+		 * reclaim thread.
+		 */
+		if (log->reclaim_thread)
+			return;
 		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 					log->rdev->mddev, "reclaim");
 	} else if (state == 1) {
-- 
cgit v1.2.3-58-ga151


From 1501efadc524a0c99494b576923091589a52d2a4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 13 Jan 2016 16:00:07 -0800
Subject: md/raid: only permit hot-add of compatible integrity profiles

It is not safe for an integrity profile to be changed while i/o is
in-flight in the queue.  Prevent adding new disks or otherwise online
spares to an array if the device has an incompatible integrity profile.

The original change to the blk_integrity_unregister implementation in
md, commmit c7bfced9a671 "md: suspend i/o during runtime
blk_integrity_unregister" introduced an immediate hang regression.

This policy of disallowing changes the integrity profile once one has
been established is shared with DM.

Here is an abbreviated log from a test run that:
1/ Creates a degraded raid1 with an integrity-enabled device (pmem0s) [   59.076127]
2/ Tries to add an integrity-disabled device (pmem1m) [   90.489209]
3/ Retries with an integrity-enabled device (pmem1s) [  205.671277]

[   59.076127] md/raid1:md0: active with 1 out of 2 mirrors
[   59.078302] md: data integrity enabled on md0
[..]
[   90.489209] md0: incompatible integrity profile for pmem1m
[..]
[  205.671277] md: super_written gets error=-5
[  205.677386] md/raid1:md0: Disk failure on pmem1m, disabling device.
[  205.677386] md/raid1:md0: Operation continuing on 1 devices.
[  205.683037] RAID1 conf printout:
[  205.684699]  --- wd:1 rd:2
[  205.685972]  disk 0, wo:0, o:1, dev:pmem0s
[  205.687562]  disk 1, wo:1, o:1, dev:pmem1s
[  205.691717] md: recovery of RAID array md0

Fixes: c7bfced9a671 ("md: suspend i/o during runtime blk_integrity_unregister")
Cc: <stable@vger.kernel.org>
Cc: Mike Snitzer <snitzer@redhat.com>
Reported-by: NeilBrown <neilb@suse.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c        | 28 ++++++++++++++++------------
 drivers/md/md.h        |  2 +-
 drivers/md/multipath.c |  6 +++---
 drivers/md/raid1.c     |  6 +++---
 drivers/md/raid10.c    |  6 +++---
 5 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8753dee3983b..2cf0e1c00b9a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2010,28 +2010,32 @@ int md_integrity_register(struct mddev *mddev)
 }
 EXPORT_SYMBOL(md_integrity_register);
 
-/* Disable data integrity if non-capable/non-matching disk is being added */
-void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
+/*
+ * Attempt to add an rdev, but only if it is consistent with the current
+ * integrity profile
+ */
+int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
 {
 	struct blk_integrity *bi_rdev;
 	struct blk_integrity *bi_mddev;
+	char name[BDEVNAME_SIZE];
 
 	if (!mddev->gendisk)
-		return;
+		return 0;
 
 	bi_rdev = bdev_get_integrity(rdev->bdev);
 	bi_mddev = blk_get_integrity(mddev->gendisk);
 
 	if (!bi_mddev) /* nothing to do */
-		return;
-	if (rdev->raid_disk < 0) /* skip spares */
-		return;
-	if (bi_rdev && blk_integrity_compare(mddev->gendisk,
-					     rdev->bdev->bd_disk) >= 0)
-		return;
-	WARN_ON_ONCE(!mddev->suspended);
-	printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
-	blk_integrity_unregister(mddev->gendisk);
+		return 0;
+
+	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
+		printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n",
+				mdname(mddev), bdevname(rdev->bdev, name));
+		return -ENXIO;
+	}
+
+	return 0;
 }
 EXPORT_SYMBOL(md_integrity_add_rdev);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fc6f7bbc9544..a491e220e738 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -660,7 +660,7 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(struct mddev *mddev);
 extern int md_integrity_register(struct mddev *mddev);
-extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
+extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 
 extern void mddev_init(struct mddev *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 7331a80d89f1..0a72ab6e6c20 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -257,6 +257,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
 
+			err = md_integrity_add_rdev(rdev, mddev);
+			if (err)
+				break;
 			spin_lock_irq(&conf->device_lock);
 			mddev->degraded--;
 			rdev->raid_disk = path;
@@ -264,9 +267,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			spin_unlock_irq(&conf->device_lock);
 			rcu_assign_pointer(p->rdev, rdev);
 			err = 0;
-			mddev_suspend(mddev);
-			md_integrity_add_rdev(rdev, mddev);
-			mddev_resume(mddev);
 			break;
 		}
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e2169ff6e0f0..c4b913409226 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1589,6 +1589,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
 
+	if (md_integrity_add_rdev(rdev, mddev))
+		return -ENXIO;
+
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
@@ -1632,9 +1635,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			break;
 		}
 	}
-	mddev_suspend(mddev);
-	md_integrity_add_rdev(rdev, mddev);
-	mddev_resume(mddev);
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 	print_conf(conf);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84e597e1c489..ce959b4ae4df 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1698,6 +1698,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
 		return -EINVAL;
 
+	if (md_integrity_add_rdev(rdev, mddev))
+		return -ENXIO;
+
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
@@ -1739,9 +1742,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		rcu_assign_pointer(p->rdev, rdev);
 		break;
 	}
-	mddev_suspend(mddev);
-	md_integrity_add_rdev(rdev, mddev);
-	mddev_resume(mddev);
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 
-- 
cgit v1.2.3-58-ga151