From b46020aa3a8a0f9c7324fe0af4aec4227f947a10 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 21 Dec 2015 10:50:59 +1100 Subject: md/raid5: remove redundant check in stripe_add_to_batch_list() The stripe_add_to_batch_list() function is called only if stripe_can_batch() returned true, so there is no need for double check. Signed-off-by: Roman Gushchin Cc: Neil Brown Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 704ef7fcfbf8..22362505f810 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -772,8 +772,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh int hash; int dd_idx; - if (!stripe_can_batch(sh)) - return; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; if (!sector_div(tmp_sec, conf->chunk_sectors)) -- cgit v1.2.3-58-ga151 From ac277c6a8a39bc50f891a3477625330c276bd7f5 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 21 Dec 2015 10:50:59 +1100 Subject: md-cluster: Avoid the resync ping-pong If a RESYNCING message with (0,0) has been sent before, do not send it again. This avoids a resync ping pong between the nodes. We read the bitmap lockresource's LVB to figure out the previous value of the RESYNCING message. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'drivers') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index d6a1126d85ce..e57bbfed1638 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -882,8 +882,16 @@ static int resync_start(struct mddev *mddev) static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; + struct resync_info ri; struct cluster_msg cmsg = {0}; + /* do not send zero again, if we have sent before */ + if (hi == 0) { + memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); + if (le64_to_cpu(ri.hi) == 0) + return 0; + } + add_resync_info(cinfo->bitmap_lockres, lo, hi); /* Re-acquire the lock to refresh LVB */ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); -- cgit v1.2.3-58-ga151 From 659b254fa7392e32b59a30d4b61fb12c4cd440ff Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:50:59 +1100 Subject: md-cluster: remove a disk asynchronously from cluster environment For cluster raid, if one disk couldn't be reach in one node, then other nodes would receive the REMOVE message for the disk. In receiving node, we can't call md_kick_rdev_from_array to remove the disk from array synchronously since the disk might still be busy in this node. So let's set a ClusterRemove flag on the disk, then let the thread to do the removal job eventually. Signed-off-by: Guoqing Jiang Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 7 +++++-- drivers/md/md.c | 12 ++++++++++++ drivers/md/md.h | 1 + 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index e57bbfed1638..3fd7301fd7af 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -440,8 +440,11 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); - if (rdev) - md_kick_rdev_from_array(rdev); + if (rdev) { + set_bit(ClusterRemove, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, le32_to_cpu(msg->raid_slot)); diff --git a/drivers/md/md.c b/drivers/md/md.c index 61aacab424cf..198e29dffb98 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8318,6 +8318,18 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } + if (mddev_is_clustered(mddev)) { + struct md_rdev *rdev; + /* kick the device if another node issued a + * remove disk. + */ + rdev_for_each(rdev, mddev) { + if (test_and_clear_bit(ClusterRemove, &rdev->flags) && + rdev->raid_disk < 0) + md_kick_rdev_from_array(rdev); + } + } + if (!mddev->external) { int did_change = 0; spin_lock(&mddev->lock); diff --git a/drivers/md/md.h b/drivers/md/md.h index ca0b643fe3c1..f7b17aef837d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -183,6 +183,7 @@ enum flag_bits { * Usually, this device should be faster * than other devices in the array */ + ClusterRemove, }; #define BB_LEN_MASK (0x00000000000001FFULL) -- cgit v1.2.3-58-ga151 From 54a88392cdd84b4a739ce3a986bfabfaff67d9d2 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: md-cluster: Fix the remove sequence with the new MD reload code The remove disk message does not need metadata_update_start(), but can be an independent message. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 2 +- drivers/md/md.c | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 3fd7301fd7af..b58374daff32 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -997,7 +997,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct md_cluster_info *cinfo = mddev->cluster_info; cmsg.type = cpu_to_le32(REMOVE); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - return __sendmsg(cinfo, &cmsg); + return sendmsg(cinfo, &cmsg); } static int gather_bitmaps(struct md_rdev *rdev) diff --git a/drivers/md/md.c b/drivers/md/md.c index 198e29dffb98..ab3995de0418 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6134,15 +6134,11 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; - int ret = -1; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; - if (mddev_is_clustered(mddev)) - ret = md_cluster_ops->metadata_update_start(mddev); - if (rdev->raid_disk < 0) goto kick_rdev; @@ -6153,7 +6149,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev) && ret == 0) + if (mddev_is_clustered(mddev)) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); @@ -6162,9 +6158,6 @@ kick_rdev: return 0; busy: - if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); - printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; -- cgit v1.2.3-58-ga151 From 09afd2a8d6ad2c40f3c1ae0b3f83784864cf4c15 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: md-cluster: Allow spare devices to be marked as faulty If a spare device was marked faulty, it would not be reflected in receiving nodes because it would mark it as activated and continue. Continue the operation, so it may be set as faulty. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index ab3995de0418..f2f855c203e5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9106,7 +9106,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", bdevname(rdev2->bdev,b)); - continue; } /* device faulty * We just want to do the minimum to mark the disk -- cgit v1.2.3-58-ga151 From f6a2dc64ee74477c966f5220b1f560ed6308d010 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: md-cluster: append some actions when change bitmap from clustered to none For clustered raid, we need to do extra actions when change bitmap to none. 1. check if all the bitmap lock could be get or not, if yes then we can continue the change since cluster raid is only active in current node. Otherwise return fail and unlock the related bitmap locks 2. set nodes to 0 and then leave cluster environment. 3. release other nodes's bitmap lock. Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/md-cluster.h | 2 ++ drivers/md/md.c | 13 +++++++++++ 3 files changed, 72 insertions(+) (limited to 'drivers') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index b58374daff32..db9375f501ab 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -55,6 +55,7 @@ struct md_cluster_info { int slot_number; struct completion completion; struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource **other_bitmap_lockres; struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; spinlock_t suspend_lock; @@ -803,6 +804,7 @@ static void resync_bitmap(struct mddev *mddev) __func__, __LINE__, err); } +static void unlock_all_bitmaps(struct mddev *mddev); static int leave(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -823,6 +825,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); lockres_free(cinfo->bitmap_lockres); + unlock_all_bitmaps(mddev); dlm_release_lockspace(cinfo->lockspace, 2); return 0; } @@ -1000,6 +1003,58 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) return sendmsg(cinfo, &cmsg); } +static int lock_all_bitmaps(struct mddev *mddev) +{ + int slot, my_slot, ret, held = 1, i = 0; + char str[64]; + struct md_cluster_info *cinfo = mddev->cluster_info; + + cinfo->other_bitmap_lockres = kzalloc((mddev->bitmap_info.nodes - 1) * + sizeof(struct dlm_lock_resource *), + GFP_KERNEL); + if (!cinfo->other_bitmap_lockres) { + pr_err("md: can't alloc mem for other bitmap locks\n"); + return 0; + } + + my_slot = slot_number(mddev); + for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) { + if (slot == my_slot) + continue; + + memset(str, '\0', 64); + snprintf(str, 64, "bitmap%04d", slot); + cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1); + if (!cinfo->other_bitmap_lockres[i]) + return -ENOMEM; + + cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW); + if (ret) + held = -1; + i++; + } + + return held; +} + +static void unlock_all_bitmaps(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int i; + + /* release other node's bitmap lock if they are existed */ + if (cinfo->other_bitmap_lockres) { + for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) { + if (cinfo->other_bitmap_lockres[i]) { + dlm_unlock_sync(cinfo->other_bitmap_lockres[i]); + lockres_free(cinfo->other_bitmap_lockres[i]); + } + } + kfree(cinfo->other_bitmap_lockres); + } +} + static int gather_bitmaps(struct md_rdev *rdev) { int sn, err; @@ -1045,6 +1100,8 @@ static struct md_cluster_operations cluster_ops = { .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, .gather_bitmaps = gather_bitmaps, + .lock_all_bitmaps = lock_all_bitmaps, + .unlock_all_bitmaps = unlock_all_bitmaps, }; static int __init cluster_init(void) diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index e75ea2613184..45ce6c97d8bd 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -24,6 +24,8 @@ struct md_cluster_operations { int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); int (*gather_bitmaps)(struct md_rdev *rdev); + int (*lock_all_bitmaps)(struct mddev *mddev); + void (*unlock_all_bitmaps)(struct mddev *mddev); }; #endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md.c b/drivers/md/md.c index f2f855c203e5..495d8aa0a0d2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6599,6 +6599,19 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = -EINVAL; goto err; } + if (mddev->bitmap_info.nodes) { + /* hold PW on all the bitmap lock */ + if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + printk("md: can't change bitmap to none since the" + " array is in use by more than one node\n"); + rv = -EPERM; + md_cluster_ops->unlock_all_bitmaps(mddev); + goto err; + } + + mddev->bitmap_info.nodes = 0; + md_cluster_ops->leave(mddev); + } mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); -- cgit v1.2.3-58-ga151 From 15858fa5b00c1067a8a8e53ea32f4a65f8bebbb8 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: md-cluster: Defer MD reloading to mddev->thread Reloading of superblock must be performed under reconfig_mutex. However, this cannot be done with md_reload_sb because it would deadlock with the message DLM lock. So, we defer it in md_check_recovery() which is executed by mddev->thread. This introduces a new flag, MD_RELOAD_SB, which if set, will reload the superblock. And good_device_nr is also added to 'struct mddev' which is used to get the num of the good device within cluster raid. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 4 +++- drivers/md/md.c | 4 ++++ drivers/md/md.h | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index db9375f501ab..b659ef7b8daf 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -432,8 +432,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { struct md_cluster_info *cinfo = mddev->cluster_info; - md_reload_sb(mddev, le32_to_cpu(msg->raid_slot)); + mddev->good_device_nr = le32_to_cpu(msg->raid_slot); + set_bit(MD_RELOAD_SB, &mddev->flags); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + md_wakeup_thread(mddev->thread); } static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) diff --git a/drivers/md/md.c b/drivers/md/md.c index 495d8aa0a0d2..504ce5d068ce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8286,6 +8286,7 @@ void md_check_recovery(struct mddev *mddev) (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || + test_bit(MD_RELOAD_SB, &mddev->flags) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) @@ -8334,6 +8335,9 @@ void md_check_recovery(struct mddev *mddev) rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); } + + if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags)) + md_reload_sb(mddev, mddev->good_device_nr); } if (!mddev->external) { diff --git a/drivers/md/md.h b/drivers/md/md.h index f7b17aef837d..8817e623258a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -235,6 +235,9 @@ struct mddev { */ #define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ #define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ +#define MD_RELOAD_SB 7 /* Reload the superblock because another node + * updated it. + */ int suspended; atomic_t active_io; @@ -465,6 +468,7 @@ struct mddev { struct work_struct event_work; /* used by dm to report failure event */ void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; + unsigned int good_device_nr; /* good device num within cluster raid */ }; static inline int __must_check mddev_lock(struct mddev *mddev) -- cgit v1.2.3-58-ga151 From 8b9277c81450de9d8081ff6571ac5986e6c83f49 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:00 +1100 Subject: md-cluster: Protect communication with mutexes Communication can happen through multiple threads. It is possible that one thread steps over another threads sequence. So, we use mutexes to protect both the send and receive sequences. Send communication is locked through state bit, MD_CLUSTER_SEND_LOCK. Communication is locked with bit manipulation in order to allow "lock and hold" for the add operation. In case of an add operation, if the lock is held, MD_CLUSTER_SEND_LOCKED_ALREADY is set. When md_update_sb() calls metadata_update_start(), it checks (in a single statement to avoid races), if the communication is already locked. If yes, it merely returns zero, else it locks the token lockresource. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 73 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index b659ef7b8daf..ad3ec7df1547 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -48,12 +48,26 @@ struct resync_info { #define MD_CLUSTER_SUSPEND_READ_BALANCING 2 #define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3 +/* Lock the send communication. This is done through + * bit manipulation as opposed to a mutex in order to + * accomodate lock and hold. See next comment. + */ +#define MD_CLUSTER_SEND_LOCK 4 +/* If cluster operations must lock the communication channel, + * so as to perform extra operations (and no other operation + * is allowed on the MD, such as adding a disk. Token needs + * to be locked and held until the operation completes with + * a md_update_sb(), which would eventually release the lock. + */ +#define MD_CLUSTER_SEND_LOCKED_ALREADY 5 + struct md_cluster_info { /* dlm lock space and resources for clustered raid. */ dlm_lockspace_t *lockspace; int slot_number; struct completion completion; + struct mutex recv_mutex; struct dlm_lock_resource *bitmap_lockres; struct dlm_lock_resource **other_bitmap_lockres; struct dlm_lock_resource *resync_lockres; @@ -68,6 +82,7 @@ struct md_cluster_info { struct dlm_lock_resource *no_new_dev_lockres; struct md_thread *recv_thread; struct completion newdisk_completion; + wait_queue_head_t wait; unsigned long state; }; @@ -508,9 +523,11 @@ static void recv_daemon(struct md_thread *thread) struct cluster_msg msg; int ret; + mutex_lock(&cinfo->recv_mutex); /*get CR on Message*/ if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { pr_err("md/raid1:failed to get CR on MESSAGE\n"); + mutex_unlock(&cinfo->recv_mutex); return; } @@ -534,33 +551,45 @@ static void recv_daemon(struct md_thread *thread) ret = dlm_unlock_sync(message_lockres); if (unlikely(ret != 0)) pr_info("unlock msg failed return %d\n", ret); + mutex_unlock(&cinfo->recv_mutex); } -/* lock_comm() +/* lock_token() * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. - * If called again, and the TOKEN lock is alread in EX mode - * return success. However, care must be taken that unlock_comm() - * is called only once. */ -static int lock_comm(struct md_cluster_info *cinfo) +static int lock_token(struct md_cluster_info *cinfo) { int error; - if (cinfo->token_lockres->mode == DLM_LOCK_EX) - return 0; - error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); if (error) pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", __func__, __LINE__, error); + + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); return error; } +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. + */ +static int lock_comm(struct md_cluster_info *cinfo) +{ + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); + + return lock_token(cinfo); +} + static void unlock_comm(struct md_cluster_info *cinfo) { WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); + mutex_unlock(&cinfo->recv_mutex); dlm_unlock_sync(cinfo->token_lockres); + clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state); + wake_up(&cinfo->wait); } /* __sendmsg() @@ -713,6 +742,8 @@ static int join(struct mddev *mddev, int nodes) spin_lock_init(&cinfo->suspend_lock); init_completion(&cinfo->completion); set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); + init_waitqueue_head(&cinfo->wait); + mutex_init(&cinfo->recv_mutex); mddev->cluster_info = cinfo; @@ -843,9 +874,25 @@ static int slot_number(struct mddev *mddev) return cinfo->slot_number - 1; } +/* + * Check if the communication is already locked, else lock the communication + * channel. + * If it is already locked, token is in EX mode, and hence lock_token() + * should not be called. + */ static int metadata_update_start(struct mddev *mddev) { - return lock_comm(mddev->cluster_info); + struct md_cluster_info *cinfo = mddev->cluster_info; + + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) || + test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state)); + + /* If token is already locked, return 0 */ + if (cinfo->token_lockres->mode == DLM_LOCK_EX) + return 0; + + return lock_token(cinfo); } static int metadata_update_finish(struct mddev *mddev) @@ -870,6 +917,7 @@ static int metadata_update_finish(struct mddev *mddev) ret = __sendmsg(cinfo, &cmsg); } else pr_warn("md-cluster: No good device id found to send\n"); + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); return ret; } @@ -877,6 +925,7 @@ static int metadata_update_finish(struct mddev *mddev) static void metadata_update_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); } @@ -970,14 +1019,18 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) ret = -ENOENT; if (ret) unlock_comm(cinfo); - else + else { dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); + wake_up(&cinfo->wait); + } return ret; } static void add_new_disk_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); } -- cgit v1.2.3-58-ga151 From e19508fa4df896b115f5321c21ce7669559b0863 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: md-cluster: update comments for MD_CLUSTER_SEND_LOCKED_ALREADY 1. fix unbalanced parentheses. 2. add more description about that MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared after set it in add_new_disk. Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index ad3ec7df1547..0ded8e97751d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -53,11 +53,12 @@ struct resync_info { * accomodate lock and hold. See next comment. */ #define MD_CLUSTER_SEND_LOCK 4 -/* If cluster operations must lock the communication channel, - * so as to perform extra operations (and no other operation - * is allowed on the MD, such as adding a disk. Token needs - * to be locked and held until the operation completes with - * a md_update_sb(), which would eventually release the lock. +/* If cluster operations (such as adding a disk) must lock the + * communication channel, so as to perform extra operations + * (update metadata) and no other operation is allowed on the + * MD. Token needs to be locked and held until the operation + * completes witha md_update_sb(), which would eventually release + * the lock. */ #define MD_CLUSTER_SEND_LOCKED_ALREADY 5 @@ -1021,6 +1022,18 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) unlock_comm(cinfo); else { dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + /* Since MD_CHANGE_DEVS will be set in add_bound_rdev which + * will run soon after add_new_disk, the below path will be + * invoked: + * md_wakeup_thread(mddev->thread) + * -> conf->thread (raid1d) + * -> md_check_recovery -> md_update_sb + * -> metadata_update_start/finish + * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually. + * + * For other failure cases, metadata_update_cancel and + * add_new_disk_cancel also clear below bit as well. + * */ set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); wake_up(&cinfo->wait); } -- cgit v1.2.3-58-ga151 From abf3508d8faa281e01a780e022a6f43d1731fe0b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: md: update comment for md_allow_write MD_CHANGE_CLEAN had been replaced with MD_CHANGE_PENDING after commit 070dc6 ("md: resolve confusion of MD_CHANGE_CLEAN"), so make the change accordingly. Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 504ce5d068ce..f71a81b37d08 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7714,7 +7714,7 @@ EXPORT_SYMBOL(md_write_end); * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. * - * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock + * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock * is dropped, so return -EAGAIN after notifying userspace. */ int md_allow_write(struct mddev *mddev) -- cgit v1.2.3-58-ga151 From 3848c0bcb09c7b78e6f4ae9f8fc8d6d9aecbd35a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: raid5-cache: simplify r5l_move_io_unit_list It's only used for one kind of move, so make that explicit. Also clean up the code a bit by using list_for_each_safe. Signed-off-by: Christoph Hellwig Reviewed-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index b887e04d7e5c..3699c4704ba8 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -156,21 +156,6 @@ static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io) kmem_cache_free(log->io_kc, io); } -static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to, - enum r5l_io_unit_state state) -{ - struct r5l_io_unit *io; - - while (!list_empty(from)) { - io = list_first_entry(from, struct r5l_io_unit, log_sibling); - /* don't change list order */ - if (io->state >= state) - list_move_tail(&io->log_sibling, to); - else - break; - } -} - static void __r5l_set_io_unit_state(struct r5l_io_unit *io, enum r5l_io_unit_state state) { @@ -206,6 +191,20 @@ static void r5l_log_run_stripes(struct r5l_log *log) } } +static void r5l_move_to_end_ios(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + + assert_spin_locked(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_IO_END) + break; + list_move_tail(&io->log_sibling, &log->io_end_ios); + } +} + static void r5l_log_endio(struct bio *bio) { struct r5l_io_unit *io = bio->bi_private; @@ -220,8 +219,7 @@ static void r5l_log_endio(struct bio *bio) spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); if (log->need_cache_flush) - r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios, - IO_UNIT_IO_END); + r5l_move_to_end_ios(log); else r5l_log_run_stripes(log); spin_unlock_irqrestore(&log->io_list_lock, flags); -- cgit v1.2.3-58-ga151 From ad66d445ee5a5f548142b880e1642c711fbcacd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: raid5-cache: free meta_page earlier Once the I/O completed we don't need the meta page anymore. As the iounits can live on for a long time this reduces memory pressure a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 3699c4704ba8..668e973f07e6 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -150,12 +150,6 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size) return log->device_size > used_size + size; } -static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io) -{ - __free_page(io->meta_page); - kmem_cache_free(log->io_kc, io); -} - static void __r5l_set_io_unit_state(struct r5l_io_unit *io, enum r5l_io_unit_state state) { @@ -215,6 +209,7 @@ static void r5l_log_endio(struct bio *bio) md_error(log->rdev->mddev, log->rdev); bio_put(bio); + __free_page(io->meta_page); spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); @@ -552,7 +547,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) log->next_cp_seq = io->seq; list_del(&io->log_sibling); - r5l_free_io_unit(log, io); + kmem_cache_free(log->io_kc, io); found = true; } -- cgit v1.2.3-58-ga151 From 3312c951efaba55080958974047414576b9e5d63 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: md: avoid warning for 32-bit sector_t When CONFIG_LBDAF is not set, sector_t is only 32-bits wide, which means we cannot have devices with more than 2TB, and the code that is trying to handle compatibility support for large devices in md version 0.90 is meaningless but also causes a compile-time warning: drivers/md/md.c: In function 'super_90_load': drivers/md/md.c:1029:19: warning: large integer implicitly truncated to unsigned type [-Woverflow] drivers/md/md.c: In function 'super_90_rdev_size_change': drivers/md/md.c:1323:17: warning: large integer implicitly truncated to unsigned type [-Woverflow] This adds a check for CONFIG_LBDAF to avoid even getting into this code path, and also adds an explicit cast to let the compiler know it doesn't have to warn about the truncation. Signed-off-by: Arnd Bergmann Signed-off-by: NeilBrown --- drivers/md/md.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index f71a81b37d08..3d70d0d11b95 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1026,8 +1026,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ - if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) - rdev->sectors = (2ULL << 32) - 2; + if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && + sb->level >= 1) + rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ @@ -1320,8 +1321,9 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. */ - if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) - num_sectors = (2ULL << 32) - 2; + if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && + rdev->mddev->level >= 1) + num_sectors = (sector_t)(2ULL << 32) - 2; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); -- cgit v1.2.3-58-ga151 From 9ebc6ef188a0656f3620835f9be7fe22c1644c1c Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Mon, 21 Dec 2015 10:51:01 +1100 Subject: drivers: md: use ktime_get_real_seconds() get_seconds() API is not y2038 safe on 32 bit systems and the API is deprecated. Replace it with calls to ktime_get_real_seconds() API instead. Change mddev structure types to time64_t accordingly. 32 bit signed timestamps will overflow in the year 2038. Change the user interface mdu_array_info_s structure timestamps: ctime and utime values used in ioctls GET_ARRAY_INFO and SET_ARRAY_INFO to unsigned int. This will extend the field to last until the year 2106. The long term plan is to get rid of ctime and utime values in this structure as this information can be read from the on-disk meta data directly. Clamp the tim64_t timestamps to positive values with a max of U32_MAX when returning from GET_ARRAY_INFO ioctl to accommodate above changes in the data type of timestamps to unsigned int. v0.90 on disk meta data uses u32 for maintaining time stamps. So this will also last until year 2106. Assumption is that the usage of v0.90 will be deprecated by year 2106. Timestamp fields in the on disk meta data for v1.0 version already use 64 bit data types. Remove the truncation of the bits while writing to or reading from these from the disk. Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Signed-off-by: NeilBrown --- drivers/md/md.c | 18 +++++++++--------- drivers/md/md.h | 2 +- include/uapi/linux/raid/md_u.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 3d70d0d11b95..d0f0621bf9b0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1200,13 +1200,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) memcpy(&sb->set_uuid2, mddev->uuid+8, 4); memcpy(&sb->set_uuid3, mddev->uuid+12,4); - sb->ctime = mddev->ctime; + sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); sb->level = mddev->level; sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; - sb->utime = mddev->utime; + sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); sb->state = 0; sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; @@ -1547,8 +1547,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->patch_version = 0; mddev->external = 0; mddev->chunk_sectors = le32_to_cpu(sb->chunksize); - mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); - mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); + mddev->ctime = le64_to_cpu(sb->ctime); + mddev->utime = le64_to_cpu(sb->utime); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); @@ -2336,7 +2336,7 @@ repeat: spin_lock(&mddev->lock); - mddev->utime = get_seconds(); + mddev->utime = ktime_get_real_seconds(); if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; @@ -5843,7 +5843,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; info.patch_version = MD_PATCHLEVEL_VERSION; - info.ctime = mddev->ctime; + info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); info.level = mddev->level; info.size = mddev->dev_sectors / 2; if (info.size != mddev->dev_sectors / 2) /* overflow */ @@ -5853,7 +5853,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.md_minor = mddev->md_minor; info.not_persistent= !mddev->persistent; - info.utime = mddev->utime; + info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); info.state = 0; if (mddev->in_sync) info.state = (1<ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; mddev->minor_version = MD_MINOR_VERSION; mddev->patch_version = MD_PATCHLEVEL_VERSION; - mddev->ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); mddev->level = info->level; mddev->clevel[0] = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 8817e623258a..e16a17c37418 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -264,7 +264,7 @@ struct mddev { * managed externally */ char metadata_type[17]; /* externally set*/ int chunk_sectors; - time_t ctime, utime; + time64_t ctime, utime; int level, layout; char clevel[16]; int raid_disks; diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 1cb8aa6850b5..36cd8210a5d1 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h @@ -80,7 +80,7 @@ typedef struct mdu_array_info_s { int major_version; int minor_version; int patch_version; - int ctime; + unsigned int ctime; int level; int size; int nr_disks; @@ -91,7 +91,7 @@ typedef struct mdu_array_info_s { /* * Generic state information */ - int utime; /* 0 Superblock update time */ + unsigned int utime; /* 0 Superblock update time */ int state; /* 1 State bits (clean, ...) */ int active_disks; /* 2 Number of currently active disks */ int working_disks; /* 3 Number of working disks */ -- cgit v1.2.3-58-ga151 From f6b6ec5cfac306c1eea66f074050864efcb11851 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 21 Dec 2015 10:51:02 +1100 Subject: raid5-cache: add journal hot add/remove support Add support for journal disk hot add/remove. Mostly trival checks in md part. The raid5 part is a little tricky. For hot-remove, we can't wait pending write as it's called from raid5d. The wait will cause deadlock. We simplily fail the hot-remove. A hot-remove retry can success eventually since if journal disk is faulty all pending write will be failed and finish. For hot-add, since an array supporting journal but without journal disk will be marked read-only, we are safe to hot add journal without stopping IO (should be read IO, while journal only handles write IO). Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 42 ++++++++++++++++++++++++++++++------------ drivers/md/raid5-cache.c | 16 ++++++++++++---- drivers/md/raid5.c | 34 ++++++++++++++++++++++++++-------- 3 files changed, 68 insertions(+), 24 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index d0f0621bf9b0..c0c3e6dec248 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2055,8 +2055,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return -EEXIST; /* make sure rdev->sectors exceeds mddev->dev_sectors */ - if (rdev->sectors && (mddev->dev_sectors == 0 || - rdev->sectors < mddev->dev_sectors)) { + if (!test_bit(Journal, &rdev->flags) && + rdev->sectors && + (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -2087,7 +2088,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) } } rcu_read_unlock(); - if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + if (!test_bit(Journal, &rdev->flags) && + mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { printk(KERN_WARNING "md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); return -EBUSY; @@ -6044,8 +6046,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); - if (info->state & (1<state & (1<flags)) { + has_journal = true; + break; + } + } + if (has_journal) { + export_rdev(rdev); + return -EBUSY; + } set_bit(Journal, &rdev->flags); + } /* * check whether the device shows up in other nodes */ @@ -8181,19 +8198,20 @@ static int remove_and_add_spares(struct mddev *mddev, continue; if (test_bit(Faulty, &rdev->flags)) continue; - if (test_bit(Journal, &rdev->flags)) - continue; - if (mddev->ro && - ! (rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) - continue; + if (!test_bit(Journal, &rdev->flags)) { + if (mddev->ro && + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; - rdev->recovery_offset = 0; + rdev->recovery_offset = 0; + } if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; - spares++; + if (!test_bit(Journal, &rdev->flags)) + spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 668e973f07e6..c1c4d213a2c2 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -799,10 +799,18 @@ void r5l_quiesce(struct r5l_log *log, int state) bool r5l_log_disk_error(struct r5conf *conf) { + struct r5l_log *log; + bool ret; /* don't allow write if journal disk is missing */ - if (!conf->log) - return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); - return test_bit(Faulty, &conf->log->rdev->flags); + rcu_read_lock(); + log = rcu_dereference(conf->log); + + if (!log) + ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + else + ret = test_bit(Faulty, &log->rdev->flags); + rcu_read_unlock(); + return ret; } struct r5l_recovery_ctx { @@ -1165,7 +1173,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (r5l_load_log(log)) goto error; - conf->log = log; + rcu_assign_pointer(conf->log, log); return 0; error: md_unregister_thread(&log->reclaim_thread); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 22362505f810..a086014dcd49 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7139,14 +7139,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct disk_info *p = conf->disks + number; print_raid5_conf(conf); - if (test_bit(Journal, &rdev->flags)) { + if (test_bit(Journal, &rdev->flags) && conf->log) { + struct r5l_log *log; /* - * journal disk is not removable, but we need give a chance to - * update superblock of other disks. Otherwise journal disk - * will be considered as 'fresh' + * we can't wait pending write here, as this is called in + * raid5d, wait will deadlock. */ - set_bit(MD_CHANGE_DEVS, &mddev->flags); - return -EINVAL; + if (atomic_read(&mddev->writes_pending)) + return -EBUSY; + log = conf->log; + conf->log = NULL; + synchronize_rcu(); + r5l_exit_log(log); + return 0; } if (rdev == p->rdev) rdevp = &p->rdev; @@ -7210,8 +7215,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; - if (test_bit(Journal, &rdev->flags)) - return -EINVAL; + if (test_bit(Journal, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + if (conf->log) + return -EBUSY; + + rdev->raid_disk = 0; + /* + * The array is in readonly mode if journal is missing, so no + * write requests running. We should be safe + */ + r5l_init_log(conf, rdev); + printk(KERN_INFO"md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(rdev->bdev, b)); + return 0; + } if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; -- cgit v1.2.3-58-ga151 From c38d29b33bb3b3c792f3cca8a973422bb1897ebf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:02 +1100 Subject: raid5-cache: use a bio_set This allows us to make guaranteed forward progress. Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index c1c4d213a2c2..2a644977d90c 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -34,6 +34,12 @@ #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) +/* + * We only need 2 bios per I/O unit to make progress, but ensure we + * have a few more available to not get too tight. + */ +#define R5L_POOL_SIZE 4 + struct r5l_log { struct md_rdev *rdev; @@ -70,6 +76,7 @@ struct r5l_log { struct bio flush_bio; struct kmem_cache *io_kc; + struct bio_set *bs; struct md_thread *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be @@ -248,7 +255,7 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); bio->bi_rw = WRITE; bio->bi_bdev = log->rdev->bdev; @@ -1161,6 +1168,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->io_kc) goto io_kc; + log->bs = bioset_create(R5L_POOL_SIZE, 0); + if (!log->bs) + goto io_bs; + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) @@ -1178,6 +1189,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) error: md_unregister_thread(&log->reclaim_thread); reclaim_thread: + bioset_free(log->bs); +io_bs: kmem_cache_destroy(log->io_kc); io_kc: kfree(log); @@ -1187,6 +1200,7 @@ io_kc: void r5l_exit_log(struct r5l_log *log) { md_unregister_thread(&log->reclaim_thread); + bioset_free(log->bs); kmem_cache_destroy(log->io_kc); kfree(log); } -- cgit v1.2.3-58-ga151 From e8deb6381051bf3ce9d817020e8ba972b405a070 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:02 +1100 Subject: raid5-cache: use a mempool for the metadata block We only have a limited number in flight, so use a page based mempool. Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 2a644977d90c..fa2d6321f3a4 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -77,6 +77,7 @@ struct r5l_log { struct kmem_cache *io_kc; struct bio_set *bs; + mempool_t *meta_pool; struct md_thread *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be @@ -216,7 +217,7 @@ static void r5l_log_endio(struct bio *bio) md_error(log->rdev->mddev, log->rdev); bio_put(bio); - __free_page(io->meta_page); + mempool_free(io->meta_page, log->meta_pool); spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); @@ -293,8 +294,9 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) INIT_LIST_HEAD(&io->stripe_list); io->state = IO_UNIT_RUNNING; - io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO); + io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); block = page_address(io->meta_page); + clear_page(block); block->magic = cpu_to_le32(R5LOG_MAGIC); block->version = R5LOG_VERSION; block->seq = cpu_to_le64(log->seq); @@ -1172,6 +1174,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->bs) goto io_bs; + log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); + if (!log->meta_pool) + goto out_mempool; + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) @@ -1186,9 +1192,12 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) rcu_assign_pointer(conf->log, log); return 0; + error: md_unregister_thread(&log->reclaim_thread); reclaim_thread: + mempool_destroy(log->meta_pool); +out_mempool: bioset_free(log->bs); io_bs: kmem_cache_destroy(log->io_kc); @@ -1200,6 +1209,7 @@ io_kc: void r5l_exit_log(struct r5l_log *log) { md_unregister_thread(&log->reclaim_thread); + mempool_destroy(log->meta_pool); bioset_free(log->bs); kmem_cache_destroy(log->io_kc); kfree(log); -- cgit v1.2.3-58-ga151 From 5036c3902054358ee293b8cecfea13342d8019e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Dec 2015 10:51:02 +1100 Subject: raid5: allow r5l_io_unit allocations to fail And propagate the error up the stack so we can add the stripe to no_stripes_list and retry our log operation later. This avoids blocking raid5d due to reclaim, an it allows to get rid of the deadlock-prone GFP_NOFAIL allocation. shli: add missing mempool_destroy() Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 67 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index fa2d6321f3a4..6d2b4789a928 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -75,7 +75,10 @@ struct r5l_log { struct list_head finished_ios; /* io_units which settle down in log disk */ struct bio flush_bio; + struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ + struct kmem_cache *io_kc; + mempool_t *io_pool; struct bio_set *bs; mempool_t *meta_pool; @@ -287,8 +290,11 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) struct r5l_io_unit *io; struct r5l_meta_block *block; - /* We can't handle memory allocate failure so far */ - io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL); + io = mempool_alloc(log->io_pool, GFP_ATOMIC); + if (!io) + return NULL; + memset(io, 0, sizeof(*io)); + io->log = log; INIT_LIST_HEAD(&io->log_sibling); INIT_LIST_HEAD(&io->stripe_list); @@ -326,8 +332,12 @@ static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) log->current_io->meta_offset + payload_size > PAGE_SIZE) r5l_submit_current_io(log); - if (!log->current_io) + if (!log->current_io) { log->current_io = r5l_new_meta(log); + if (!log->current_io) + return -ENOMEM; + } + return 0; } @@ -372,11 +382,12 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page) r5_reserve_log_entry(log, io); } -static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, +static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, int data_pages, int parity_pages) { int i; int meta_size; + int ret; struct r5l_io_unit *io; meta_size = @@ -385,7 +396,10 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, sizeof(struct r5l_payload_data_parity) + sizeof(__le32) * parity_pages; - r5l_get_meta(log, meta_size); + ret = r5l_get_meta(log, meta_size); + if (ret) + return ret; + io = log->current_io; for (i = 0; i < sh->disks; i++) { @@ -415,6 +429,8 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, list_add_tail(&sh->log_list, &io->stripe_list); atomic_inc(&io->pending_stripe); sh->log_io = io; + + return 0; } static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); @@ -429,6 +445,7 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) int meta_size; int reserve; int i; + int ret = 0; if (!log) return -EAGAIN; @@ -477,17 +494,22 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) mutex_lock(&log->io_mutex); /* meta + data */ reserve = (1 + write_disks) << (PAGE_SHIFT - 9); - if (r5l_has_free_space(log, reserve)) - r5l_log_stripe(log, sh, data_pages, parity_pages); - else { + if (!r5l_has_free_space(log, reserve)) { spin_lock(&log->no_space_stripes_lock); list_add_tail(&sh->log_list, &log->no_space_stripes); spin_unlock(&log->no_space_stripes_lock); r5l_wake_reclaim(log, reserve); + } else { + ret = r5l_log_stripe(log, sh, data_pages, parity_pages); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } } - mutex_unlock(&log->io_mutex); + mutex_unlock(&log->io_mutex); return 0; } @@ -540,6 +562,21 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log) log->next_checkpoint); } +static void r5l_run_no_mem_stripe(struct r5l_log *log) +{ + struct stripe_head *sh; + + assert_spin_locked(&log->io_list_lock); + + if (!list_empty(&log->no_mem_stripes)) { + sh = list_first_entry(&log->no_mem_stripes, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + static bool r5l_complete_finished_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; @@ -556,7 +593,8 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) log->next_cp_seq = io->seq; list_del(&io->log_sibling); - kmem_cache_free(log->io_kc, io); + mempool_free(io, log->io_pool); + r5l_run_no_mem_stripe(log); found = true; } @@ -1170,6 +1208,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->io_kc) goto io_kc; + log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); + if (!log->io_pool) + goto io_pool; + log->bs = bioset_create(R5L_POOL_SIZE, 0); if (!log->bs) goto io_bs; @@ -1184,6 +1226,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) goto reclaim_thread; init_waitqueue_head(&log->iounit_wait); + INIT_LIST_HEAD(&log->no_mem_stripes); + INIT_LIST_HEAD(&log->no_space_stripes); spin_lock_init(&log->no_space_stripes_lock); @@ -1200,6 +1244,8 @@ reclaim_thread: out_mempool: bioset_free(log->bs); io_bs: + mempool_destroy(log->io_pool); +io_pool: kmem_cache_destroy(log->io_kc); io_kc: kfree(log); @@ -1211,6 +1257,7 @@ void r5l_exit_log(struct r5l_log *log) md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); bioset_free(log->bs); + mempool_destroy(log->io_pool); kmem_cache_destroy(log->io_kc); kfree(log); } -- cgit v1.2.3-58-ga151 From bb9ef71646606e51adfebdc94231fbbc862dbe28 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 28 Dec 2015 10:46:38 +0800 Subject: md: remove unnecesary md_new_event_inintr md_new_event had removed sysfs_notify since 'commit 72a23c211e45 ("Make sure all changes to md/sync_action are notified.")', so we can use md_new_event and delete md_new_event_inintr. Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index c0c3e6dec248..43a140457e0c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -205,15 +205,6 @@ void md_new_event(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_new_event); -/* Alternate version that can be called from interrupts - * when calling sysfs_notify isn't needed. - */ -static void md_new_event_inintr(struct mddev *mddev) -{ - atomic_inc(&md_event_count); - wake_up(&md_event_waiters); -} - /* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. @@ -7209,7 +7200,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) md_wakeup_thread(mddev->thread); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); - md_new_event_inintr(mddev); + md_new_event(mddev); } EXPORT_SYMBOL(md_error); -- cgit v1.2.3-58-ga151 From 274d8cbde1bc3bdfb31c5d6a58113dff5cee4f87 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 4 Jan 2016 16:16:58 +1100 Subject: md: Remove 'ready' field from mddev. This field is always set in tandem with ->pers, and when it is tested ->pers is also tested. So ->ready is not needed. It was needed once, but code rearrangement and locking changes have removed that needed. Signed-off-by: NeilBrown --- drivers/md/md.c | 5 +---- drivers/md/md.h | 2 -- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 43a140457e0c..0d1d822eeda5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -250,8 +250,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (mddev == NULL || mddev->pers == NULL - || !mddev->ready) { + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return BLK_QC_T_NONE; } @@ -5298,7 +5297,6 @@ int md_run(struct mddev *mddev) smp_wmb(); spin_lock(&mddev->lock); mddev->pers = pers; - mddev->ready = 1; spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) @@ -5498,7 +5496,6 @@ static void __md_stop(struct mddev *mddev) /* Ensure ->event_work is done */ flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); - mddev->ready = 0; mddev->pers = NULL; spin_unlock(&mddev->lock); pers->free(mddev, mddev->private); diff --git a/drivers/md/md.h b/drivers/md/md.h index e16a17c37418..fc6f7bbc9544 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -246,8 +246,6 @@ struct mddev { * are happening, so run/ * takeover/stop are not safe */ - int ready; /* See when safe to pass - * IO requests down */ struct gendisk *gendisk; struct kobject kobj; -- cgit v1.2.3-58-ga151 From a62ab49eb502a07814f9942770893118c6281223 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 6 Jan 2016 14:37:13 -0800 Subject: md: set MD_HAS_JOURNAL in correct places Set MD_HAS_JOURNAL when a array is loaded or journal is initialized. This is to avoid the flags set too early in journal disk hotadd. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 9 +++++---- drivers/md/raid5-cache.c | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 0d1d822eeda5..29a4bbf62be5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1597,6 +1597,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) { + set_bit(MD_HAS_JOURNAL, &mddev->flags); + if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count) */ @@ -1643,8 +1648,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } set_bit(Journal, &rdev->flags); rdev->journal_tail = le64_to_cpu(sb->journal_tail); - if (mddev->recovery_cp == MaxSector) - set_bit(MD_JOURNAL_CLEAN, &mddev->flags); rdev->raid_disk = 0; break; default: @@ -1664,8 +1667,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(WriteMostly, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) - set_bit(MD_HAS_JOURNAL, &mddev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 6d2b4789a928..7ac035a73281 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1235,6 +1235,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) goto error; rcu_assign_pointer(conf->log, log); + set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; error: -- cgit v1.2.3-58-ga151 From 87d4d91616e4db9b8293ba9d9e5a2f3f0d0c8aa6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 6 Jan 2016 14:37:14 -0800 Subject: MD: add journal with array suspended Hot add journal disk in recovery thread context brings a lot of trouble as IO could be running. Unlike spare disk hot add, adding journal disk with array suspended makes more sense and implmentation is much easier. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 29a4bbf62be5..8753dee3983b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2455,15 +2455,20 @@ static int add_bound_rdev(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; int err = 0; + bool add_journal = test_bit(Journal, &rdev->flags); - if (!mddev->pers->hot_remove_disk) { + if (!mddev->pers->hot_remove_disk || add_journal) { /* If there is hot_add_disk but no hot_remove_disk * then added disks for geometry changes, * and should be added immediately. */ super_types[mddev->major_version]. validate_super(mddev, rdev); + if (add_journal) + mddev_suspend(mddev); err = mddev->pers->hot_add_disk(mddev, rdev); + if (add_journal) + mddev_resume(mddev); if (err) { unbind_rdev_from_array(rdev); export_rdev(rdev); -- cgit v1.2.3-58-ga151 From 16a43f6a65002ba9a5b063764b4ad5d288a1c15e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 6 Jan 2016 14:37:15 -0800 Subject: raid5-cache: handle journal hotadd in quiesce Handle journal hotadd in quiesce to avoid creating duplicated threads. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5-cache.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 7ac035a73281..9531f5f05b93 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -827,6 +827,13 @@ void r5l_quiesce(struct r5l_log *log, int state) return; if (state == 0) { log->in_teardown = 0; + /* + * This is a special case for hotadd. In suspend, the array has + * no journal. In resume, journal is initialized as well as the + * reclaim thread. + */ + if (log->reclaim_thread) + return; log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); } else if (state == 1) { -- cgit v1.2.3-58-ga151 From 1501efadc524a0c99494b576923091589a52d2a4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 13 Jan 2016 16:00:07 -0800 Subject: md/raid: only permit hot-add of compatible integrity profiles It is not safe for an integrity profile to be changed while i/o is in-flight in the queue. Prevent adding new disks or otherwise online spares to an array if the device has an incompatible integrity profile. The original change to the blk_integrity_unregister implementation in md, commmit c7bfced9a671 "md: suspend i/o during runtime blk_integrity_unregister" introduced an immediate hang regression. This policy of disallowing changes the integrity profile once one has been established is shared with DM. Here is an abbreviated log from a test run that: 1/ Creates a degraded raid1 with an integrity-enabled device (pmem0s) [ 59.076127] 2/ Tries to add an integrity-disabled device (pmem1m) [ 90.489209] 3/ Retries with an integrity-enabled device (pmem1s) [ 205.671277] [ 59.076127] md/raid1:md0: active with 1 out of 2 mirrors [ 59.078302] md: data integrity enabled on md0 [..] [ 90.489209] md0: incompatible integrity profile for pmem1m [..] [ 205.671277] md: super_written gets error=-5 [ 205.677386] md/raid1:md0: Disk failure on pmem1m, disabling device. [ 205.677386] md/raid1:md0: Operation continuing on 1 devices. [ 205.683037] RAID1 conf printout: [ 205.684699] --- wd:1 rd:2 [ 205.685972] disk 0, wo:0, o:1, dev:pmem0s [ 205.687562] disk 1, wo:1, o:1, dev:pmem1s [ 205.691717] md: recovery of RAID array md0 Fixes: c7bfced9a671 ("md: suspend i/o during runtime blk_integrity_unregister") Cc: Cc: Mike Snitzer Reported-by: NeilBrown Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/md.c | 28 ++++++++++++++++------------ drivers/md/md.h | 2 +- drivers/md/multipath.c | 6 +++--- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 6 +++--- 5 files changed, 26 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8753dee3983b..2cf0e1c00b9a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2010,28 +2010,32 @@ int md_integrity_register(struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_register); -/* Disable data integrity if non-capable/non-matching disk is being added */ -void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +/* + * Attempt to add an rdev, but only if it is consistent with the current + * integrity profile + */ +int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_rdev; struct blk_integrity *bi_mddev; + char name[BDEVNAME_SIZE]; if (!mddev->gendisk) - return; + return 0; bi_rdev = bdev_get_integrity(rdev->bdev); bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ - return; - if (rdev->raid_disk < 0) /* skip spares */ - return; - if (bi_rdev && blk_integrity_compare(mddev->gendisk, - rdev->bdev->bd_disk) >= 0) - return; - WARN_ON_ONCE(!mddev->suspended); - printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); - blk_integrity_unregister(mddev->gendisk); + return 0; + + if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { + printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", + mdname(mddev), bdevname(rdev->bdev, name)); + return -ENXIO; + } + + return 0; } EXPORT_SYMBOL(md_integrity_add_rdev); diff --git a/drivers/md/md.h b/drivers/md/md.h index fc6f7bbc9544..a491e220e738 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -660,7 +660,7 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); -extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7331a80d89f1..0a72ab6e6c20 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -257,6 +257,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + err = md_integrity_add_rdev(rdev, mddev); + if (err) + break; spin_lock_irq(&conf->device_lock); mddev->degraded--; rdev->raid_disk = path; @@ -264,9 +267,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) spin_unlock_irq(&conf->device_lock); rcu_assign_pointer(p->rdev, rdev); err = 0; - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); break; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2169ff6e0f0..c4b913409226 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1589,6 +1589,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1632,9 +1635,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 84e597e1c489..ce959b4ae4df 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1698,6 +1698,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1739,9 +1742,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); -- cgit v1.2.3-58-ga151