summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2009-09-08 17:55:54 -0700
committerDan Williams <dan.j.williams@intel.com>2009-09-08 17:55:54 -0700
commit9134d02bc0af4a8747d448d1f811ec5f8eb96df6 (patch)
tree704c3e5dcc10f360815c4868a74711f82fb62e27 /drivers/md
parentbbb20089a3275a19e475dbc21320c3742e3ca423 (diff)
parent80ffb3cceaefa405f2ecd46d66500ed8d53efe74 (diff)
Merge commit 'md/for-linus' into async-tx-next
Conflicts: drivers/md/raid5.c
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-crypt.c4
-rw-r--r--drivers/md/dm-delay.c4
-rw-r--r--drivers/md/dm-exception-store.c9
-rw-r--r--drivers/md/dm-linear.c2
-rw-r--r--drivers/md/dm-mpath.c2
-rw-r--r--drivers/md/dm-raid1.c3
-rw-r--r--drivers/md/dm-stripe.c7
-rw-r--r--drivers/md/dm-table.c17
-rw-r--r--drivers/md/dm.c14
-rw-r--r--drivers/md/dm.h1
-rw-r--r--drivers/md/linear.c6
-rw-r--r--drivers/md/md.c251
-rw-r--r--drivers/md/md.h12
-rw-r--r--drivers/md/multipath.c12
-rw-r--r--drivers/md/raid0.c10
-rw-r--r--drivers/md/raid1.c16
-rw-r--r--drivers/md/raid10.c23
-rw-r--r--drivers/md/raid5.c87
18 files changed, 297 insertions, 183 deletions
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9933eb861c71..ed1038164019 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -776,7 +776,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
* But don't wait if split was due to the io size restriction
*/
if (unlikely(out_of_pages))
- congestion_wait(WRITE, HZ/100);
+ congestion_wait(BLK_RW_ASYNC, HZ/100);
/*
* With async crypto it is unsafe to share the crypto context
@@ -1318,7 +1318,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
{
struct crypt_config *cc = ti->private;
- return fn(ti, cc->dev, cc->start, data);
+ return fn(ti, cc->dev, cc->start, ti->len, data);
}
static struct target_type crypt_target = {
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 4e5b843cd4d7..ebe7381f47c8 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -324,12 +324,12 @@ static int delay_iterate_devices(struct dm_target *ti,
struct delay_c *dc = ti->private;
int ret = 0;
- ret = fn(ti, dc->dev_read, dc->start_read, data);
+ ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data);
if (ret)
goto out;
if (dc->dev_write)
- ret = fn(ti, dc->dev_write, dc->start_write, data);
+ ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data);
out:
return ret;
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index c3ae51584b12..3710ff88fc10 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -195,7 +195,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
struct dm_exception_store **store)
{
int r = 0;
- struct dm_exception_store_type *type;
+ struct dm_exception_store_type *type = NULL;
struct dm_exception_store *tmp_store;
char persistent;
@@ -211,12 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
}
persistent = toupper(*argv[1]);
- if (persistent != 'P' && persistent != 'N') {
+ if (persistent == 'P')
+ type = get_type("P");
+ else if (persistent == 'N')
+ type = get_type("N");
+ else {
ti->error = "Persistent flag is not P or N";
return -EINVAL;
}
- type = get_type(&persistent);
if (!type) {
ti->error = "Exception store type not recognised";
r = -EINVAL;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9184b6deb868..82f7d6e6b1ea 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -139,7 +139,7 @@ static int linear_iterate_devices(struct dm_target *ti,
{
struct linear_c *lc = ti->private;
- return fn(ti, lc->dev, lc->start, data);
+ return fn(ti, lc->dev, lc->start, ti->len, data);
}
static struct target_type linear_target = {
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c70604a20897..6f0d90d4a541 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1453,7 +1453,7 @@ static int multipath_iterate_devices(struct dm_target *ti,
list_for_each_entry(pg, &m->priority_groups, list) {
list_for_each_entry(p, &pg->pgpaths, list) {
- ret = fn(ti, p->path.dev, ti->begin, data);
+ ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
if (ret)
goto out;
}
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ce8868c768cc..9726577cde49 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -638,6 +638,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
spin_lock_irq(&ms->lock);
bio_list_merge(&ms->writes, &requeue);
spin_unlock_irq(&ms->lock);
+ delayed_wake(ms);
}
/*
@@ -1292,7 +1293,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
for (i = 0; !ret && i < ms->nr_mirrors; i++)
ret = fn(ti, ms->mirror[i].dev,
- ms->mirror[i].offset, data);
+ ms->mirror[i].offset, ti->len, data);
return ret;
}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b240e85ae39a..4e0e5937e42a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -320,10 +320,11 @@ static int stripe_iterate_devices(struct dm_target *ti,
int ret = 0;
unsigned i = 0;
- do
+ do {
ret = fn(ti, sc->stripe[i].dev,
- sc->stripe[i].physical_start, data);
- while (!ret && ++i < sc->stripes);
+ sc->stripe[i].physical_start,
+ sc->stripe_width, data);
+ } while (!ret && ++i < sc->stripes);
return ret;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4899ebe767c8..d952b3441913 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -346,7 +346,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
* If possible, this checks an area of a destination device is valid.
*/
static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, void *data)
+ sector_t start, sector_t len, void *data)
{
struct queue_limits *limits = data;
struct block_device *bdev = dev->bdev;
@@ -359,7 +359,7 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
if (!dev_size)
return 1;
- if ((start >= dev_size) || (start + ti->len > dev_size)) {
+ if ((start >= dev_size) || (start + len > dev_size)) {
DMWARN("%s: %s too small for target",
dm_device_name(ti->table->md), bdevname(bdev, b));
return 0;
@@ -377,11 +377,11 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
return 0;
}
- if (ti->len & (logical_block_size_sectors - 1)) {
+ if (len & (logical_block_size_sectors - 1)) {
DMWARN("%s: len=%llu not aligned to h/w "
"logical block size %hu of %s",
dm_device_name(ti->table->md),
- (unsigned long long)ti->len,
+ (unsigned long long)len,
limits->logical_block_size, bdevname(bdev, b));
return 0;
}
@@ -482,7 +482,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, void *data)
+ sector_t start, sector_t len, void *data)
{
struct queue_limits *limits = data;
struct block_device *bdev = dev->bdev;
@@ -495,7 +495,7 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
return 0;
}
- if (blk_stack_limits(limits, &q->limits, start) < 0)
+ if (blk_stack_limits(limits, &q->limits, start << 9) < 0)
DMWARN("%s: target device %s is misaligned",
dm_device_name(ti->table->md), bdevname(bdev, b));
@@ -830,11 +830,6 @@ unsigned dm_table_get_type(struct dm_table *t)
return t->type;
}
-bool dm_table_bio_based(struct dm_table *t)
-{
- return dm_table_get_type(t) == DM_TYPE_BIO_BASED;
-}
-
bool dm_table_request_based(struct dm_table *t)
{
return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3c6d4ee8921d..8a311ea0d441 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1017,7 +1017,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
clone->bi_flags |= 1 << BIO_CLONED;
if (bio_integrity(bio)) {
- bio_integrity_clone(clone, bio, GFP_NOIO);
+ bio_integrity_clone(clone, bio, GFP_NOIO, bs);
bio_integrity_trim(clone,
bio_sector_offset(bio, idx, offset), len);
}
@@ -1045,7 +1045,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
if (bio_integrity(bio)) {
- bio_integrity_clone(clone, bio, GFP_NOIO);
+ bio_integrity_clone(clone, bio, GFP_NOIO, bs);
if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
bio_integrity_trim(clone,
@@ -2203,16 +2203,6 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
goto out;
}
- /*
- * It is enought that blk_queue_ordered() is called only once when
- * the first bio-based table is bound.
- *
- * This setting should be moved to alloc_dev() when request-based dm
- * supports barrier.
- */
- if (!md->map && dm_table_bio_based(table))
- blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
-
__unbind(md);
r = __bind(md, table, &limits);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 23278ae80f08..a7663eba17e2 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -61,7 +61,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
int dm_table_any_busy_target(struct dm_table *t);
int dm_table_set_type(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
-bool dm_table_bio_based(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
int dm_table_alloc_md_mempools(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 15c8b7b25a9b..5fe39c2a3d2b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -166,8 +166,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
rdev->sectors = sectors * mddev->chunk_sectors;
}
- blk_queue_stack_limits(mddev->queue,
- rdev->bdev->bd_disk->queue);
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_sector to one PAGE, as
* a one page request is never in violation.
@@ -220,6 +220,7 @@ static int linear_run (mddev_t *mddev)
mddev->queue->unplug_fn = linear_unplug;
mddev->queue->backing_dev_info.congested_fn = linear_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
+ md_integrity_register(mddev);
return 0;
}
@@ -256,6 +257,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
call_rcu(&oldconf->rcu, free_conf);
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 09be637d52cb..9dd872000cec 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -359,6 +359,7 @@ static mddev_t * mddev_find(dev_t unit)
else
new->md_minor = MINOR(unit) >> MdpMinorShift;
+ mutex_init(&new->open_mutex);
mutex_init(&new->reconfig_mutex);
INIT_LIST_HEAD(&new->disks);
INIT_LIST_HEAD(&new->all_mddevs);
@@ -1308,7 +1309,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
}
if (mddev->level != LEVEL_MULTIPATH) {
int role;
- role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ if (rdev->desc_nr < 0 ||
+ rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
+ role = 0xffff;
+ rdev->desc_nr = -1;
+ } else
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
case 0xffff: /* spare */
break;
@@ -1394,8 +1400,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
if (rdev2->desc_nr+1 > max_dev)
max_dev = rdev2->desc_nr+1;
- if (max_dev > le32_to_cpu(sb->max_dev))
+ if (max_dev > le32_to_cpu(sb->max_dev)) {
+ int bmask;
sb->max_dev = cpu_to_le32(max_dev);
+ rdev->sb_size = max_dev * 2 + 256;
+ bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
+ if (rdev->sb_size & bmask)
+ rdev->sb_size = (rdev->sb_size | bmask) + 1;
+ }
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1487,37 +1499,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
static LIST_HEAD(pending_raid_disks);
-static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+/*
+ * Try to register data integrity profile for an mddev
+ *
+ * This is called when an array is started and after a disk has been kicked
+ * from the array. It only succeeds if all working and active component devices
+ * are integrity capable with matching profiles.
+ */
+int md_integrity_register(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev, *reference = NULL;
+
+ if (list_empty(&mddev->disks))
+ return 0; /* nothing to do */
+ if (blk_get_integrity(mddev->gendisk))
+ return 0; /* already registered */
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ /* skip spares and non-functional disks */
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ if (rdev->raid_disk < 0)
+ continue;
+ /*
+ * If at least one rdev is not integrity capable, we can not
+ * enable data integrity for the md device.
+ */
+ if (!bdev_get_integrity(rdev->bdev))
+ return -EINVAL;
+ if (!reference) {
+ /* Use the first rdev as the reference */
+ reference = rdev;
+ continue;
+ }
+ /* does this rdev's profile match the reference profile? */
+ if (blk_integrity_compare(reference->bdev->bd_disk,
+ rdev->bdev->bd_disk) < 0)
+ return -EINVAL;
+ }
+ /*
+ * All component devices are integrity capable and have matching
+ * profiles, register the common profile for the md device.
+ */
+ if (blk_integrity_register(mddev->gendisk,
+ bdev_get_integrity(reference->bdev)) != 0) {
+ printk(KERN_ERR "md: failed to register integrity for %s\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ printk(KERN_NOTICE "md: data integrity on %s enabled\n",
+ mdname(mddev));
+ return 0;
+}
+EXPORT_SYMBOL(md_integrity_register);
+
+/* Disable data integrity if non-capable/non-matching disk is being added */
+void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
{
- struct mdk_personality *pers = mddev->pers;
- struct gendisk *disk = mddev->gendisk;
struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
- struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+ struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
- /* Data integrity passthrough not supported on RAID 4, 5 and 6 */
- if (pers && pers->level >= 4 && pers->level <= 6)
+ if (!bi_mddev) /* nothing to do */
return;
-
- /* If rdev is integrity capable, register profile for mddev */
- if (!bi_mddev && bi_rdev) {
- if (blk_integrity_register(disk, bi_rdev))
- printk(KERN_ERR "%s: %s Could not register integrity!\n",
- __func__, disk->disk_name);
- else
- printk(KERN_NOTICE "Enabling data integrity on %s\n",
- disk->disk_name);
+ if (rdev->raid_disk < 0) /* skip spares */
return;
- }
-
- /* Check that mddev and rdev have matching profiles */
- if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
- printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
- disk->disk_name, rdev->bdev->bd_disk->disk_name);
- printk(KERN_NOTICE "Disabling data integrity on %s\n",
- disk->disk_name);
- blk_integrity_unregister(disk);
- }
+ if (bi_rdev && blk_integrity_compare(mddev->gendisk,
+ rdev->bdev->bd_disk) >= 0)
+ return;
+ printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
+ blk_integrity_unregister(mddev->gendisk);
}
+EXPORT_SYMBOL(md_integrity_add_rdev);
static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
@@ -1591,7 +1642,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
/* May as well allow recovery to be retried once */
mddev->recovery_disabled = 0;
- md_integrity_check(rdev, mddev);
return 0;
fail:
@@ -1756,9 +1806,10 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
__u8 *uuid;
uuid = sb->set_uuid;
- printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
- ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
- KERN_INFO "md: Name: \"%s\" CT:%llu\n",
+ printk(KERN_INFO
+ "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
+ ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
+ "md: Name: \"%s\" CT:%llu\n",
le32_to_cpu(sb->major_version),
le32_to_cpu(sb->feature_map),
uuid[0], uuid[1], uuid[2], uuid[3],
@@ -1770,12 +1821,13 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
& MD_SUPERBLOCK_1_TIME_SEC_MASK);
uuid = sb->device_uuid;
- printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
+ printk(KERN_INFO
+ "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
" RO:%llu\n"
- KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
- ":%02x%02x%02x%02x%02x%02x\n"
- KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
- KERN_INFO "md: (MaxDev:%u) \n",
+ "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
+ ":%02x%02x%02x%02x%02x%02x\n"
+ "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
+ "md: (MaxDev:%u) \n",
le32_to_cpu(sb->level),
(unsigned long long)le64_to_cpu(sb->size),
le32_to_cpu(sb->raid_disks),
@@ -1923,17 +1975,14 @@ repeat:
/* otherwise we have to go forward and ... */
mddev->events ++;
if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
- /* .. if the array isn't clean, insist on an odd 'events' */
- if ((mddev->events&1)==0) {
- mddev->events++;
+ /* .. if the array isn't clean, an 'even' event must also go
+ * to spares. */
+ if ((mddev->events&1)==0)
nospares = 0;
- }
} else {
- /* otherwise insist on an even 'events' (for clean states) */
- if ((mddev->events&1)) {
- mddev->events++;
+ /* otherwise an 'odd' event must go to spares */
+ if ((mddev->events&1))
nospares = 0;
- }
}
}
@@ -2655,6 +2704,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
ssize_t rv = len;
struct mdk_personality *pers;
void *priv;
+ mdk_rdev_t *rdev;
if (mddev->pers == NULL) {
if (len == 0)
@@ -2734,6 +2784,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
mddev_suspend(mddev);
mddev->pers->stop(mddev);
module_put(mddev->pers->owner);
+ /* Invalidate devices that are now superfluous */
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= mddev->raid_disks) {
+ rdev->raid_disk = -1;
+ clear_bit(In_sync, &rdev->flags);
+ }
mddev->pers = pers;
mddev->private = priv;
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
@@ -3543,6 +3599,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
if (max < mddev->resync_min)
return -EINVAL;
if (max < mddev->resync_max &&
+ mddev->ro == 0 &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
@@ -3573,7 +3630,8 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
- if (mddev->pers->quiesce == NULL)
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
@@ -3601,7 +3659,8 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
- if (mddev->pers->quiesce == NULL)
+ if (mddev->pers == NULL ||
+ mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
@@ -3681,17 +3740,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
mddev->array_sectors = sectors;
set_capacity(mddev->gendisk, mddev->array_sectors);
- if (mddev->pers) {
- struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
-
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
- }
+ if (mddev->pers)
+ revalidate_disk(mddev->gendisk);
return len;
}
@@ -3844,11 +3894,9 @@ static int md_alloc(dev_t dev, char *name)
flush_scheduled_work();
mutex_lock(&disks_mutex);
- if (mddev->gendisk) {
- mutex_unlock(&disks_mutex);
- mddev_put(mddev);
- return -EEXIST;
- }
+ error = -EEXIST;
+ if (mddev->gendisk)
+ goto abort;
if (name) {
/* Need to ensure that 'name' is not a duplicate.
@@ -3860,17 +3908,15 @@ static int md_alloc(dev_t dev, char *name)
if (mddev2->gendisk &&
strcmp(mddev2->gendisk->disk_name, name) == 0) {
spin_unlock(&all_mddevs_lock);
- return -EEXIST;
+ goto abort;
}
spin_unlock(&all_mddevs_lock);
}
+ error = -ENOMEM;
mddev->queue = blk_alloc_queue(GFP_KERNEL);
- if (!mddev->queue) {
- mutex_unlock(&disks_mutex);
- mddev_put(mddev);
- return -ENOMEM;
- }
+ if (!mddev->queue)
+ goto abort;
mddev->queue->queuedata = mddev;
/* Can be unlocked because the queue is new: no concurrency */
@@ -3880,11 +3926,9 @@ static int md_alloc(dev_t dev, char *name)
disk = alloc_disk(1 << shift);
if (!disk) {
- mutex_unlock(&disks_mutex);
blk_cleanup_queue(mddev->queue);
mddev->queue = NULL;
- mddev_put(mddev);
- return -ENOMEM;
+ goto abort;
}
disk->major = MAJOR(mddev->unit);
disk->first_minor = unit << shift;
@@ -3906,16 +3950,22 @@ static int md_alloc(dev_t dev, char *name)
mddev->gendisk = disk;
error = kobject_init_and_add(&mddev->kobj, &md_ktype,
&disk_to_dev(disk)->kobj, "%s", "md");
- mutex_unlock(&disks_mutex);
- if (error)
+ if (error) {
+ /* This isn't possible, but as kobject_init_and_add is marked
+ * __must_check, we must do something with the result
+ */
printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
disk->disk_name);
- else {
+ error = 0;
+ }
+ abort:
+ mutex_unlock(&disks_mutex);
+ if (!error) {
kobject_uevent(&mddev->kobj, KOBJ_ADD);
mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
}
mddev_put(mddev);
- return 0;
+ return error;
}
static struct kobject *md_probe(dev_t dev, int *part, void *data)
@@ -4044,10 +4094,6 @@ static int do_md_run(mddev_t * mddev)
}
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
- if (pers->level >= 4 && pers->level <= 6)
- /* Cannot support integrity (yet) */
- blk_integrity_unregister(mddev->gendisk);
-
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
@@ -4185,6 +4231,7 @@ static int do_md_run(mddev_t * mddev)
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+ revalidate_disk(mddev->gendisk);
mddev->changed = 1;
md_new_event(mddev);
sysfs_notify_dirent(mddev->sysfs_state);
@@ -4256,12 +4303,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
struct gendisk *disk = mddev->gendisk;
mdk_rdev_t *rdev;
+ mutex_lock(&mddev->open_mutex);
if (atomic_read(&mddev->openers) > is_open) {
printk("md: %s still in use.\n",mdname(mddev));
- return -EBUSY;
- }
-
- if (mddev->pers) {
+ err = -EBUSY;
+ } else if (mddev->pers) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4318,8 +4364,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
if (mode == 1)
set_disk_ro(disk, 1);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ err = 0;
}
-
+out:
+ mutex_unlock(&mddev->open_mutex);
+ if (err)
+ return err;
/*
* Free resources if final stop
*/
@@ -4385,7 +4435,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
blk_integrity_unregister(disk);
md_new_event(mddev);
sysfs_notify_dirent(mddev->sysfs_state);
-out:
return err;
}
@@ -5083,18 +5132,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
return -ENOSPC;
}
rv = mddev->pers->resize(mddev, num_sectors);
- if (!rv) {
- struct block_device *bdev;
-
- bdev = bdget_disk(mddev->gendisk, 0);
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
- }
+ if (!rv)
+ revalidate_disk(mddev->gendisk);
return rv;
}
@@ -5480,12 +5519,12 @@ static int md_open(struct block_device *bdev, fmode_t mode)
}
BUG_ON(mddev != bdev->bd_disk->private_data);
- if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
+ if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
goto out;
err = 0;
atomic_inc(&mddev->openers);
- mddev_unlock(mddev);
+ mutex_unlock(&mddev->open_mutex);
check_disk_change(bdev);
out:
@@ -6334,10 +6373,16 @@ void md_do_sync(mddev_t *mddev)
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
- if (j >= mddev->resync_max)
- wait_event(mddev->recovery_wait,
- mddev->resync_max > j
- || kthread_should_stop());
+ while (j >= mddev->resync_max && !kthread_should_stop()) {
+ /* As this condition is controlled by user-space,
+ * we can block indefinitely, so use '_interruptible'
+ * to avoid triggering warnings.
+ */
+ flush_signals(current); /* just in case */
+ wait_event_interruptible(mddev->recovery_wait,
+ mddev->resync_max > j
+ || kthread_should_stop());
+ }
if (kthread_should_stop())
goto interrupted;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 9430a110db93..f8fc188bc762 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -223,6 +223,16 @@ struct mddev_s
* so we don't loop trying */
int in_sync; /* know to not need resync */
+ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
+ * that we are never stopping an array while it is open.
+ * 'reconfig_mutex' protects all other reconfiguration.
+ * These locks are separate due to conflicting interactions
+ * with bdev->bd_mutex.
+ * Lock ordering is:
+ * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+ * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
+ */
+ struct mutex open_mutex;
struct mutex reconfig_mutex;
atomic_t active; /* general refcount */
atomic_t openers; /* number of active opens */
@@ -431,5 +441,7 @@ extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(mddev_t *mddev);
+extern int md_integrity_register(mddev_t *mddev);
+void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index cbe368fa6598..7140909f6662 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -294,7 +294,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
for (path = first; path <= last; path++)
if ((p=conf->multipaths+path)->rdev == NULL) {
q = rdev->bdev->bd_disk->queue;
- blk_queue_stack_limits(mddev->queue, q);
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_sector to one PAGE, as
@@ -312,6 +313,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(In_sync, &rdev->flags);
rcu_assign_pointer(p->rdev, rdev);
err = 0;
+ md_integrity_add_rdev(rdev, mddev);
break;
}
@@ -344,7 +346,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
+ goto abort;
}
+ md_integrity_register(mddev);
}
abort:
@@ -463,9 +467,9 @@ static int multipath_run (mddev_t *mddev)
disk = conf->multipaths + disk_idx;
disk->rdev = rdev;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
- blk_queue_stack_limits(mddev->queue,
- rdev->bdev->bd_disk->queue);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, not that we ever expect a device with
* a merge_bvec_fn to be involved in multipath */
@@ -518,7 +522,7 @@ static int multipath_run (mddev_t *mddev)
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
-
+ md_integrity_register(mddev);
return 0;
out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ab4a489d8695..898e2bdfee47 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -170,8 +170,8 @@ static int create_strip_zones(mddev_t *mddev)
}
dev[j] = rdev1;
- blk_queue_stack_limits(mddev->queue,
- rdev1->bdev->bd_disk->queue);
+ disk_stack_limits(mddev->gendisk, rdev1->bdev,
+ rdev1->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_sector to one PAGE, as
* a one page request is never in violation.
@@ -250,6 +250,11 @@ static int create_strip_zones(mddev_t *mddev)
mddev->chunk_sectors << 9);
goto abort;
}
+
+ blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+ blk_queue_io_opt(mddev->queue,
+ (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
printk(KERN_INFO "raid0: done.\n");
mddev->private = conf;
return 0;
@@ -346,6 +351,7 @@ static int raid0_run(mddev_t *mddev)
blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
dump_zones(mddev);
+ md_integrity_register(mddev);
return 0;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 89939a7aef57..8726fd7ebce5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1123,8 +1123,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
for (mirror = first; mirror <= last; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
- blk_queue_stack_limits(mddev->queue,
- rdev->bdev->bd_disk->queue);
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_sector to one PAGE, as
* a one page request is never in violation.
@@ -1144,7 +1144,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
-
+ md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
}
@@ -1178,7 +1178,9 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
+ goto abort;
}
+ md_integrity_register(mddev);
}
abort:
@@ -1988,9 +1990,8 @@ static int run(mddev_t *mddev)
disk = conf->mirrors + disk_idx;
disk->rdev = rdev;
-
- blk_queue_stack_limits(mddev->queue,
- rdev->bdev->bd_disk->queue);
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_sector to one PAGE, as
* a one page request is never in violation.
@@ -2068,7 +2069,7 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
-
+ md_integrity_register(mddev);
return 0;
out_no_mem:
@@ -2133,6 +2134,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
+ revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->dev_sectors;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ae12ceafe10c..3d9020cf6f6e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1151,8 +1151,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
for ( ; mirror <= last ; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
- blk_queue_stack_limits(mddev->queue,
- rdev->bdev->bd_disk->queue);
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_sector to one PAGE, as
* a one page request is never in violation.
@@ -1170,6 +1170,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
break;
}
+ md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
}
@@ -1203,7 +1204,9 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
+ goto abort;
}
+ md_integrity_register(mddev);
}
abort:
@@ -2044,7 +2047,7 @@ raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
static int run(mddev_t *mddev)
{
conf_t *conf;
- int i, disk_idx;
+ int i, disk_idx, chunk_size;
mirror_info_t *disk;
mdk_rdev_t *rdev;
int nc, fc, fo;
@@ -2130,6 +2133,14 @@ static int run(mddev_t *mddev)
spin_lock_init(&conf->device_lock);
mddev->queue->queue_lock = &conf->device_lock;
+ chunk_size = mddev->chunk_sectors << 9;
+ blk_queue_io_min(mddev->queue, chunk_size);
+ if (conf->raid_disks % conf->near_copies)
+ blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
+ else
+ blk_queue_io_opt(mddev->queue, chunk_size *
+ (conf->raid_disks / conf->near_copies));
+
list_for_each_entry(rdev, &mddev->disks, same_set) {
disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
@@ -2138,9 +2149,8 @@ static int run(mddev_t *mddev)
disk = conf->mirrors + disk_idx;
disk->rdev = rdev;
-
- blk_queue_stack_limits(mddev->queue,
- rdev->bdev->bd_disk->queue);
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_sector to one PAGE, as
* a one page request is never in violation.
@@ -2218,6 +2228,7 @@ static int run(mddev_t *mddev)
if (conf->near_copies < mddev->raid_disks)
blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
+ md_integrity_register(mddev);
return 0;
out_free_conf:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cac6f4d3a143..9b00a229015a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3911,13 +3911,21 @@ static int make_request(struct request_queue *q, struct bio * bi)
goto retry;
}
}
- /* FIXME what if we get a false positive because these
- * are being updated.
- */
- if (logical_sector >= mddev->suspend_lo &&
+
+ if (bio_data_dir(bi) == WRITE &&
+ logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
release_stripe(sh);
- schedule();
+ /* As the suspend_* range is controlled by
+ * userspace, we want an interruptible
+ * wait.
+ */
+ flush_signals(current);
+ prepare_to_wait(&conf->wait_for_overlap,
+ &w, TASK_INTERRUPTIBLE);
+ if (logical_sector >= mddev->suspend_lo &&
+ logical_sector < mddev->suspend_hi)
+ schedule();
goto retry;
}
@@ -3989,7 +3997,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
- } else if (mddev->delta_disks > 0 &&
+ } else if (mddev->delta_disks >= 0 &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
sector_div(sector_nr, new_data_disks);
@@ -4203,6 +4211,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
return 0;
}
+ /* Allow raid5_quiesce to complete */
+ wait_event(conf->wait_for_overlap, conf->quiesce != 2);
+
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
@@ -4803,7 +4814,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
static int run(mddev_t *mddev)
{
raid5_conf_t *conf;
- int working_disks = 0;
+ int working_disks = 0, chunk_size;
mdk_rdev_t *rdev;
if (mddev->recovery_cp != MaxSector)
@@ -4844,7 +4855,26 @@ static int run(mddev_t *mddev)
(old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
- if (here_new >= here_old) {
+ if (mddev->delta_disks == 0) {
+ /* We cannot be sure it is safe to start an in-place
+ * reshape. It is only safe if user-space if monitoring
+ * and taking constant backups.
+ * mdadm always starts a situation like this in
+ * readonly mode so it can take control before
+ * allowing any writes. So just check for that.
+ */
+ if ((here_new * mddev->new_chunk_sectors !=
+ here_old * mddev->chunk_sectors) ||
+ mddev->ro == 0) {
+ printk(KERN_ERR "raid5: in-place reshape must be started"
+ " in read-only mode - aborting\n");
+ return -EINVAL;
+ }
+ } else if (mddev->delta_disks < 0
+ ? (here_new * mddev->new_chunk_sectors <=
+ here_old * mddev->chunk_sectors)
+ : (here_new * mddev->new_chunk_sectors >=
+ here_old * mddev->chunk_sectors)) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "raid5: reshape_position too early for "
"auto-recovery - aborting.\n");
@@ -4958,6 +4988,14 @@ static int run(mddev_t *mddev)
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
+ chunk_size = mddev->chunk_sectors << 9;
+ blk_queue_io_min(mddev->queue, chunk_size);
+ blk_queue_io_opt(mddev->queue, chunk_size *
+ (conf->raid_disks - conf->max_degraded));
+
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
return 0;
abort:
@@ -5185,6 +5223,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
+ revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -5330,7 +5369,7 @@ static int raid5_start_reshape(mddev_t *mddev)
spin_unlock_irqrestore(&conf->device_lock, flags);
}
mddev->raid_disks = conf->raid_disks;
- mddev->reshape_position = 0;
+ mddev->reshape_position = conf->reshape_progress;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -5385,7 +5424,6 @@ static void end_reshape(raid5_conf_t *conf)
*/
static void raid5_finish_reshape(mddev_t *mddev)
{
- struct block_device *bdev;
raid5_conf_t *conf = mddev->private;
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5394,15 +5432,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
-
- bdev = bdget_disk(mddev->gendisk, 0);
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
+ revalidate_disk(mddev->gendisk);
} else {
int d;
mddev->degraded = conf->raid_disks;
@@ -5413,8 +5443,15 @@ static void raid5_finish_reshape(mddev_t *mddev)
mddev->degraded--;
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
- d++)
- raid5_remove_disk(mddev, d);
+ d++) {
+ mdk_rdev_t *rdev = conf->disks[d].rdev;
+ if (rdev && raid5_remove_disk(mddev, d) == 0) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ rdev->raid_disk = -1;
+ }
+ }
}
mddev->layout = conf->algorithm;
mddev->chunk_sectors = conf->chunk_sectors;
@@ -5434,12 +5471,18 @@ static void raid5_quiesce(mddev_t *mddev, int state)
case 1: /* stop all writes */
spin_lock_irq(&conf->device_lock);
- conf->quiesce = 1;
+ /* '2' tells resync/reshape to pause so that all
+ * active stripes can drain
+ */
+ conf->quiesce = 2;
wait_event_lock_irq(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
conf->device_lock, /* nothing */);
+ conf->quiesce = 1;
spin_unlock_irq(&conf->device_lock);
+ /* allow reshape to continue */
+ wake_up(&conf->wait_for_overlap);
break;
case 0: /* re-enable writes */