diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bitmap.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-integrity.c | 22 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 29 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 35 | ||||
-rw-r--r-- | drivers/md/dm-verity-fec.c | 21 | ||||
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 12 | ||||
-rw-r--r-- | drivers/md/dm-zoned-reclaim.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-zoned-target.c | 8 | ||||
-rw-r--r-- | drivers/md/dm.c | 12 | ||||
-rw-r--r-- | drivers/md/md.c | 7 | ||||
-rw-r--r-- | drivers/md/md.h | 58 | ||||
-rw-r--r-- | drivers/md/raid1-10.c | 81 | ||||
-rw-r--r-- | drivers/md/raid1.c | 68 | ||||
-rw-r--r-- | drivers/md/raid10.c | 25 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 61 | ||||
-rw-r--r-- | drivers/md/raid5-ppl.c | 2 | ||||
-rw-r--r-- | drivers/md/raid5.c | 15 |
19 files changed, 259 insertions, 207 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index f4eace5ea184..40f3cd7eab0f 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -156,7 +156,8 @@ static int read_sb_page(struct mddev *mddev, loff_t offset, rdev_for_each(rdev, mddev) { if (! test_bit(In_sync, &rdev->flags) - || test_bit(Faulty, &rdev->flags)) + || test_bit(Faulty, &rdev->flags) + || test_bit(Bitmap_sync, &rdev->flags)) continue; target = offset + index * (PAGE_SIZE/512); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 850ff6c67994..44f4a8ac95bd 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1258,8 +1258,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); */ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) { - blk_status_t a; - int f; + int a, f; unsigned long buffers_processed = 0; struct dm_buffer *b, *tmp; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 1b224aa9cf15..3acce09bba35 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1587,16 +1587,18 @@ retry: if (likely(ic->mode == 'J')) { if (dio->write) { unsigned next_entry, i, pos; - unsigned ws, we; + unsigned ws, we, range_sectors; - dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors); + dio->range.n_sectors = min(dio->range.n_sectors, + ic->free_sectors << ic->sb->log2_sectors_per_block); if (unlikely(!dio->range.n_sectors)) goto sleep; - ic->free_sectors -= dio->range.n_sectors; + range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block; + ic->free_sectors -= range_sectors; journal_section = ic->free_section; journal_entry = ic->free_section_entry; - next_entry = ic->free_section_entry + dio->range.n_sectors; + next_entry = ic->free_section_entry + range_sectors; ic->free_section_entry = next_entry % ic->journal_section_entries; ic->free_section += next_entry / ic->journal_section_entries; ic->n_uncommitted_sections += next_entry / ic->journal_section_entries; @@ -1727,6 +1729,8 @@ static void pad_uncommitted(struct dm_integrity_c *ic) wraparound_section(ic, &ic->free_section); ic->n_uncommitted_sections++; } + WARN_ON(ic->journal_sections * ic->journal_section_entries != + (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors); } static void integrity_commit(struct work_struct *w) @@ -1821,6 +1825,9 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, { unsigned i, j, n; struct journal_completion comp; + struct blk_plug plug; + + blk_start_plug(&plug); comp.ic = ic; comp.in_flight = (atomic_t)ATOMIC_INIT(1); @@ -1945,6 +1952,8 @@ skip_io: dm_bufio_write_dirty_buffers_async(ic->bufio); + blk_finish_plug(&plug); + complete_journal_op(&comp); wait_for_completion_io(&comp.comp); @@ -3019,6 +3028,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Block size doesn't match the information in superblock"; goto bad; } + if (!le32_to_cpu(ic->sb->journal_sections)) { + r = -EINVAL; + ti->error = "Corrupted superblock, journal_sections is 0"; + goto bad; + } /* make sure that ti->max_io_len doesn't overflow */ if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS || ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) { diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 0e8ab5bb3575..d24e4b05f5da 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -504,7 +504,6 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, if (queue_dying) { atomic_inc(&m->pg_init_in_progress); activate_or_offline_path(pgpath); - return DM_MAPIO_REQUEUE; } return DM_MAPIO_DELAY_REQUEUE; } @@ -1458,7 +1457,6 @@ static int noretry_error(blk_status_t error) case BLK_STS_TARGET: case BLK_STS_NEXUS: case BLK_STS_MEDIUM: - case BLK_STS_RESOURCE: return 1; } diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2e10c2f13a34..5bfe285ea9d1 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -208,6 +208,7 @@ struct raid_dev { #define RT_FLAG_RS_BITMAP_LOADED 2 #define RT_FLAG_UPDATE_SBS 3 #define RT_FLAG_RESHAPE_RS 4 +#define RT_FLAG_RS_SUSPENDED 5 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -564,9 +565,10 @@ static const char *raid10_md_layout_to_format(int layout) if (__raid10_near_copies(layout) > 1) return "near"; - WARN_ON(__raid10_far_copies(layout) < 2); + if (__raid10_far_copies(layout) > 1) + return "far"; - return "far"; + return "unknown"; } /* Return md raid10 algorithm for @name */ @@ -2540,11 +2542,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (!freshest) return 0; - if (validate_raid_redundancy(rs)) { - rs->ti->error = "Insufficient redundancy to activate array"; - return -EINVAL; - } - /* * Validation of the freshest device provides the source of * validation for the remaining devices. @@ -2553,6 +2550,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (super_validate(rs, freshest)) return -EINVAL; + if (validate_raid_redundancy(rs)) { + rs->ti->error = "Insufficient redundancy to activate array"; + return -EINVAL; + } + rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && rdev != freshest && @@ -3168,6 +3170,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) } mddev_suspend(&rs->md); + set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ if (rs_is_raid456(rs)) { @@ -3625,7 +3628,7 @@ static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; - if (!rs->md.suspended) + if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) mddev_suspend(&rs->md); rs->md.ro = 1; @@ -3759,7 +3762,7 @@ static int rs_start_reshape(struct raid_set *rs) return r; /* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */ - if (mddev->suspended) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) mddev_resume(mddev); /* @@ -3786,8 +3789,8 @@ static int rs_start_reshape(struct raid_set *rs) } /* Suspend because a resume will happen in raid_resume() */ - if (!mddev->suspended) - mddev_suspend(mddev); + set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); + mddev_suspend(mddev); /* * Now reshape got set up, update superblocks to @@ -3883,13 +3886,13 @@ static void raid_resume(struct dm_target *ti) if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (mddev->suspended) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) mddev_resume(mddev); } static struct target_type raid_target = { .name = "raid", - .version = {1, 11, 1}, + .version = {1, 12, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a39bcd9b982a..28a4071cdf85 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -20,6 +20,7 @@ #include <linux/atomic.h> #include <linux/blk-mq.h> #include <linux/mount.h> +#include <linux/dax.h> #define DM_MSG_PREFIX "table" @@ -1630,6 +1631,37 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) return false; } +static int device_dax_write_cache_enabled(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct dax_device *dax_dev = dev->dax_dev; + + if (!dax_dev) + return false; + + if (dax_write_cache_enabled(dax_dev)) + return true; + return false; +} + +static int dm_table_supports_dax_write_cache(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, + device_dax_write_cache_enabled, NULL)) + return true; + } + + return false; +} + static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1785,6 +1817,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); + if (dm_table_supports_dax_write_cache(t)) + dax_write_cache(t->md->dax_dev, true); + /* Ensure that all underlying devices are non-rotational. */ if (dm_table_all_devices_attribute(t, device_is_nonrot)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 504ba3fa328b..e13f90832b6b 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -308,19 +308,14 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) { unsigned n; - if (!fio->rs) { - fio->rs = mempool_alloc(v->fec->rs_pool, 0); - if (unlikely(!fio->rs)) { - DMERR("failed to allocate RS"); - return -ENOMEM; - } - } + if (!fio->rs) + fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO); fec_for_each_prealloc_buffer(n) { if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO); + fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT); if (unlikely(!fio->bufs[n])) { DMERR("failed to allocate FEC buffer"); return -ENOMEM; @@ -332,22 +327,16 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO); + fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT); /* we can manage with even one buffer if necessary */ if (unlikely(!fio->bufs[n])) break; } fio->nbufs = n; - if (!fio->output) { + if (!fio->output) fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO); - if (!fio->output) { - DMERR("failed to allocate FEC page"); - return -ENOMEM; - } - } - return 0; } diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 884ff7c170a0..a4fa2ada6883 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -624,7 +624,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); return ret; } @@ -658,7 +658,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); return ret; } @@ -722,7 +722,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); goto out; } @@ -927,7 +927,7 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); } - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_NOIO); if (!page) return -ENOMEM; @@ -1183,7 +1183,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) /* Get zone information from disk */ ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), - &blkz, &nr_blkz, GFP_KERNEL); + &blkz, &nr_blkz, GFP_NOIO); if (ret) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); @@ -1257,7 +1257,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) ret = blkdev_reset_zones(dev->bdev, dmz_start_sect(zmd, zone), - dev->zone_nr_sectors, GFP_KERNEL); + dev->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", dmz_id(zmd, zone), ret); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 05c0a126f5c8..44a119e12f1a 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -75,7 +75,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, nr_blocks = block - wp_block; ret = blkdev_issue_zeroout(zrc->dev->bdev, dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), - dmz_blk2sect(nr_blocks), GFP_NOFS, false); + dmz_blk2sect(nr_blocks), GFP_NOIO, 0); if (ret) { dmz_dev_err(zrc->dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 2b538fa817f4..b08bbbd4d902 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -541,7 +541,7 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) int ret; /* Create a new chunk work */ - cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOFS); + cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO); if (!cw) goto out; @@ -588,7 +588,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) bio->bi_bdev = dev->bdev; - if (!nr_sectors && (bio_op(bio) != REQ_OP_FLUSH) && (bio_op(bio) != REQ_OP_WRITE)) + if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE) return DM_MAPIO_REMAPPED; /* The BIO should be block aligned */ @@ -603,7 +603,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) bioctx->status = BLK_STS_OK; /* Set the BIO pending in the flush list */ - if (bio_op(bio) == REQ_OP_FLUSH || (!nr_sectors && bio_op(bio) == REQ_OP_WRITE)) { + if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) { spin_lock(&dmz->flush_lock); bio_list_add(&dmz->flush_list, bio); spin_unlock(&dmz->flush_lock); @@ -785,7 +785,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Chunk BIO work */ mutex_init(&dmz->chunk_lock); - INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOFS); + INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL); dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND, 0, dev->name); if (!dmz->chunk_wq) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2edbcc2d7d3f..d669fddd9290 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -27,16 +27,6 @@ #define DM_MSG_PREFIX "core" -#ifdef CONFIG_PRINTK -/* - * ratelimit state to be used in DMXXX_LIMIT(). - */ -DEFINE_RATELIMIT_STATE(dm_ratelimit_state, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); -EXPORT_SYMBOL(dm_ratelimit_state); -#endif - /* * Cookies are numeric values sent with CHANGE and REMOVE * uevents while resuming, removing or renaming the device. @@ -1523,7 +1513,7 @@ static void __split_and_process_bio(struct mapped_device *md, } /* drop the extra reference count */ - dec_pending(ci.io, error); + dec_pending(ci.io, errno_to_blk_status(error)); } /*----------------------------------------------------------------- * CRUD END diff --git a/drivers/md/md.c b/drivers/md/md.c index 8cdca0296749..b01e458d31e9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2287,7 +2287,7 @@ static void export_array(struct mddev *mddev) static bool set_in_sync(struct mddev *mddev) { - WARN_ON_ONCE(!spin_is_locked(&mddev->lock)); + WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock)); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); @@ -7996,7 +7996,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (mddev->safemode == 1) mddev->safemode = 0; /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ - if (mddev->in_sync || !mddev->sync_checkers) { + if (mddev->in_sync || mddev->sync_checkers) { spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; @@ -8656,6 +8656,9 @@ void md_check_recovery(struct mddev *mddev) if (mddev_trylock(mddev)) { int spares = 0; + if (!mddev->external && mddev->safemode == 1) + mddev->safemode = 0; + if (mddev->ro) { struct md_rdev *rdev; if (!mddev->external && mddev->in_sync) diff --git a/drivers/md/md.h b/drivers/md/md.h index 991f0fe2dcc6..09db03455801 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -134,7 +134,9 @@ enum flag_bits { Faulty, /* device is known to have a fault */ In_sync, /* device is in_sync with rest of array */ Bitmap_sync, /* ..actually, not quite In_sync. Need a - * bitmap-based recovery to get fully in sync + * bitmap-based recovery to get fully in sync. + * The bit is only meaningful before device + * has been passed to pers->hot_add_disk. */ WriteMostly, /* Avoid reading if at all possible */ AutoDetected, /* added by auto-detect */ @@ -729,58 +731,4 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) mddev->queue->limits.max_write_zeroes_sectors = 0; } - -/* Maximum size of each resync request */ -#define RESYNC_BLOCK_SIZE (64*1024) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) - -/* for managing resync I/O pages */ -struct resync_pages { - unsigned idx; /* for get/put page from the pool */ - void *raid_bio; - struct page *pages[RESYNC_PAGES]; -}; - -static inline int resync_alloc_pages(struct resync_pages *rp, - gfp_t gfp_flags) -{ - int i; - - for (i = 0; i < RESYNC_PAGES; i++) { - rp->pages[i] = alloc_page(gfp_flags); - if (!rp->pages[i]) - goto out_free; - } - - return 0; - -out_free: - while (--i >= 0) - put_page(rp->pages[i]); - return -ENOMEM; -} - -static inline void resync_free_pages(struct resync_pages *rp) -{ - int i; - - for (i = 0; i < RESYNC_PAGES; i++) - put_page(rp->pages[i]); -} - -static inline void resync_get_all_pages(struct resync_pages *rp) -{ - int i; - - for (i = 0; i < RESYNC_PAGES; i++) - get_page(rp->pages[i]); -} - -static inline struct page *resync_fetch_page(struct resync_pages *rp, - unsigned idx) -{ - if (WARN_ON_ONCE(idx >= RESYNC_PAGES)) - return NULL; - return rp->pages[idx]; -} #endif /* _MD_MD_H */ diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c new file mode 100644 index 000000000000..9f2670b45f31 --- /dev/null +++ b/drivers/md/raid1-10.c @@ -0,0 +1,81 @@ +/* Maximum size of each resync request */ +#define RESYNC_BLOCK_SIZE (64*1024) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) + +/* for managing resync I/O pages */ +struct resync_pages { + void *raid_bio; + struct page *pages[RESYNC_PAGES]; +}; + +static inline int resync_alloc_pages(struct resync_pages *rp, + gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) { + rp->pages[i] = alloc_page(gfp_flags); + if (!rp->pages[i]) + goto out_free; + } + + return 0; + +out_free: + while (--i >= 0) + put_page(rp->pages[i]); + return -ENOMEM; +} + +static inline void resync_free_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + put_page(rp->pages[i]); +} + +static inline void resync_get_all_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + get_page(rp->pages[i]); +} + +static inline struct page *resync_fetch_page(struct resync_pages *rp, + unsigned idx) +{ + if (WARN_ON_ONCE(idx >= RESYNC_PAGES)) + return NULL; + return rp->pages[idx]; +} + +/* + * 'strct resync_pages' stores actual pages used for doing the resync + * IO, and it is per-bio, so make .bi_private points to it. + */ +static inline struct resync_pages *get_resync_pages(struct bio *bio) +{ + return bio->bi_private; +} + +/* generally called after bio_reset() for reseting bvec */ +static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, + int size) +{ + int idx = 0; + + /* initialize bvec table again */ + do { + struct page *page = resync_fetch_page(rp, idx); + int len = min_t(int, size, PAGE_SIZE); + + /* + * won't fail because the vec table is big + * enough to hold all these pages + */ + bio_add_page(bio, page, len, 0); + size -= len; + } while (idx++ < RESYNC_PAGES && size > 0); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3febfc8391fb..f50958ded9f0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -81,14 +81,7 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #define raid1_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) -/* - * 'strct resync_pages' stores actual pages used for doing the resync - * IO, and it is per-bio, so make .bi_private points to it. - */ -static inline struct resync_pages *get_resync_pages(struct bio *bio) -{ - return bio->bi_private; -} +#include "raid1-10.c" /* * for resync bio, r1bio pointer can be retrieved from the per-bio @@ -170,7 +163,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) resync_get_all_pages(rp); } - rp->idx = 0; rp->raid_bio = r1_bio; bio->bi_private = rp; } @@ -492,10 +484,6 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - /* we release behind master bio when all write are done */ - if (r1_bio->behind_master_bio == bio) - to_put = NULL; - if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -802,8 +790,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ @@ -1088,7 +1075,7 @@ static void unfreeze_array(struct r1conf *conf) wake_up(&conf->wait_barrier); } -static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, +static void alloc_behind_master_bio(struct r1bio *r1_bio, struct bio *bio) { int size = bio->bi_iter.bi_size; @@ -1098,11 +1085,13 @@ static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev); if (!behind_bio) - goto fail; + return; /* discard op, we don't support writezero/writesame yet */ - if (!bio_has_data(bio)) + if (!bio_has_data(bio)) { + behind_bio->bi_iter.bi_size = size; goto skip_copy; + } while (i < vcnt && size) { struct page *page; @@ -1123,14 +1112,13 @@ skip_copy: r1_bio->behind_master_bio = behind_bio;; set_bit(R1BIO_BehindIO, &r1_bio->state); - return behind_bio; + return; free_pages: pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_iter.bi_size); bio_free_pages(behind_bio); -fail: - return behind_bio; + bio_put(behind_bio); } struct raid1_plug_cb { @@ -1483,7 +1471,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { - mbio = alloc_behind_master_bio(r1_bio, bio); + alloc_behind_master_bio(r1_bio, bio); } bitmap_startwrite(bitmap, r1_bio->sector, @@ -1493,14 +1481,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, first_clone = 0; } - if (!mbio) { - if (r1_bio->behind_master_bio) - mbio = bio_clone_fast(r1_bio->behind_master_bio, - GFP_NOIO, - mddev->bio_set); - else - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - } + if (r1_bio->behind_master_bio) + mbio = bio_clone_fast(r1_bio->behind_master_bio, + GFP_NOIO, mddev->bio_set); + else + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); if (r1_bio->behind_master_bio) { if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) @@ -2086,10 +2071,7 @@ static void process_checks(struct r1bio *r1_bio) /* Fix variable parts of all bios */ vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); for (i = 0; i < conf->raid_disks * 2; i++) { - int j; - int size; blk_status_t status; - struct bio_vec *bi; struct bio *b = r1_bio->bios[i]; struct resync_pages *rp = get_resync_pages(b); if (b->bi_end_io != end_sync_read) @@ -2098,8 +2080,6 @@ static void process_checks(struct r1bio *r1_bio) status = b->bi_status; bio_reset(b); b->bi_status = status; - b->bi_vcnt = vcnt; - b->bi_iter.bi_size = r1_bio->sectors << 9; b->bi_iter.bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; b->bi_bdev = conf->mirrors[i].rdev->bdev; @@ -2107,15 +2087,8 @@ static void process_checks(struct r1bio *r1_bio) rp->raid_bio = r1_bio; b->bi_private = rp; - size = b->bi_iter.bi_size; - bio_for_each_segment_all(bi, b, j) { - bi->bv_offset = 0; - if (size > PAGE_SIZE) - bi->bv_len = PAGE_SIZE; - else - bi->bv_len = size; - size -= PAGE_SIZE; - } + /* initialize bvec table again */ + md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9); } for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && @@ -2366,8 +2339,6 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) wbio = bio_clone_fast(r1_bio->behind_master_bio, GFP_NOIO, mddev->bio_set); - /* We really need a _all clone */ - wbio->bi_iter = (struct bvec_iter){ 0 }; } else { wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, mddev->bio_set); @@ -2619,6 +2590,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ int idx = sector_to_idx(sector_nr); + int page_idx = 0; if (!conf->r1buf_pool) if (init_resync(conf)) @@ -2846,7 +2818,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; rp = get_resync_pages(bio); if (bio->bi_end_io) { - page = resync_fetch_page(rp, rp->idx++); + page = resync_fetch_page(rp, page_idx); /* * won't fail because the vec table is big @@ -2858,7 +2830,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, nr_sectors += len>>9; sector_nr += len>>9; sync_blocks -= (len>>9); - } while (get_resync_pages(r1_bio->bios[disk]->bi_private)->idx < RESYNC_PAGES); + } while (++page_idx < RESYNC_PAGES); r1_bio->sectors = nr_sectors; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5026e7ad51d3..f55d4cc085f6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -110,14 +110,7 @@ static void end_reshape(struct r10conf *conf); #define raid10_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) -/* - * 'strct resync_pages' stores actual pages used for doing the resync - * IO, and it is per-bio, so make .bi_private points to it. - */ -static inline struct resync_pages *get_resync_pages(struct bio *bio) -{ - return bio->bi_private; -} +#include "raid1-10.c" /* * for resync bio, r10bio pointer can be retrieved from the per-bio @@ -221,7 +214,6 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) resync_get_all_pages(rp); } - rp->idx = 0; rp->raid_bio = r10_bio; bio->bi_private = rp; if (rbio) { @@ -913,8 +905,7 @@ static void flush_pending_writes(struct r10conf *conf) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ @@ -1098,8 +1089,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ @@ -2087,8 +2077,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) rp = get_resync_pages(tbio); bio_reset(tbio); - tbio->bi_vcnt = vcnt; - tbio->bi_iter.bi_size = fbio->bi_iter.bi_size; + md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size); + rp->raid_bio = r10_bio; tbio->bi_private = rp; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; @@ -2853,6 +2843,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sectors_skipped = 0; int chunks_skipped = 0; sector_t chunk_mask = conf->geo.chunk_mask; + int page_idx = 0; if (!conf->r10buf_pool) if (init_resync(conf)) @@ -3355,7 +3346,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, break; for (bio= biolist ; bio ; bio=bio->bi_next) { struct resync_pages *rp = get_resync_pages(bio); - page = resync_fetch_page(rp, rp->idx++); + page = resync_fetch_page(rp, page_idx); /* * won't fail because the vec table is big enough * to hold all these pages @@ -3364,7 +3355,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } nr_sectors += len>>9; sector_nr += len>>9; - } while (get_resync_pages(biolist)->idx < RESYNC_PAGES); + } while (++page_idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; while (biolist) { diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index bfa1e907c472..2dcbafa8e66c 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -236,9 +236,10 @@ struct r5l_io_unit { bool need_split_bio; struct bio *split_bio; - unsigned int has_flush:1; /* include flush request */ - unsigned int has_fua:1; /* include fua request */ - unsigned int has_null_flush:1; /* include empty flush request */ + unsigned int has_flush:1; /* include flush request */ + unsigned int has_fua:1; /* include fua request */ + unsigned int has_null_flush:1; /* include null flush request */ + unsigned int has_flush_payload:1; /* include flush payload */ /* * io isn't sent yet, flush/fua request can only be submitted till it's * the first IO in running_ios list @@ -571,6 +572,8 @@ static void r5l_log_endio(struct bio *bio) struct r5l_io_unit *io_deferred; struct r5l_log *log = io->log; unsigned long flags; + bool has_null_flush; + bool has_flush_payload; if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); @@ -580,6 +583,16 @@ static void r5l_log_endio(struct bio *bio) spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); + + /* + * if the io doesn't not have null_flush or flush payload, + * it is not safe to access it after releasing io_list_lock. + * Therefore, it is necessary to check the condition with + * the lock held. + */ + has_null_flush = io->has_null_flush; + has_flush_payload = io->has_flush_payload; + if (log->need_cache_flush && !list_empty(&io->stripe_list)) r5l_move_to_end_ios(log); else @@ -600,19 +613,23 @@ static void r5l_log_endio(struct bio *bio) if (log->need_cache_flush) md_wakeup_thread(log->rdev->mddev->thread); - if (io->has_null_flush) { + /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ + if (has_null_flush) { struct bio *bi; WARN_ON(bio_list_empty(&io->flush_barriers)); while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { bio_endio(bi); - atomic_dec(&io->pending_stripe); + if (atomic_dec_and_test(&io->pending_stripe)) { + __r5l_stripe_write_finished(io); + return; + } } } - - /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ - if (atomic_read(&io->pending_stripe) == 0) - __r5l_stripe_write_finished(io); + /* decrease pending_stripe for flush payload */ + if (has_flush_payload) + if (atomic_dec_and_test(&io->pending_stripe)) + __r5l_stripe_write_finished(io); } static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) @@ -881,6 +898,11 @@ static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) payload->size = cpu_to_le32(sizeof(__le64)); payload->flush_stripes[0] = cpu_to_le64(sect); io->meta_offset += meta_size; + /* multiple flush payloads count as one pending_stripe */ + if (!io->has_flush_payload) { + io->has_flush_payload = 1; + atomic_inc(&io->pending_stripe); + } mutex_unlock(&log->io_mutex); } @@ -2540,23 +2562,32 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) */ int r5c_journal_mode_set(struct mddev *mddev, int mode) { - struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; - - if (!log) - return -ENODEV; + struct r5conf *conf; + int err; if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || mode > R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf || !conf->log) { + mddev_unlock(mddev); + return -ENODEV; + } + if (raid5_calc_degraded(conf) > 0 && - mode == R5C_JOURNAL_MODE_WRITE_BACK) + mode == R5C_JOURNAL_MODE_WRITE_BACK) { + mddev_unlock(mddev); return -EINVAL; + } mddev_suspend(mddev); conf->log->r5c_journal_mode = mode; mddev_resume(mddev); + mddev_unlock(mddev); pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", mdname(mddev), mode, r5c_journal_mode_str[mode]); diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 77cce3573aa8..44ad5baf3206 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1150,7 +1150,7 @@ int ppl_init_log(struct r5conf *conf) goto err; } - ppl_conf->bs = bioset_create(conf->raid_disks, 0, 0); + ppl_conf->bs = bioset_create(conf->raid_disks, 0, BIOSET_NEED_BVECS); if (!ppl_conf->bs) { ret = -ENOMEM; goto err; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2ceb338b094b..0fc2748aaf95 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3381,9 +3381,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); - bio_endio(bi); + bio_io_error(bi); bi = nextbi; } if (bitmap_end) @@ -3403,9 +3402,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); - bio_endio(bi); + bio_io_error(bi); bi = bi2; } @@ -3429,8 +3427,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_status = BLK_STS_IOERR; - bio_endio(bi); + bio_io_error(bi); bi = nextbi; } } @@ -6237,6 +6234,8 @@ static void raid5_do_work(struct work_struct *work) pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); + + async_tx_issue_pending_all(); blk_finish_plug(&plug); pr_debug("--- raid5worker inactive\n"); @@ -7951,12 +7950,10 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - rdev_for_each(rdev, conf->mddev) - rdev->data_offset = rdev->new_data_offset; + md_finish_reshape(conf->mddev); smp_wmb(); conf->reshape_progress = MaxSector; conf->mddev->reshape_position = MaxSector; |