author     Linus Torvalds <torvalds@linux-foundation.org>   2023-08-29 20:21:42 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2023-08-29 20:21:42 -0700
commit     3d3dfeb3aec7b612d266d500c82054f1fded4980 (patch)
tree       11649eab5c74deb74e6e0879613e8053ae3b9970 /drivers/md
parent     c1b7fcf3f6d94c2c3528bf77054bf174a5ef63d7 (diff)
parent     146afeb235ccec10c17ad8ea26327c0c79dbd968 (diff)
Merge tag 'for-6.6/block-2023-08-28' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
"Pretty quiet round for this release. This contains:
- Add support for zoned storage to ublk (Andreas, Ming)
- Series improving performance for drivers that mark themselves as
needing a blocking context for issue (Bart)
- Cleanup the flush logic (Chengming)
- sed opal keyring support (Greg)
- Fixes and improvements to the integrity support (Jinyoung)
- Add some exports for bcachefs that we can hopefully delete again in
the future (Kent)
- deadline throttling fix (Zhiguo)
- Series allowing building the kernel without buffer_head support
(Christoph)
- Sanitize the bio page adding flow (Christoph)
- Write back cache fixes (Christoph)
- MD updates via Song:
- Fix perf regression for raid0 large sequential writes (Jan)
- Fix split bio iostat for raid0 (David)
- Various raid1 fixes (Heinz, Xueshi)
- raid6test build fixes (WANG)
- Deprecate bitmap file support (Christoph)
- Fix deadlock with md sync thread (Yu)
- Refactor md io accounting (Yu)
- Various non-urgent fixes (Li, Yu, Jack)
- Various fixes and cleanups (Arnd, Azeem, Chengming, Damien, Li,
Ming, Nitesh, Ruan, Tejun, Thomas, Xu)"
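[Editor's note: the "blocking context for issue" item above refers to drivers whose
->queue_rq() may sleep. As a hedged, self-contained sketch (not code from this merge;
all "example_" names are hypothetical), this is how such a driver declares that to
blk-mq via BLK_MQ_F_BLOCKING, which is the case Bart's series optimizes:]

    #include <linux/blk-mq.h>
    #include <linux/string.h>

    static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
                                         const struct blk_mq_queue_data *bd)
    {
            /* May sleep here, because BLK_MQ_F_BLOCKING is set below. */
            return BLK_STS_OK;
    }

    static const struct blk_mq_ops example_mq_ops = {
            .queue_rq = example_queue_rq,
    };

    static int example_init_tag_set(struct blk_mq_tag_set *set)
    {
            memset(set, 0, sizeof(*set));
            set->ops = &example_mq_ops;
            set->nr_hw_queues = 1;
            set->queue_depth = 128;
            set->numa_node = NUMA_NO_NODE;
            /* Issue happens from process context, so blk-mq must not call
             * ->queue_rq() with preemption disabled. */
            set->flags = BLK_MQ_F_BLOCKING;
            return blk_mq_alloc_tag_set(set);
    }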
* tag 'for-6.6/block-2023-08-28' of git://git.kernel.dk/linux: (113 commits)
block: use strscpy() to instead of strncpy()
block: sed-opal: keyring support for SED keys
block: sed-opal: Implement IOC_OPAL_REVERT_LSP
block: sed-opal: Implement IOC_OPAL_DISCOVERY
blk-mq: prealloc tags when increase tagset nr_hw_queues
blk-mq: delete redundant tagset map update when fallback
blk-mq: fix tags leak when shrink nr_hw_queues
ublk: zoned: support REQ_OP_ZONE_RESET_ALL
md: raid0: account for split bio in iostat accounting
md/raid0: Fix performance regression for large sequential writes
md/raid0: Factor out helper for mapping and submitting a bio
md raid1: allow writebehind to work on any leg device set WriteMostly
md/raid1: hold the barrier until handle_read_error() finishes
md/raid1: free the r1bio before waiting for blocked rdev
md/raid1: call free_r1bio() before allow_barrier() in raid_end_bio_io()
blk-cgroup: Fix NULL deref caused by blkg_policy_data being installed before init
drivers/rnbd: restore sysfs interface to rnbd-client
md/raid5-cache: fix null-ptr-deref for r5l_flush_stripe_to_raid()
raid6: test: only check for Altivec if building on powerpc hosts
raid6: test: make sure all intermediate and artifact files are .gitignored
...
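[Editor's note: on the strscpy() commit at the top of the list — strscpy() always
NUL-terminates the destination and reports truncation as -E2BIG, while strncpy()
neither terminates on truncation nor signals it. A minimal sketch of the calling
pattern, assuming a hypothetical copy_disk_name() helper; the real implementation
lives in the kernel's string headers:]

    #include <linux/string.h>
    #include <linux/errno.h>
    #include <linux/printk.h>

    static void copy_disk_name(char *dst, size_t dst_size, const char *src)
    {
            /* Returns the copied length, or -E2BIG if src didn't fit. */
            ssize_t ret = strscpy(dst, src, dst_size);

            if (ret == -E2BIG)
                    pr_debug("disk name %s truncated\n", src);
            /* dst is NUL-terminated either way; with strncpy() a
             * full-length source would have left no terminator. */
    }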
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig        |  11
-rw-r--r--  drivers/md/dm-crypt.c     |   1
-rw-r--r--  drivers/md/dm-raid.c      |   1
-rw-r--r--  drivers/md/md-bitmap.c    | 347
-rw-r--r--  drivers/md/md-bitmap.h    |   1
-rw-r--r--  drivers/md/md-cluster.c   |   8
-rw-r--r--  drivers/md/md-faulty.c    |   2
-rw-r--r--  drivers/md/md-linear.c    |   1
-rw-r--r--  drivers/md/md-multipath.c |   1
-rw-r--r--  drivers/md/md.c           | 228
-rw-r--r--  drivers/md/md.h           |  13
-rw-r--r--  drivers/md/raid0.c        |  98
-rw-r--r--  drivers/md/raid1.c        |  86
-rw-r--r--  drivers/md/raid1.h        |   1
-rw-r--r--  drivers/md/raid10.c       |  85
-rw-r--r--  drivers/md/raid10.h       |   1
-rw-r--r--  drivers/md/raid5-cache.c  |  14
-rw-r--r--  drivers/md/raid5.c        |  72
18 files changed, 508 insertions, 463 deletions
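[Editor's note: the io-accounting refactor ("Refactor md io accounting (Yu)") is the
thread running through the md.c, md.h, raid0/1/10 and raid5 hunks below. Condensed
from those hunks into one sketch — not a drop-in replacement — the idea is that every
personality now funnels through md_account_bio(), which clones the bio from a bioset
front-padded with a struct md_io_clone, so the clone's own allocation carries the
bookkeeping:]

    struct md_io_clone {
            struct mddev    *mddev;
            struct bio      *orig_bio;
            unsigned long   start_time;
            struct bio      bio_clone;      /* must be last: the bioset
                                             * allocates this struct
                                             * around each clone */
    };

    /* md_run() does:
     *     bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
     *                 offsetof(struct md_io_clone, bio_clone), 0);
     * so the clone's ->bi_end_io handler can recover the wrapper with
     *     container_of(clone, struct md_io_clone, bio_clone);
     */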
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b0a22e99bade..2a8b081bce7d 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -15,6 +15,7 @@ if MD config BLK_DEV_MD tristate "RAID support" select BLOCK_HOLDER_DEPRECATED if SYSFS + select BUFFER_HEAD # BLOCK_LEGACY_AUTOLOAD requirement should be removed # after relevant mdadm enhancements - to make "names=yes" # the default - are widely available. @@ -50,6 +51,16 @@ config MD_AUTODETECT If unsure, say Y. +config MD_BITMAP_FILE + bool "MD bitmap file support (deprecated)" + default y + help + If you say Y here, support for write intent bitmaps in files on an + external file system is enabled. This is an alternative to the internal + bitmaps near the MD superblock, and very problematic code that abuses + various kernel APIs and can only work with files on a file system not + actually sitting on the MD device. + config MD_LINEAR tristate "Linear (append) mode (deprecated)" depends on BLK_DEV_MD diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1dc6227d353e..f2662c21a6df 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1160,7 +1160,6 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); - bip->bip_iter.bi_size = tag_len; bip->bip_iter.bi_sector = io->cc->start + io->sector; ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index becdb689190e..5f9991765f27 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3723,7 +3723,6 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 1ff712889a3b..6f9ff14971f9 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -139,29 +139,26 @@ static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page */ /* IO operations when bitmap is stored near all superblocks */ + +/* choose a good rdev and read the page from there */ static int read_sb_page(struct mddev *mddev, loff_t offset, - struct page *page, - unsigned long index, int size) + struct page *page, unsigned long index, int size) { - /* choose a good rdev and read the page from there */ + sector_t sector = mddev->bitmap_info.offset + offset + + index * (PAGE_SIZE / SECTOR_SIZE); struct md_rdev *rdev; - sector_t target; rdev_for_each(rdev, mddev) { - if (! 
test_bit(In_sync, &rdev->flags) - || test_bit(Faulty, &rdev->flags) - || test_bit(Bitmap_sync, &rdev->flags)) - continue; + u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev)); - target = offset + index * (PAGE_SIZE/512); + if (!test_bit(In_sync, &rdev->flags) || + test_bit(Faulty, &rdev->flags) || + test_bit(Bitmap_sync, &rdev->flags)) + continue; - if (sync_page_io(rdev, target, - roundup(size, bdev_logical_block_size(rdev->bdev)), - page, REQ_OP_READ, true)) { - page->index = index; + if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true)) return 0; - } } return -EIO; } @@ -225,18 +222,19 @@ static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size, } static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, - struct page *page) + unsigned long pg_index, struct page *page) { struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; loff_t sboff, offset = mddev->bitmap_info.offset; - sector_t ps, doff; + sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; unsigned int opt_size = PAGE_SIZE; + sector_t doff; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - if (page->index == store->file_pages - 1) { + if (pg_index == store->file_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) @@ -245,7 +243,6 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, opt_size = optimal_io_size(bdev, last_page_size, size); } - ps = page->index * PAGE_SIZE / SECTOR_SIZE; sboff = rdev->sb_start + offset; doff = rdev->data_offset; @@ -279,55 +276,41 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return 0; } -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index, + struct page *page, bool wait) { - struct md_rdev *rdev; struct mddev *mddev = bitmap->mddev; - int ret; do { - rdev = NULL; + struct md_rdev *rdev = NULL; + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - ret = __write_sb_page(rdev, bitmap, page); - if (ret) - return ret; + if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) { + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); + return; + } } } while (wait && md_super_wait(mddev) < 0); - - return 0; } static void md_bitmap_file_kick(struct bitmap *bitmap); -/* - * write out a page to a file - */ -static void write_page(struct bitmap *bitmap, struct page *page, int wait) -{ - struct buffer_head *bh; - - if (bitmap->storage.file == NULL) { - switch (write_sb_page(bitmap, page, wait)) { - case -EINVAL: - set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); - } - } else { - - bh = page_buffers(page); - while (bh && bh->b_blocknr) { - atomic_inc(&bitmap->pending_writes); - set_buffer_locked(bh); - set_buffer_mapped(bh); - submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); - bh = bh->b_this_page; - } +#ifdef CONFIG_MD_BITMAP_FILE +static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct buffer_head *bh = page_buffers(page); - if (wait) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); + while (bh && bh->b_blocknr) { + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + bh = bh->b_this_page; } - if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - md_bitmap_file_kick(bitmap); + + if (wait) + wait_event(bitmap->write_wait, + 
atomic_read(&bitmap->pending_writes) == 0); } static void end_bitmap_write(struct buffer_head *bh, int uptodate) @@ -364,10 +347,8 @@ static void free_buffers(struct page *page) * This usage is similar to how swap files are handled, and allows us * to write to a file with no concerns of memory allocation failing. */ -static int read_page(struct file *file, unsigned long index, - struct bitmap *bitmap, - unsigned long count, - struct page *page) +static int read_file_page(struct file *file, unsigned long index, + struct bitmap *bitmap, unsigned long count, struct page *page) { int ret = 0; struct inode *inode = file_inode(file); @@ -415,7 +396,6 @@ static int read_page(struct file *file, unsigned long index, blk_cur++; bh = bh->b_this_page; } - page->index = index; wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); @@ -429,12 +409,46 @@ out: ret); return ret; } +#else /* CONFIG_MD_BITMAP_FILE */ +static void write_file_page(struct bitmap *bitmap, struct page *page, int wait) +{ +} +static int read_file_page(struct file *file, unsigned long index, + struct bitmap *bitmap, unsigned long count, struct page *page) +{ + return -EIO; +} +static void free_buffers(struct page *page) +{ + put_page(page); +} +#endif /* CONFIG_MD_BITMAP_FILE */ /* * bitmap file superblock operations */ /* + * write out a page to a file + */ +static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index, + bool wait) +{ + struct bitmap_storage *store = &bitmap->storage; + struct page *page = store->filemap[pg_index]; + + if (mddev_is_clustered(bitmap->mddev)) { + pg_index += bitmap->cluster_slot * + DIV_ROUND_UP(store->bytes, PAGE_SIZE); + } + + if (store->file) + write_file_page(bitmap, page, wait); + else + write_sb_page(bitmap, pg_index, page, wait); +} + +/* * md_bitmap_wait_writes() should be called before writing any bitmap * blocks, to ensure previous writes, particularly from * md_bitmap_daemon_work(), have completed. @@ -488,7 +502,12 @@ void md_bitmap_update_sb(struct bitmap *bitmap) sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); kunmap_atomic(sb); - write_page(bitmap, bitmap->storage.sb_page, 1); + + if (bitmap->storage.file) + write_file_page(bitmap, bitmap->storage.sb_page, 1); + else + write_sb_page(bitmap, bitmap->storage.sb_index, + bitmap->storage.sb_page, 1); } EXPORT_SYMBOL(md_bitmap_update_sb); @@ -540,7 +559,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (bitmap->storage.sb_page == NULL) return -ENOMEM; - bitmap->storage.sb_page->index = 0; + bitmap->storage.sb_index = 0; sb = kmap_atomic(bitmap->storage.sb_page); @@ -601,7 +620,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap) unsigned long sectors_reserved = 0; int err = -EINVAL; struct page *sb_page; - loff_t offset = bitmap->mddev->bitmap_info.offset; + loff_t offset = 0; if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { chunksize = 128 * 1024 * 1024; @@ -628,7 +647,7 @@ re_read: bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); /* to 4k blocks */ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); - offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); + offset = bitmap->cluster_slot * (bm_blocks << 3); pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, bitmap->cluster_slot, offset); } @@ -637,13 +656,11 @@ re_read: loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); int bytes = isize > PAGE_SIZE ? 
PAGE_SIZE : isize; - err = read_page(bitmap->storage.file, 0, + err = read_file_page(bitmap->storage.file, 0, bitmap, bytes, sb_page); } else { - err = read_sb_page(bitmap->mddev, - offset, - sb_page, - 0, sizeof(bitmap_super_t)); + err = read_sb_page(bitmap->mddev, offset, sb_page, 0, + sizeof(bitmap_super_t)); } if (err) return err; @@ -819,7 +836,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, if (store->sb_page) { store->filemap[0] = store->sb_page; pnum = 1; - store->sb_page->index = offset; + store->sb_index = offset; } for ( ; pnum < num_pages; pnum++) { @@ -828,7 +845,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, store->file_pages = pnum; return -ENOMEM; } - store->filemap[pnum]->index = pnum + offset; } store->file_pages = pnum; @@ -847,14 +863,10 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store, static void md_bitmap_file_unmap(struct bitmap_storage *store) { - struct page **map, *sb_page; - int pages; - struct file *file; - - file = store->file; - map = store->filemap; - pages = store->file_pages; - sb_page = store->sb_page; + struct file *file = store->file; + struct page *sb_page = store->sb_page; + struct page **map = store->filemap; + int pages = store->file_pages; while (pages--) if (map[pages] != sb_page) /* 0 is sb_page, release it below */ @@ -879,21 +891,13 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store) */ static void md_bitmap_file_kick(struct bitmap *bitmap) { - char *path, *ptr = NULL; - if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { md_bitmap_update_sb(bitmap); if (bitmap->storage.file) { - path = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (path) - ptr = file_path(bitmap->storage.file, - path, PAGE_SIZE); + pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", + bmname(bitmap), bitmap->storage.file); - pr_warn("%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), IS_ERR(ptr) ? 
"" : ptr); - - kfree(path); } else pr_warn("%s: disabling internal bitmap due to errors\n", bmname(bitmap)); @@ -945,6 +949,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) void *kaddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; + unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; if (mddev_is_clustered(bitmap->mddev)) @@ -962,9 +967,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) else set_bit_le(bit, kaddr); kunmap_atomic(kaddr); - pr_debug("set file bit %lu page %lu\n", bit, page->index); + pr_debug("set file bit %lu page %lu\n", bit, index); /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); + set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY); } static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) @@ -974,6 +979,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) void *paddr; unsigned long chunk = block >> bitmap->counts.chunkshift; struct bitmap_storage *store = &bitmap->storage; + unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; if (mddev_is_clustered(bitmap->mddev)) @@ -989,8 +995,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) else clear_bit_le(bit, paddr); kunmap_atomic(paddr); - if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); + if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; } } @@ -1042,7 +1048,7 @@ void md_bitmap_unplug(struct bitmap *bitmap) "md bitmap_unplug"); } clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); - write_page(bitmap, bitmap->storage.filemap[i], 0); + filemap_write_page(bitmap, i, false); writing = 1; } } @@ -1084,33 +1090,31 @@ void md_bitmap_unplug_async(struct bitmap *bitmap) EXPORT_SYMBOL(md_bitmap_unplug_async); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); -/* * bitmap_init_from_disk -- called at bitmap_create time to initialize - * the in-memory bitmap from the on-disk bitmap -- also, sets up the - * memory mapping of the bitmap file - * Special cases: - * if there's no bitmap file, or if the bitmap file had been - * previously kicked from the array, we mark all the bits as - * 1's in order to cause a full resync. + +/* + * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory + * mapping of the bitmap file. + * + * Special case: If there's no bitmap file, or if the bitmap file had been + * previously kicked from the array, we mark all the bits as 1's in order to + * cause a full resync. * * We ignore all bits for sectors that end earlier than 'start'. - * This is used when reading an out-of-date bitmap... + * This is used when reading an out-of-date bitmap. 
*/ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { - unsigned long i, chunks, index, oldindex, bit, node_offset = 0; - struct page *page = NULL; - unsigned long bit_cnt = 0; - struct file *file; - unsigned long offset; - int outofdate; - int ret = -ENOSPC; - void *paddr; + bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags); + struct mddev *mddev = bitmap->mddev; + unsigned long chunks = bitmap->counts.chunks; struct bitmap_storage *store = &bitmap->storage; + struct file *file = store->file; + unsigned long node_offset = 0; + unsigned long bit_cnt = 0; + unsigned long i; + int ret; - chunks = bitmap->counts.chunks; - file = store->file; - - if (!file && !bitmap->mddev->bitmap_info.offset) { + if (!file && !mddev->bitmap_info.offset) { /* No permanent bitmap - fill with '1s'. */ store->filemap = NULL; store->file_pages = 0; @@ -1125,77 +1129,79 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) return 0; } - outofdate = test_bit(BITMAP_STALE, &bitmap->flags); - if (outofdate) - pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); - if (file && i_size_read(file->f_mapping->host) < store->bytes) { pr_warn("%s: bitmap file too short %lu < %lu\n", bmname(bitmap), (unsigned long) i_size_read(file->f_mapping->host), store->bytes); + ret = -ENOSPC; goto err; } - oldindex = ~0L; - offset = 0; - if (!bitmap->mddev->bitmap_info.external) - offset = sizeof(bitmap_super_t); - - if (mddev_is_clustered(bitmap->mddev)) + if (mddev_is_clustered(mddev)) node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); - for (i = 0; i < chunks; i++) { - int b; - index = file_page_index(&bitmap->storage, i); - bit = file_page_offset(&bitmap->storage, i); - if (index != oldindex) { /* this is a new page, read it in */ - int count; - /* unmap the old page, we're done with it */ - if (index == store->file_pages-1) - count = store->bytes - index * PAGE_SIZE; - else - count = PAGE_SIZE; - page = store->filemap[index]; - if (file) - ret = read_page(file, index, bitmap, - count, page); - else - ret = read_sb_page( - bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index + node_offset, count); + for (i = 0; i < store->file_pages; i++) { + struct page *page = store->filemap[i]; + int count; - if (ret) - goto err; + /* unmap the old page, we're done with it */ + if (i == store->file_pages - 1) + count = store->bytes - i * PAGE_SIZE; + else + count = PAGE_SIZE; - oldindex = index; + if (file) + ret = read_file_page(file, i, bitmap, count, page); + else + ret = read_sb_page(mddev, 0, page, i + node_offset, + count); + if (ret) + goto err; + } - if (outofdate) { - /* - * if bitmap is out of date, dirty the - * whole page and write it out - */ - paddr = kmap_atomic(page); - memset(paddr + offset, 0xff, - PAGE_SIZE - offset); - kunmap_atomic(paddr); - write_page(bitmap, page, 1); + if (outofdate) { + pr_warn("%s: bitmap file is out of date, doing full recovery\n", + bmname(bitmap)); + for (i = 0; i < store->file_pages; i++) { + struct page *page = store->filemap[i]; + unsigned long offset = 0; + void *paddr; + + if (i == 0 && !mddev->bitmap_info.external) + offset = sizeof(bitmap_super_t); + + /* + * If the bitmap is out of date, dirty the whole page + * and write it out + */ + paddr = kmap_atomic(page); + memset(paddr + offset, 0xff, PAGE_SIZE - offset); + kunmap_atomic(paddr); + + filemap_write_page(bitmap, i, true); + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { ret = -EIO; - if 
(test_bit(BITMAP_WRITE_ERROR, - &bitmap->flags)) - goto err; + goto err; } } + } + + for (i = 0; i < chunks; i++) { + struct page *page = filemap_get_page(&bitmap->storage, i); + unsigned long bit = file_page_offset(&bitmap->storage, i); + void *paddr; + bool was_set; + paddr = kmap_atomic(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) - b = test_bit(bit, paddr); + was_set = test_bit(bit, paddr); else - b = test_bit_le(bit, paddr); + was_set = test_bit_le(bit, paddr); kunmap_atomic(paddr); - if (b) { + + if (was_set) { /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift >= start); @@ -1204,7 +1210,6 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) needed); bit_cnt++; } - offset = 0; } pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", @@ -1396,9 +1401,8 @@ void md_bitmap_daemon_work(struct mddev *mddev) break; if (bitmap->storage.filemap && test_and_clear_page_attr(bitmap, j, - BITMAP_PAGE_NEEDWRITE)) { - write_page(bitmap, bitmap->storage.filemap[j], 0); - } + BITMAP_PAGE_NEEDWRITE)) + filemap_write_page(bitmap, j, false); } done: @@ -2542,6 +2546,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; + rv = mddev_lock(mddev); + if (rv) + return rv; + /* * Without write mostly device, it doesn't make sense to set * backlog for max_write_behind. @@ -2555,6 +2563,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (!has_write_mostly) { pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n", mdname(mddev)); + mddev_unlock(mddev); return -EINVAL; } @@ -2565,13 +2574,13 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev_destroy_serial_pool(mddev, NULL, false); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ - struct md_rdev *rdev; - rdev_for_each(rdev, mddev) mddev_create_serial_pool(mddev, rdev, false); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); + + mddev_unlock(mddev); return len; } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 8a3788c9bfef..bb9eb418780a 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -201,6 +201,7 @@ struct bitmap { struct file *file; /* backing disk file */ struct page *sb_page; /* cached copy of the bitmap * file superblock */ + unsigned long sb_index; struct page **filemap; /* list of cache pages for * the file */ unsigned long *filemap_attr; /* attributes associated diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 3d9fd74233df..1e26eb223349 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -952,8 +952,8 @@ static int join(struct mddev *mddev, int nodes) return 0; err: set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - md_unregister_thread(&cinfo->recovery_thread); - md_unregister_thread(&cinfo->recv_thread); + md_unregister_thread(mddev, &cinfo->recovery_thread); + md_unregister_thread(mddev, &cinfo->recv_thread); lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); @@ -1015,8 +1015,8 @@ static int leave(struct mddev *mddev) resync_bitmap(mddev); set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - md_unregister_thread(&cinfo->recovery_thread); - md_unregister_thread(&cinfo->recv_thread); + md_unregister_thread(mddev, &cinfo->recovery_thread); + md_unregister_thread(mddev, 
&cinfo->recv_thread); lockres_free(cinfo->message_lockres); lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index 50ad818978a4..a039e8e20f55 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) failit = 1; } } + + md_account_bio(mddev, &bio); if (failit) { struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 4eb72b9dd933..71ac99646827 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio = split; } + md_account_bio(mddev, &bio); bio_set_dev(bio, tmp_dev->rdev->bdev); bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - start_sector + data_offset; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 92c45be203d7..d22276870283 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) && md_flush_request(mddev, bio)) return true; + md_account_bio(mddev, &bio); mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); mp_bh->master_bio = bio; diff --git a/drivers/md/md.c b/drivers/md/md.c index 78be7811a89f..0fe7ab6e8ab9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -453,7 +453,6 @@ void mddev_suspend(struct mddev *mddev) mddev->pers->prepare_suspend(mddev); wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); - mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); @@ -465,14 +464,15 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { - /* entred the memalloc scope from mddev_suspend() */ - memalloc_noio_restore(mddev->noio_flag); lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; + + /* entred the memalloc scope from mddev_suspend() */ + memalloc_noio_restore(mddev->noio_flag); + percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 0); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -643,6 +643,7 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->sync_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); @@ -650,6 +651,7 @@ void mddev_init(struct mddev *mddev) timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); + atomic_set(&mddev->sync_seq, 0); spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); @@ -2304,7 +2306,7 @@ int md_integrity_register(struct mddev *mddev) pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { + bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { /* * No need to handle the failure of bioset_integrity_create, * because the function is called by md_run() -> pers->run(), @@ -4747,6 +4749,62 @@ action_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", type); } 
+static void stop_sync_thread(struct mddev *mddev) +{ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return; + + if (mddev_lock(mddev)) + return; + + /* + * Check again in case MD_RECOVERY_RUNNING is cleared before lock is + * held. + */ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + mddev_unlock(mddev); + return; + } + + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); + + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); + + mddev_unlock(mddev); +} + +static void idle_sync_thread(struct mddev *mddev) +{ + int sync_seq = atomic_read(&mddev->sync_seq); + + mutex_lock(&mddev->sync_mutex); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev); + + wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + + mutex_unlock(&mddev->sync_mutex); +} + +static void frozen_sync_thread(struct mddev *mddev) +{ + mutex_lock(&mddev->sync_mutex); + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev); + + wait_event(resync_wait, mddev->sync_thread == NULL && + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + + mutex_unlock(&mddev->sync_mutex); +} + static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { @@ -4754,35 +4812,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; - if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { - if (cmd_match(page, "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - mddev_lock(mddev) == 0) { - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - if (mddev->sync_thread) { - sector_t save_rp = mddev->reshape_position; - - mddev_unlock(mddev); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); - mddev_lock_nointr(mddev); - /* - * set RECOVERY_INTR again and restore reshape - * position in case others changed them after - * got lock, eg, reshape_position_store and - * md_check_recovery. 
- */ - mddev->reshape_position = save_rp; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } - mddev_unlock(mddev); - } - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (cmd_match(page, "idle")) + idle_sync_thread(mddev); + else if (cmd_match(page, "frozen")) + frozen_sync_thread(mddev); + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; else if (cmd_match(page, "resync")) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -5842,6 +5876,13 @@ int md_run(struct mddev *mddev) goto exit_bio_set; } + if (!bioset_initialized(&mddev->io_clone_set)) { + err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, + offsetof(struct md_io_clone, bio_clone), 0); + if (err) + goto exit_sync_set; + } + spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { @@ -6019,6 +6060,8 @@ bitmap_abort: module_put(pers->owner); md_bitmap_destroy(mddev); abort: + bioset_exit(&mddev->io_clone_set); +exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); @@ -6176,7 +6219,6 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } @@ -6216,7 +6258,7 @@ static void mddev_detach(struct mddev *mddev) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); if (mddev->queue) blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ } @@ -6243,6 +6285,7 @@ static void __md_stop(struct mddev *mddev) percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_clone_set); } void md_stop(struct mddev *mddev) @@ -7012,6 +7055,15 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ + + if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { + pr_warn("%s: bitmap files not supported by this kernel\n", + mdname(mddev)); + return -EINVAL; + } + pr_warn("%s: using deprecated bitmap file support\n", + mdname(mddev)); + f = fget(fd); if (f == NULL) { @@ -7940,9 +7992,10 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } EXPORT_SYMBOL(md_register_thread); -void md_unregister_thread(struct md_thread __rcu **threadp) +void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) { - struct md_thread *thread = rcu_dereference_protected(*threadp, true); + struct md_thread *thread = rcu_dereference_protected(*threadp, + lockdep_is_held(&mddev->reconfig_mutex)); if (!thread) return; @@ -8601,62 +8654,44 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); -int acct_bioset_init(struct mddev *mddev) +static void md_end_clone_io(struct bio *bio) { - int err = 0; - - if (!bioset_initialized(&mddev->io_acct_set)) - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, - offsetof(struct md_io_acct, bio_clone), 0); - return err; -} -EXPORT_SYMBOL_GPL(acct_bioset_init); - -void acct_bioset_exit(struct mddev *mddev) -{ - bioset_exit(&mddev->io_acct_set); -} -EXPORT_SYMBOL_GPL(acct_bioset_exit); - -static void md_end_io_acct(struct bio *bio) -{ - struct md_io_acct *md_io_acct = bio->bi_private; - struct bio *orig_bio = md_io_acct->orig_bio; - struct mddev *mddev 
= md_io_acct->mddev; + struct md_io_clone *md_io_clone = bio->bi_private; + struct bio *orig_bio = md_io_clone->orig_bio; + struct mddev *mddev = md_io_clone->mddev; orig_bio->bi_status = bio->bi_status; - bio_end_io_acct(orig_bio, md_io_acct->start_time); + if (md_io_clone->start_time) + bio_end_io_acct(orig_bio, md_io_clone->start_time); + bio_put(bio); bio_endio(orig_bio); - percpu_ref_put(&mddev->active_io); } -/* - * Used by personalities that don't already clone the bio and thus can't - * easily add the timestamp to their extended bio structure. - */ -void md_account_bio(struct mddev *mddev, struct bio **bio) +static void md_clone_bio(struct mddev *mddev, struct bio **bio) { struct block_device *bdev = (*bio)->bi_bdev; - struct md_io_acct *md_io_acct; - struct bio *clone; - - if (!blk_queue_io_stat(bdev->bd_disk->queue)) - return; + struct md_io_clone *md_io_clone; + struct bio *clone = + bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); + + md_io_clone = container_of(clone, struct md_io_clone, bio_clone); + md_io_clone->orig_bio = *bio; + md_io_clone->mddev = mddev; + if (blk_queue_io_stat(bdev->bd_disk->queue)) + md_io_clone->start_time = bio_start_io_acct(*bio); + + clone->bi_end_io = md_end_clone_io; + clone->bi_private = md_io_clone; + *bio = clone; +} +void md_account_bio(struct mddev *mddev, struct bio **bio) +{ percpu_ref_get(&mddev->active_io); - - clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set); - md_io_acct = container_of(clone, struct md_io_acct, bio_clone); - md_io_acct->orig_bio = *bio; - md_io_acct->start_time = bio_start_io_acct(*bio); - md_io_acct->mddev = mddev; - - clone->bi_end_io = md_end_io_acct; - clone->bi_private = md_io_acct; - *bio = clone; + md_clone_bio(mddev, bio); } EXPORT_SYMBOL_GPL(md_account_bio); @@ -9329,7 +9364,6 @@ void md_check_recovery(struct mddev *mddev) * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9358,17 +9392,24 @@ void md_check_recovery(struct mddev *mddev) if (mddev->sb_flags) md_update_sb(mddev, 0); - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { - /* resync/recovery still happening */ - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - goto unlock; - } - if (mddev->sync_thread) { - md_unregister_thread(&mddev->sync_thread); + /* + * Never start a new sync thread if MD_RECOVERY_RUNNING is + * still set. + */ + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { + /* resync/recovery still happening */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + goto unlock; + } + + if (WARN_ON_ONCE(!mddev->sync_thread)) + goto unlock; + md_reap_sync_thread(mddev); goto unlock; } + /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". 
*/ @@ -9445,7 +9486,10 @@ void md_reap_sync_thread(struct mddev *mddev) sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; - /* sync_thread should be unregistered, collect result */ + /* resync has finished, collect result */ + md_unregister_thread(mddev, &mddev->sync_thread); + atomic_inc(&mddev->sync_seq); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { @@ -9490,7 +9534,6 @@ void md_reap_sync_thread(struct mddev *mddev) if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) md_cluster_ops->update_size(mddev, old_dev_sectors); - wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9498,6 +9541,7 @@ void md_reap_sync_thread(struct mddev *mddev) md_new_event(); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); + wake_up(&resync_wait); } EXPORT_SYMBOL(md_reap_sync_thread); diff --git a/drivers/md/md.h b/drivers/md/md.h index 1aef86bf3fc3..9bcb77bca963 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -510,7 +510,7 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ - struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */ + struct bio_set io_clone_set; /* Generic flush handling. * The last to finish preflush schedules a worker to submit @@ -535,6 +535,11 @@ struct mddev { */ struct list_head deleting; + /* Used to synchronize idle and frozen for action_store() */ + struct mutex sync_mutex; + /* The sequence number for sync thread */ + atomic_t sync_seq; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; @@ -731,7 +736,7 @@ struct md_thread { void *private; }; -struct md_io_acct { +struct md_io_clone { struct mddev *mddev; struct bio *orig_bio; unsigned long start_time; @@ -756,7 +761,7 @@ extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); -extern void md_unregister_thread(struct md_thread __rcu **threadp); +extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp); extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); @@ -769,8 +774,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); -int acct_bioset_init(struct mddev *mddev); -void acct_bioset_exit(struct mddev *mddev); void md_account_bio(struct mddev *mddev, struct bio **bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d1ac73fcd852..c50a7abda744 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -377,7 +377,6 @@ static void raid0_free(struct mddev *mddev, void *priv) struct r0conf *conf = priv; free_conf(mddev, conf); - acct_bioset_exit(mddev); } static int raid0_run(struct mddev *mddev) @@ -392,16 +391,11 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; - if (acct_bioset_init(mddev)) { - pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev)); - return -ENOMEM; - } - /* if private is not null, we are here 
after takeover */ if (mddev->private == NULL) { ret = create_strip_zones(mddev, &conf); if (ret < 0) - goto exit_acct_set; + return ret; mddev->private = conf; } conf = mddev->private; @@ -432,15 +426,9 @@ static int raid0_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) - goto free; + free_conf(mddev, conf); return ret; - -free: - free_conf(mddev, conf); -exit_acct_set: - acct_bioset_exit(mddev); - return ret; } /* @@ -557,54 +545,20 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); } -static bool raid0_make_request(struct mddev *mddev, struct bio *bio) +static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio) { struct r0conf *conf = mddev->private; struct strip_zone *zone; struct md_rdev *tmp_dev; - sector_t bio_sector; - sector_t sector; - sector_t orig_sector; - unsigned chunk_sects; - unsigned sectors; - - if (unlikely(bio->bi_opf & REQ_PREFLUSH) - && md_flush_request(mddev, bio)) - return true; - - if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { - raid0_handle_discard(mddev, bio); - return true; - } - - bio_sector = bio->bi_iter.bi_sector; - sector = bio_sector; - chunk_sects = mddev->chunk_sectors; - - sectors = chunk_sects - - (likely(is_power_of_2(chunk_sects)) - ? (sector & (chunk_sects-1)) - : sector_div(sector, chunk_sects)); - - /* Restore due to sector_div */ - sector = bio_sector; - - if (sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, sectors, GFP_NOIO, - &mddev->bio_set); - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; - } + sector_t bio_sector = bio->bi_iter.bi_sector; + sector_t sector = bio_sector; - if (bio->bi_pool != &mddev->bio_set) - md_account_bio(mddev, &bio); + md_account_bio(mddev, &bio); - orig_sector = sector; zone = find_zone(mddev->private, §or); switch (conf->layout) { case RAID0_ORIG_LAYOUT: - tmp_dev = map_sector(mddev, zone, orig_sector, §or); + tmp_dev = map_sector(mddev, zone, bio_sector, §or); break; case RAID0_ALT_MULTIZONE_LAYOUT: tmp_dev = map_sector(mddev, zone, sector, §or); @@ -612,13 +566,13 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) default: WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); bio_io_error(bio); - return true; + return; } if (unlikely(is_rdev_broken(tmp_dev))) { bio_io_error(bio); md_error(mddev, tmp_dev); - return true; + return; } bio_set_dev(bio, tmp_dev->bdev); @@ -630,6 +584,40 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio_sector); mddev_check_write_zeroes(mddev, bio); submit_bio_noacct(bio); +} + +static bool raid0_make_request(struct mddev *mddev, struct bio *bio) +{ + sector_t sector; + unsigned chunk_sects; + unsigned sectors; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) + return true; + + if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { + raid0_handle_discard(mddev, bio); + return true; + } + + sector = bio->bi_iter.bi_sector; + chunk_sects = mddev->chunk_sectors; + + sectors = chunk_sects - + (likely(is_power_of_2(chunk_sects)) + ? 
(sector & (chunk_sects-1)) + : sector_div(sector, chunk_sects)); + + if (sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, sectors, GFP_NOIO, + &mddev->bio_set); + bio_chain(split, bio); + raid0_map_submit_bio(mddev, bio); + bio = split; + } + + raid0_map_submit_bio(mddev, bio); return true; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index dd25832eb045..4b30a1742162 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio) if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - bio_end_io_acct(bio, r1_bio->start_time); bio_endio(bio); } @@ -313,6 +311,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio) { struct bio *bio = r1_bio->master_bio; struct r1conf *conf = r1_bio->mddev->private; + sector_t sector = r1_bio->sector; /* if nobody has done the final endio yet, do it now */ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { @@ -323,13 +322,13 @@ static void raid_end_bio_io(struct r1bio *r1_bio) call_bio_endio(r1_bio); } + + free_r1bio(r1_bio); /* * Wake up any possible resync thread that waits for the device * to go idle. All I/Os, even write-behind writes, are done. */ - allow_barrier(conf, r1_bio->sector); - - free_r1bio(r1_bio); + allow_barrier(conf, sector); } /* @@ -791,11 +790,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } +static void wake_up_barrier(struct r1conf *conf) +{ + if (wq_has_sleeper(&conf->wait_barrier)) + wake_up(&conf->wait_barrier); +} + static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ raid1_prepare_flush_writes(conf->mddev->bitmap); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; @@ -972,7 +977,7 @@ static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait) * In case freeze_array() is waiting for * get_unqueued_pending() == extra */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); /* Wait for the barrier in same barrier unit bucket to drop. 
*/ /* Return false when nowait flag is set */ @@ -1015,7 +1020,7 @@ static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowa * In case freeze_array() is waiting for * get_unqueued_pending() == extra */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); /* Wait for array to be unfrozen */ /* Return false when nowait flag is set */ @@ -1044,7 +1049,7 @@ static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait) static void _allow_barrier(struct r1conf *conf, int idx) { atomic_dec(&conf->nr_pending[idx]); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static void allow_barrier(struct r1conf *conf, sector_t sector_nr) @@ -1173,7 +1178,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1303,10 +1308,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } r1_bio->read_disk = rdisk; - - if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); - + if (!r1bio_existed) { + md_account_bio(mddev, &bio); + r1_bio->master_bio = bio; + } read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, &mddev->bio_set); @@ -1373,6 +1378,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, return; } + retry_write: r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; @@ -1388,7 +1394,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ disks = conf->raid_disks * 2; - retry_write: blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1468,7 +1473,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); - r1_bio->state = 0; + free_r1bio(r1_bio); allow_barrier(conf, bio->bi_iter.bi_sector); if (bio->bi_opf & REQ_NOWAIT) { @@ -1500,8 +1505,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sectors = max_sectors; } - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); + md_account_bio(mddev, &bio); + r1_bio->master_bio = bio; atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); @@ -1518,8 +1523,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly * is waiting for behind writes to flush */ - if (bitmap && - test_bit(WriteMostly, &rdev->flags) && + if (bitmap && write_behind && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { @@ -1576,7 +1580,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio_write_done(r1_bio); /* In case raid1d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); } static bool raid1_make_request(struct mddev *mddev, struct bio *bio) @@ -1766,7 +1770,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; int err = -EEXIST; - int mirror = 0; + int mirror = 0, repl_slot = -1; struct raid1_info *p; int first = 0; int last = conf->raid_disks - 1; @@ -1809,17 +1813,21 @@ static int raid1_add_disk(struct mddev *mddev, 
struct md_rdev *rdev) break; } if (test_bit(WantReplacement, &p->rdev->flags) && - p[conf->raid_disks].rdev == NULL) { - /* Add this device as a replacement */ - clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = mirror; - err = 0; - conf->fullsync = 1; - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); - break; - } + p[conf->raid_disks].rdev == NULL && repl_slot < 0) + repl_slot = mirror; } + + if (err && repl_slot >= 0) { + /* Add this device as a replacement */ + p = conf->mirrors + repl_slot; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = repl_slot; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + } + print_conf(conf); return err; } @@ -1829,6 +1837,10 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r1conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; + + if (unlikely(number >= conf->raid_disks)) + goto abort; + struct raid1_info *p = conf->mirrors + number; if (rdev != p->rdev) @@ -2299,7 +2311,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, d++; if (d == conf->raid_disks * 2) d = 0; - } while (!success && d != read_disk); + } while (d != read_disk); if (!success) { /* Cannot read from anywhere - mark it bad */ @@ -2498,6 +2510,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) struct mddev *mddev = conf->mddev; struct bio *bio; struct md_rdev *rdev; + sector_t sector; clear_bit(R1BIO_ReadError, &r1_bio->state); /* we got a read error. Maybe the drive is bad. Maybe just @@ -2527,12 +2540,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) } rdev_dec_pending(rdev, conf->mddev); - allow_barrier(conf, r1_bio->sector); + sector = r1_bio->sector; bio = r1_bio->master_bio; /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */ r1_bio->state = 0; raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio); + allow_barrier(conf, sector); } static void raid1d(struct md_thread *thread) @@ -3144,7 +3158,7 @@ static int raid1_run(struct mddev *mddev) * RAID1 needs at least one disk in active */ if (conf->raid_disks - mddev->degraded < 1) { - md_unregister_thread(&conf->thread); + md_unregister_thread(mddev, &conf->thread); ret = -EINVAL; goto abort; } @@ -3171,7 +3185,7 @@ static int raid1_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) { - md_unregister_thread(&mddev->thread); + md_unregister_thread(mddev, &mddev->thread); goto abort; } return 0; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 468f189da7a0..14d4211a123a 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -157,7 +157,6 @@ struct r1bio { sector_t sector; int sectors; unsigned long state; - unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5051149e27bb..023413120851 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (r10_bio->start_time) - bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* * Wake up any possible resync thread that waits for the device @@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, } static void raid10_read_request(struct mddev *mddev, struct bio *bio, - struct r10bio *r10_bio) + struct 
r10bio *r10_bio, bool io_accounting) { struct r10conf *conf = mddev->private; struct bio *read_bio; @@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; - if (!r10_bio->start_time && - blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + if (io_accounting) { + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; + } read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); r10_bio->devs[slot].bio = read_bio; @@ -1322,6 +1321,25 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } +static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, + struct md_rdev **prrdev) +{ + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(mirror->replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(mirror->rdev); + if (rdev == rrdev) + rrdev = NULL; + + *prrdev = rrdev; + return rdev; +} + static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) { int i; @@ -1332,11 +1350,9 @@ retry_wait: blocked_rdev = NULL; rcu_read_lock(); for (i = 0; i < conf->copies; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[i].replacement); - if (rdev == rrdev) - rrdev = NULL; + struct md_rdev *rdev, *rrdev; + + rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1465,15 +1481,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int d = r10_bio->devs[i].devnum; struct md_rdev *rdev, *rrdev; - rrdev = rcu_dereference(conf->mirrors[d].replacement); - /* - * Read replacement first to prevent reading both rdev and - * replacement as NULL during replacement replace rdev. 
- */ - smp_mb(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev == rrdev) - rrdev = NULL; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1543,8 +1551,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->master_bio = bio; } - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + md_account_bio(mddev, &bio); + r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); @@ -1571,12 +1579,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; r10_bio->read_slot = -1; - r10_bio->start_time = 0; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); if (bio_data_dir(bio) == READ) - raid10_read_request(mddev, bio, r10_bio); + raid10_read_request(mddev, bio, r10_bio, true); else raid10_write_request(mddev, bio, r10_bio); } @@ -1780,10 +1787,9 @@ retry_discard: */ rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); + struct md_rdev *rdev, *rrdev; + rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; @@ -2720,10 +2726,10 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) { int sect = 0; /* Offset from r10_bio->sector */ - int sectors = r10_bio->sectors; + int sectors = r10_bio->sectors, slot = r10_bio->read_slot; struct md_rdev *rdev; int max_read_errors = atomic_read(&mddev->max_corr_read_errors); - int d = r10_bio->devs[r10_bio->read_slot].devnum; + int d = r10_bio->devs[slot].devnum; /* still own a reference to this rdev, so it cannot * have been cleared recently. @@ -2744,13 +2750,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 pr_notice("md/raid10:%s: %pg: Failing raid device\n", mdname(mddev), rdev->bdev); md_error(mddev, rdev); - r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; + r10_bio->devs[slot].bio = IO_BLOCKED; return; } while(sectors) { int s = sectors; - int sl = r10_bio->read_slot; + int sl = slot; int success = 0; int start; @@ -2785,7 +2791,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl++; if (sl == conf->copies) sl = 0; - } while (!success && sl != r10_bio->read_slot); + } while (sl != slot); rcu_read_unlock(); if (!success) { @@ -2793,16 +2799,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 * as bad on the first device to discourage future * reads. 
@@ -2793,16 +2799,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			 * as bad on the first device to discourage future
 			 * reads.
 			 */
-			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
+			int dn = r10_bio->devs[slot].devnum;
 			rdev = conf->mirrors[dn].rdev;
 
 			if (!rdev_set_badblocks(
 				    rdev,
-				    r10_bio->devs[r10_bio->read_slot].addr
+				    r10_bio->devs[slot].addr
 				    + sect,
 				    s, 0)) {
 				md_error(mddev, rdev);
-				r10_bio->devs[r10_bio->read_slot].bio
+				r10_bio->devs[slot].bio
 					= IO_BLOCKED;
 			}
 			break;
@@ -2811,7 +2817,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 		start = sl;
 		/* write it back and re-read */
 		rcu_read_lock();
-		while (sl != r10_bio->read_slot) {
+		while (sl != slot) {
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
@@ -2845,7 +2851,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			rcu_read_lock();
 		}
 		sl = start;
-		while (sl != r10_bio->read_slot) {
+		while (sl != slot) {
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
@@ -2985,7 +2991,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 	rdev_dec_pending(rdev, mddev);
 	r10_bio->state = 0;
-	raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
+	raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
 	/*
 	 * allow_barrier after re-submit to ensure no sync io
 	 * can be issued while regular io pending.
@@ -4314,7 +4320,7 @@ static int raid10_run(struct mddev *mddev)
 	return 0;
 
 out_free_conf:
-	md_unregister_thread(&mddev->thread);
+	md_unregister_thread(mddev, &mddev->thread);
 	raid10_free_conf(conf);
 	mddev->private = NULL;
 out:
@@ -4411,7 +4417,6 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 			rdev->new_raid_disk = rdev->raid_disk * 2;
 			rdev->sectors = size;
 		}
-		WRITE_ONCE(conf->barrier, 1);
 	}
 
 	return conf;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 63e48b11b552..2e75e88d0802 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -123,7 +123,6 @@ struct r10bio {
 	sector_t		sector;	/* virtual sector number */
 	int			sectors;
 	unsigned long		state;
-	unsigned long		start_time;
 	struct mddev		*mddev;
 	/*
 	 * original bio going to /dev/mdx
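The raid10 accounting hunks above and this raid10.h field removal are two halves of the same cleanup: io-stat bookkeeping moves out of the personality and into common md code. Instead of each driver recording a start_time, md_account_bio() swaps the caller's bio for an accounting clone whose completion closes the stat window and then completes the original bio. A rough sketch of that shape (struct io_clone and end_clone_io are illustrative names, not the exact md.c implementation):

	struct io_clone {
		struct bio	*orig_bio;	/* bio submitted by the upper layer */
		unsigned long	start_time;	/* returned by bio_start_io_acct() */
		struct bio	bio_clone;	/* must be last for bio_alloc_clone() */
	};

	static void end_clone_io(struct bio *clone)
	{
		struct io_clone *ic = container_of(clone, struct io_clone, bio_clone);

		bio_end_io_acct(ic->orig_bio, ic->start_time);
		bio_endio(ic->orig_bio);
		bio_put(clone);
	}
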
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 47ba7d9e81e1..518b7cfa78b9 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1260,14 +1260,13 @@ static void r5l_log_flush_endio(struct bio *bio)
 	if (bio->bi_status)
 		md_error(log->rdev->mddev, log->rdev);
 
+	bio_uninit(bio);
 	spin_lock_irqsave(&log->io_list_lock, flags);
 	list_for_each_entry(io, &log->flushing_ios, log_sibling)
 		r5l_io_run_stripes(io);
 	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
-
-	bio_uninit(bio);
 }
 
 /*
@@ -3168,12 +3167,15 @@ void r5l_exit_log(struct r5conf *conf)
 {
 	struct r5l_log *log = conf->log;
 
-	/* Ensure disable_writeback_work wakes up and exits */
-	wake_up(&conf->mddev->sb_wait);
-	flush_work(&log->disable_writeback_work);
-	md_unregister_thread(&log->reclaim_thread);
+	md_unregister_thread(conf->mddev, &log->reclaim_thread);
+	/*
+	 * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to
+	 * ensure disable_writeback_work wakes up and exits.
+	 */
 	conf->log = NULL;
+	wake_up(&conf->mddev->sb_wait);
+	flush_work(&log->disable_writeback_work);
 
 	mempool_exit(&log->meta_pool);
 	bioset_exit(&log->bs);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 85b3004594e0..4cb9c608ee19 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
  */
 static void raid5_align_endio(struct bio *bi)
 {
-	struct md_io_acct *md_io_acct = bi->bi_private;
-	struct bio *raid_bi = md_io_acct->orig_bio;
-	struct mddev *mddev;
-	struct r5conf *conf;
-	struct md_rdev *rdev;
+	struct bio *raid_bi = bi->bi_private;
+	struct md_rdev *rdev = (void *)raid_bi->bi_next;
+	struct mddev *mddev = rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	blk_status_t error = bi->bi_status;
-	unsigned long start_time = md_io_acct->start_time;
 
 	bio_put(bi);
-
-	rdev = (void*)raid_bi->bi_next;
 	raid_bi->bi_next = NULL;
-	mddev = rdev->mddev;
-	conf = mddev->private;
-
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error) {
-		if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
-			bio_end_io_acct(raid_bi, start_time);
 		bio_endio(raid_bi);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_quiescent);
@@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
-	struct md_io_acct *md_io_acct;
 	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		return 0;
 	}
 
-	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
-				    &mddev->io_acct_set);
-	md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
+	md_account_bio(mddev, &raid_bio);
 	raid_bio->bi_next = (void *)rdev;
-	if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
-		md_io_acct->start_time = bio_start_io_acct(raid_bio);
-	md_io_acct->orig_bio = raid_bio;
 
+	align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
+				    &mddev->bio_set);
 	align_bio->bi_end_io = raid5_align_endio;
-	align_bio->bi_private = md_io_acct;
+	align_bio->bi_private = raid_bio;
 	align_bio->bi_iter.bi_sector = sector;
 
 	/* No reshape active, so we can trust rdev->data_offset */
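With struct md_io_acct gone, the aligned-read path threads its context through the bio itself: the chosen rdev is parked in the original bio's otherwise-unused bi_next, and the clone's bi_private points back at the original. A compact sketch of the unwind step, mirroring the new raid5_align_endio() above (error and quiesce handling elided):

	static void align_endio_sketch(struct bio *clone)
	{
		struct bio *orig = clone->bi_private;
		struct md_rdev *rdev = (void *)orig->bi_next;

		bio_put(clone);
		orig->bi_next = NULL;	/* undo the stash before completing */
		rdev_dec_pending(rdev, rdev->mddev);
		bio_endio(orig);
	}
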
@@ -7787,19 +7774,12 @@ static int raid5_run(struct mddev *mddev)
 	struct md_rdev *rdev;
 	struct md_rdev *journal_dev = NULL;
 	sector_t reshape_offset = 0;
-	int i, ret = 0;
+	int i;
 	long long min_offset_diff = 0;
 	int first = 1;
 
-	if (acct_bioset_init(mddev)) {
-		pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
+	if (mddev_init_writes_pending(mddev) < 0)
 		return -ENOMEM;
-	}
-
-	if (mddev_init_writes_pending(mddev) < 0) {
-		ret = -ENOMEM;
-		goto exit_acct_set;
-	}
 
 	if (mddev->recovery_cp != MaxSector)
 		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@@ -7830,8 +7810,7 @@ static int raid5_run(struct mddev *mddev)
 	    (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
 		pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
 			  mdname(mddev));
-		ret = -EINVAL;
-		goto exit_acct_set;
+		return -EINVAL;
 	}
 
 	if (mddev->reshape_position != MaxSector) {
@@ -7856,15 +7835,13 @@ static int raid5_run(struct mddev *mddev)
 		if (journal_dev) {
 			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 
 		if (mddev->new_level != mddev->level) {
 			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 		old_disks = mddev->raid_disks - mddev->delta_disks;
 		/* reshape_position must be on a new-stripe boundary, and one
@@ -7880,8 +7857,7 @@ static int raid5_run(struct mddev *mddev)
 		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
 			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 		reshape_offset = here_new * chunk_sectors;
 		/* here_new is the stripe we will write to */
@@ -7903,8 +7879,7 @@ static int raid5_run(struct mddev *mddev)
 		else if (mddev->ro == 0) {
 			pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 	} else if (mddev->reshape_backwards
 	    ? (here_new * chunk_sectors + min_offset_diff <=
@@ -7914,8 +7889,7 @@ static int raid5_run(struct mddev *mddev)
 			/* Reading from the same stripe as writing to - bad */
 			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
 				mdname(mddev));
-			ret = -EINVAL;
-			goto exit_acct_set;
+			return -EINVAL;
 		}
 		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
 		/* OK, we should be able to continue; */
@@ -7939,10 +7913,8 @@ static int raid5_run(struct mddev *mddev)
 	else
 		conf = mddev->private;
 
-	if (IS_ERR(conf)) {
-		ret = PTR_ERR(conf);
-		goto exit_acct_set;
-	}
+	if (IS_ERR(conf))
+		return PTR_ERR(conf);
 
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
 		if (!journal_dev) {
@@ -8135,15 +8107,12 @@ static int raid5_run(struct mddev *mddev)
 	return 0;
 
 abort:
-	md_unregister_thread(&mddev->thread);
+	md_unregister_thread(mddev, &mddev->thread);
 	print_raid5_conf(conf);
 	free_conf(conf);
 	mddev->private = NULL;
 	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
-	ret = -EIO;
-exit_acct_set:
-	acct_bioset_exit(mddev);
-	return ret;
+	return -EIO;
 }
 
 static void raid5_free(struct mddev *mddev, void *priv)
@@ -8151,7 +8120,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
 	struct r5conf *conf = priv;
 
 	free_conf(conf);
-	acct_bioset_exit(mddev);
 	mddev->to_remove = &raid5_attrs_group;
 }
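The raid5_run() churn above is mechanical fallout from deleting the private accounting bioset: with nothing to unwind before 'conf' exists, the goto-based error path collapses into direct returns. Schematically (bad_condition stands in for the various validation checks):

	/* before: every early failure had to release the acct bioset */
	if (bad_condition) {
		ret = -EINVAL;
		goto exit_acct_set;	/* acct_bioset_exit(mddev); return ret; */
	}

	/* after: nothing allocated yet, so fail directly */
	if (bad_condition)
		return -EINVAL;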