author		Jens Axboe <axboe@kernel.dk>	2023-12-19 15:49:23 -0700
committer	Jens Axboe <axboe@kernel.dk>	2023-12-19 15:49:23 -0700
commit		0bd7c5d802586e9d86c09f79cbf5ca8fdbec57ec (patch)
tree		807c626d7453819a1fbc77a43834216a520af517
parent		4c434392c4777881d01beada6701eff8c76b43fe (diff)
parent		415c7451872b0d037760795edd3961eaa63276ea (diff)
Merge tag 'md-next-20231219' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.8/block
Pull MD updates from Song:
"1. Remove deprecated flavors, by Song Liu;
2. raid1 read error check support, by Li Nan;
3. Better handle events off-by-1 case, by Alex Lyakas."
* tag 'md-next-20231219' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md:
md: Remove deprecated CONFIG_MD_FAULTY
md: Remove deprecated CONFIG_MD_MULTIPATH
md: Remove deprecated CONFIG_MD_LINEAR
md/raid1: support read error check
md: factor out a helper exceed_read_errors() to check read_errors
md: When assembling the array, consult the superblock of the freshest device
md/raid1: remove unnecessary null checking
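
The raid1 read error check (item 2 above) works by lifting raid10's existing decay logic into the shared raid1-10.c so both personalities use it: the per-device error count is halved for every hour since the last recorded read error, and the device is failed once the incremented count exceeds max_corr_read_errors. The following is a minimal userspace sketch of just that arithmetic; the names are illustrative stand-ins for the kernel's check_decay_read_errors()/exceed_read_errors(), which operate on struct md_rdev and atomic counters.

#include <stdio.h>

/* Halve the error count once per elapsed hour, mirroring
 * check_decay_read_errors(); capping the shift avoids undefined
 * behaviour when more hours passed than the counter has bits. */
static unsigned int decay_read_errors(unsigned int errors,
				      long seconds_since_last)
{
	unsigned long hours = seconds_since_last / 3600;

	if (hours >= 8 * sizeof(errors))
		return 0;
	return errors >> hours;
}

/* Decay, count the new error, and report whether the threshold was
 * crossed -- the point at which exceed_read_errors() fails the rdev. */
static int exceed_read_errors(unsigned int *errors, long seconds_since_last,
			      unsigned int max_corr_read_errors)
{
	*errors = decay_read_errors(*errors, seconds_since_last) + 1;
	return *errors > max_corr_read_errors;
}

int main(void)
{
	unsigned int errors = 19;

	/* Two quiet hours: 19 decays to 4, then +1 for the new error. */
	if (exceed_read_errors(&errors, 2 * 3600, 20))
		printf("device would be failed at %u errors\n", errors);
	else
		printf("device survives, count now %u\n", errors);
	return 0;
}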
-rw-r--r--  drivers/md/Kconfig              |   34
-rw-r--r--  drivers/md/Makefile             |   10
-rw-r--r--  drivers/md/md-autodetect.c      |    8
-rw-r--r--  drivers/md/md-faulty.c          |  365
-rw-r--r--  drivers/md/md-linear.c          |  318
-rw-r--r--  drivers/md/md-multipath.c       |  463
-rw-r--r--  drivers/md/md.c                 |  239
-rw-r--r--  drivers/md/raid1-10.c           |   54
-rw-r--r--  drivers/md/raid1.c              |   20
-rw-r--r--  drivers/md/raid10.c             |   49
-rw-r--r--  include/uapi/linux/raid/md_p.h  |    8
-rw-r--r--  include/uapi/linux/raid/md_u.h  |   11
12 files changed, 201 insertions(+), 1378 deletions(-)
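
The md.c hunks below carry the behavioural core of the series: in super_1_validate(), the old pattern "++ev1; if (ev1 < mddev->events)" becomes an explicit "if (ev1 + 1 < mddev->events)", and a device that is exactly one event behind has its role looked up in the freshest device's superblock rather than its own, since its own record may still claim In_sync while every other superblock already marked it Faulty. A hedged sketch of just that acceptance rule, with plain integers standing in for the superblock fields:

#include <stdbool.h>
#include <stdio.h>

/* Assembly-time event check: a member is accepted if its event counter
 * is at most 1 behind the freshest device (mdadm applies the same
 * tolerance). Names here are illustrative, not the kernel's. */
static bool accept_member(unsigned long long ev1,
			  unsigned long long freshest_events,
			  bool *use_freshest_role)
{
	if (ev1 + 1 < freshest_events)
		return false;	/* kicked as non-fresh */
	/* Exactly one event behind: this rdev may have been marked
	 * Faulty in the superblocks that got the +1 update, so read
	 * its role from the freshest superblock instead of its own. */
	*use_freshest_role = (ev1 < freshest_events);
	return true;
}

int main(void)
{
	bool from_freshest = false;

	printf("ev 41 vs 42: %s\n",
	       accept_member(41, 42, &from_freshest) ? "accepted" : "kicked");
	printf("  role taken from freshest: %s\n", from_freshest ? "yes" : "no");
	printf("ev 40 vs 42: %s\n",
	       accept_member(40, 42, &from_freshest) ? "accepted" : "kicked");
	return 0;
}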
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2a8b081bce7d..f6dc2acdf1ed 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -61,19 +61,6 @@ config MD_BITMAP_FILE various kernel APIs and can only work with files on a file system not actually sitting on the MD device. -config MD_LINEAR - tristate "Linear (append) mode (deprecated)" - depends on BLK_DEV_MD - help - If you say Y here, then your multiple devices driver will be able to - use the so-called linear mode, i.e. it will combine the hard disk - partitions by simply appending one to the other. - - To compile this as a module, choose M here: the module - will be called linear. - - If unsure, say Y. - config MD_RAID0 tristate "RAID-0 (striping) mode" depends on BLK_DEV_MD @@ -172,27 +159,6 @@ config MD_RAID456 If unsure, say Y. -config MD_MULTIPATH - tristate "Multipath I/O support (deprecated)" - depends on BLK_DEV_MD - help - MD_MULTIPATH provides a simple multi-path personality for use - the MD framework. It is not under active development. New - projects should consider using DM_MULTIPATH which has more - features and more testing. - - If unsure, say N. - -config MD_FAULTY - tristate "Faulty test module for MD (deprecated)" - depends on BLK_DEV_MD - help - The "faulty" module allows for a block device that occasionally returns - read or write errors. It is useful for testing. - - In unsure, say N. - - config MD_CLUSTER tristate "Cluster Support for MD" depends on BLK_DEV_MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 84291e38dca8..027d7cfeca3f 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -29,22 +29,16 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o -linear-y += md-linear.o -multipath-y += md-multipath.o -faulty-y += md-faulty.o # Note: link order is important. All raid personalities -# and must come before md.o, as they each initialise -# themselves, and md.o may use the personalities when it +# and must come before md.o, as they each initialise +# themselves, and md.o may use the personalities when it # auto-initialised. -obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o obj-$(CONFIG_MD_RAID456) += raid456.o -obj-$(CONFIG_MD_MULTIPATH) += multipath.o -obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_MD_CLUSTER) += md-cluster.o obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 4b80165afd23..b2a00f213c2c 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -49,7 +49,6 @@ static int md_setup_ents __initdata; * instead of just one. -- KTK * 18May2000: Added support for persistent-superblock arrays: * md=n,0,factor,fault,device-list uses RAID0 for device n - * md=n,-1,factor,fault,device-list uses LINEAR for device n * md=n,device-list reads a RAID superblock from the devices * elements in device-list are read by name_to_kdev_t so can be * a hex number or something like /dev/hda1 /dev/sdb @@ -88,7 +87,7 @@ static int __init md_setup(char *str) md_setup_ents++; switch (get_option(&str, &level)) { /* RAID level */ case 2: /* could be 0 or -1.. 
*/ - if (level == 0 || level == LEVEL_LINEAR) { + if (level == 0) { if (get_option(&str, &factor) != 2 || /* Chunk Size */ get_option(&str, &fault) != 2) { printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); @@ -96,10 +95,7 @@ static int __init md_setup(char *str) } md_setup_args[ent].level = level; md_setup_args[ent].chunk = 1 << (factor+12); - if (level == LEVEL_LINEAR) - pername = "linear"; - else - pername = "raid0"; + pername = "raid0"; break; } fallthrough; diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c deleted file mode 100644 index a039e8e20f55..000000000000 --- a/drivers/md/md-faulty.c +++ /dev/null @@ -1,365 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * faulty.c : Multiple Devices driver for Linux - * - * Copyright (C) 2004 Neil Brown - * - * fautly-device-simulator personality for md - */ - - -/* - * The "faulty" personality causes some requests to fail. - * - * Possible failure modes are: - * reads fail "randomly" but succeed on retry - * writes fail "randomly" but succeed on retry - * reads for some address fail and then persist until a write - * reads for some address fail and then persist irrespective of write - * writes for some address fail and persist - * all writes fail - * - * Different modes can be active at a time, but only - * one can be set at array creation. Others can be added later. - * A mode can be one-shot or recurrent with the recurrence being - * once in every N requests. - * The bottom 5 bits of the "layout" indicate the mode. The - * remainder indicate a period, or 0 for one-shot. - * - * There is an implementation limit on the number of concurrently - * persisting-faulty blocks. When a new fault is requested that would - * exceed the limit, it is ignored. - * All current faults can be clear using a layout of "0". - * - * Requests are always sent to the device. If they are to fail, - * we clone the bio and insert a new b_end_io into the chain. - */ - -#define WriteTransient 0 -#define ReadTransient 1 -#define WritePersistent 2 -#define ReadPersistent 3 -#define WriteAll 4 /* doesn't go to device */ -#define ReadFixable 5 -#define Modes 6 - -#define ClearErrors 31 -#define ClearFaults 30 - -#define AllPersist 100 /* internal use only */ -#define NoPersist 101 - -#define ModeMask 0x1f -#define ModeShift 5 - -#define MaxFault 50 -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/raid/md_u.h> -#include <linux/slab.h> -#include "md.h" -#include <linux/seq_file.h> - - -static void faulty_fail(struct bio *bio) -{ - struct bio *b = bio->bi_private; - - b->bi_iter.bi_size = bio->bi_iter.bi_size; - b->bi_iter.bi_sector = bio->bi_iter.bi_sector; - - bio_put(bio); - - bio_io_error(b); -} - -struct faulty_conf { - int period[Modes]; - atomic_t counters[Modes]; - sector_t faults[MaxFault]; - int modes[MaxFault]; - int nfaults; - struct md_rdev *rdev; -}; - -static int check_mode(struct faulty_conf *conf, int mode) -{ - if (conf->period[mode] == 0 && - atomic_read(&conf->counters[mode]) <= 0) - return 0; /* no failure, no decrement */ - - - if (atomic_dec_and_test(&conf->counters[mode])) { - if (conf->period[mode]) - atomic_set(&conf->counters[mode], conf->period[mode]); - return 1; - } - return 0; -} - -static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) -{ - /* If we find a ReadFixable sector, we fix it ... */ - int i; - for (i=0; i<conf->nfaults; i++) - if (conf->faults[i] >= start && - conf->faults[i] < end) { - /* found it ... 
*/ - switch (conf->modes[i] * 2 + dir) { - case WritePersistent*2+WRITE: return 1; - case ReadPersistent*2+READ: return 1; - case ReadFixable*2+READ: return 1; - case ReadFixable*2+WRITE: - conf->modes[i] = NoPersist; - return 0; - case AllPersist*2+READ: - case AllPersist*2+WRITE: return 1; - default: - return 0; - } - } - return 0; -} - -static void add_sector(struct faulty_conf *conf, sector_t start, int mode) -{ - int i; - int n = conf->nfaults; - for (i=0; i<conf->nfaults; i++) - if (conf->faults[i] == start) { - switch(mode) { - case NoPersist: conf->modes[i] = mode; return; - case WritePersistent: - if (conf->modes[i] == ReadPersistent || - conf->modes[i] == ReadFixable) - conf->modes[i] = AllPersist; - else - conf->modes[i] = WritePersistent; - return; - case ReadPersistent: - if (conf->modes[i] == WritePersistent) - conf->modes[i] = AllPersist; - else - conf->modes[i] = ReadPersistent; - return; - case ReadFixable: - if (conf->modes[i] == WritePersistent || - conf->modes[i] == ReadPersistent) - conf->modes[i] = AllPersist; - else - conf->modes[i] = ReadFixable; - return; - } - } else if (conf->modes[i] == NoPersist) - n = i; - - if (n >= MaxFault) - return; - conf->faults[n] = start; - conf->modes[n] = mode; - if (conf->nfaults == n) - conf->nfaults = n+1; -} - -static bool faulty_make_request(struct mddev *mddev, struct bio *bio) -{ - struct faulty_conf *conf = mddev->private; - int failit = 0; - - if (bio_data_dir(bio) == WRITE) { - /* write request */ - if (atomic_read(&conf->counters[WriteAll])) { - /* special case - don't decrement, don't submit_bio_noacct, - * just fail immediately - */ - bio_io_error(bio); - return true; - } - - if (check_sector(conf, bio->bi_iter.bi_sector, - bio_end_sector(bio), WRITE)) - failit = 1; - if (check_mode(conf, WritePersistent)) { - add_sector(conf, bio->bi_iter.bi_sector, - WritePersistent); - failit = 1; - } - if (check_mode(conf, WriteTransient)) - failit = 1; - } else { - /* read request */ - if (check_sector(conf, bio->bi_iter.bi_sector, - bio_end_sector(bio), READ)) - failit = 1; - if (check_mode(conf, ReadTransient)) - failit = 1; - if (check_mode(conf, ReadPersistent)) { - add_sector(conf, bio->bi_iter.bi_sector, - ReadPersistent); - failit = 1; - } - if (check_mode(conf, ReadFixable)) { - add_sector(conf, bio->bi_iter.bi_sector, - ReadFixable); - failit = 1; - } - } - - md_account_bio(mddev, &bio); - if (failit) { - struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, - &mddev->bio_set); - - b->bi_private = bio; - b->bi_end_io = faulty_fail; - bio = b; - } else - bio_set_dev(bio, conf->rdev->bdev); - - submit_bio_noacct(bio); - return true; -} - -static void faulty_status(struct seq_file *seq, struct mddev *mddev) -{ - struct faulty_conf *conf = mddev->private; - int n; - - if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) - seq_printf(seq, " WriteTransient=%d(%d)", - n, conf->period[WriteTransient]); - - if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) - seq_printf(seq, " ReadTransient=%d(%d)", - n, conf->period[ReadTransient]); - - if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) - seq_printf(seq, " WritePersistent=%d(%d)", - n, conf->period[WritePersistent]); - - if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) - seq_printf(seq, " ReadPersistent=%d(%d)", - n, conf->period[ReadPersistent]); - - - if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) - seq_printf(seq, " ReadFixable=%d(%d)", - n, conf->period[ReadFixable]); - - if ((n=atomic_read(&conf->counters[WriteAll])) != 0) - 
seq_printf(seq, " WriteAll"); - - seq_printf(seq, " nfaults=%d", conf->nfaults); -} - - -static int faulty_reshape(struct mddev *mddev) -{ - int mode = mddev->new_layout & ModeMask; - int count = mddev->new_layout >> ModeShift; - struct faulty_conf *conf = mddev->private; - - if (mddev->new_layout < 0) - return 0; - - /* new layout */ - if (mode == ClearFaults) - conf->nfaults = 0; - else if (mode == ClearErrors) { - int i; - for (i=0 ; i < Modes ; i++) { - conf->period[i] = 0; - atomic_set(&conf->counters[i], 0); - } - } else if (mode < Modes) { - conf->period[mode] = count; - if (!count) count++; - atomic_set(&conf->counters[mode], count); - } else - return -EINVAL; - mddev->new_layout = -1; - mddev->layout = -1; /* makes sure further changes come through */ - return 0; -} - -static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - WARN_ONCE(raid_disks, - "%s does not support generic reshape\n", __func__); - - if (sectors == 0) - return mddev->dev_sectors; - - return sectors; -} - -static int faulty_run(struct mddev *mddev) -{ - struct md_rdev *rdev; - int i; - struct faulty_conf *conf; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - - conf = kmalloc(sizeof(*conf), GFP_KERNEL); - if (!conf) - return -ENOMEM; - - for (i=0; i<Modes; i++) { - atomic_set(&conf->counters[i], 0); - conf->period[i] = 0; - } - conf->nfaults = 0; - - rdev_for_each(rdev, mddev) { - conf->rdev = rdev; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - } - - md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); - mddev->private = conf; - - faulty_reshape(mddev); - - return 0; -} - -static void faulty_free(struct mddev *mddev, void *priv) -{ - struct faulty_conf *conf = priv; - - kfree(conf); -} - -static struct md_personality faulty_personality = -{ - .name = "faulty", - .level = LEVEL_FAULTY, - .owner = THIS_MODULE, - .make_request = faulty_make_request, - .run = faulty_run, - .free = faulty_free, - .status = faulty_status, - .check_reshape = faulty_reshape, - .size = faulty_size, -}; - -static int __init raid_init(void) -{ - return register_md_personality(&faulty_personality); -} - -static void raid_exit(void) -{ - unregister_md_personality(&faulty_personality); -} - -module_init(raid_init); -module_exit(raid_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Fault injection personality for MD (deprecated)"); -MODULE_ALIAS("md-personality-10"); /* faulty */ -MODULE_ALIAS("md-faulty"); -MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c deleted file mode 100644 index 8eca7693b793..000000000000 --- a/drivers/md/md-linear.c +++ /dev/null @@ -1,318 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - linear.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - <zyngier@ufr-info-p7.ibp.fr> or - <maz@gloups.fdn.fr> - - Linear mode management functions. 
- -*/ - -#include <linux/blkdev.h> -#include <linux/raid/md_u.h> -#include <linux/seq_file.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <trace/events/block.h> -#include "md.h" -#include "md-linear.h" - -/* - * find which device holds a particular offset - */ -static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) -{ - int lo, mid, hi; - struct linear_conf *conf; - - lo = 0; - hi = mddev->raid_disks - 1; - conf = mddev->private; - - /* - * Binary Search - */ - - while (hi > lo) { - - mid = (hi + lo) / 2; - if (sector < conf->disks[mid].end_sector) - hi = mid; - else - lo = mid + 1; - } - - return conf->disks + lo; -} - -static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - struct linear_conf *conf; - sector_t array_sectors; - - conf = mddev->private; - WARN_ONCE(sectors || raid_disks, - "%s does not support generic reshape\n", __func__); - array_sectors = conf->array_sectors; - - return array_sectors; -} - -static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) -{ - struct linear_conf *conf; - struct md_rdev *rdev; - int i, cnt; - - conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL); - if (!conf) - return NULL; - - /* - * conf->raid_disks is copy of mddev->raid_disks. The reason to - * keep a copy of mddev->raid_disks in struct linear_conf is, - * mddev->raid_disks may not be consistent with pointers number of - * conf->disks[] when it is updated in linear_add() and used to - * iterate old conf->disks[] earray in linear_congested(). - * Here conf->raid_disks is always consitent with number of - * pointers in conf->disks[] array, and mddev->private is updated - * with rcu_assign_pointer() in linear_addr(), such race can be - * avoided. - */ - conf->raid_disks = raid_disks; - - cnt = 0; - conf->array_sectors = 0; - - rdev_for_each(rdev, mddev) { - int j = rdev->raid_disk; - struct dev_info *disk = conf->disks + j; - sector_t sectors; - - if (j < 0 || j >= raid_disks || disk->rdev) { - pr_warn("md/linear:%s: disk numbering problem. Aborting!\n", - mdname(mddev)); - goto out; - } - - disk->rdev = rdev; - if (mddev->chunk_sectors) { - sectors = rdev->sectors; - sector_div(sectors, mddev->chunk_sectors); - rdev->sectors = sectors * mddev->chunk_sectors; - } - - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - conf->array_sectors += rdev->sectors; - cnt++; - } - if (cnt != raid_disks) { - pr_warn("md/linear:%s: not enough drives present. Aborting!\n", - mdname(mddev)); - goto out; - } - - /* - * Here we calculate the device offsets. - */ - conf->disks[0].end_sector = conf->disks[0].rdev->sectors; - - for (i = 1; i < raid_disks; i++) - conf->disks[i].end_sector = - conf->disks[i-1].end_sector + - conf->disks[i].rdev->sectors; - - return conf; - -out: - kfree(conf); - return NULL; -} - -static int linear_run (struct mddev *mddev) -{ - struct linear_conf *conf; - int ret; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - conf = linear_conf(mddev, mddev->raid_disks); - - if (!conf) - return 1; - mddev->private = conf; - md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - - ret = md_integrity_register(mddev); - if (ret) { - kfree(conf); - mddev->private = NULL; - } - return ret; -} - -static int linear_add(struct mddev *mddev, struct md_rdev *rdev) -{ - /* Adding a drive to a linear array allows the array to grow. - * It is permitted if the new drive has a matching superblock - * already on it, with raid_disk equal to raid_disks. 
- * It is achieved by creating a new linear_private_data structure - * and swapping it in in-place of the current one. - * The current one is never freed until the array is stopped. - * This avoids races. - */ - struct linear_conf *newconf, *oldconf; - - if (rdev->saved_raid_disk != mddev->raid_disks) - return -EINVAL; - - rdev->raid_disk = rdev->saved_raid_disk; - rdev->saved_raid_disk = -1; - - newconf = linear_conf(mddev,mddev->raid_disks+1); - - if (!newconf) - return -ENOMEM; - - /* newconf->raid_disks already keeps a copy of * the increased - * value of mddev->raid_disks, WARN_ONCE() is just used to make - * sure of this. It is possible that oldconf is still referenced - * in linear_congested(), therefore kfree_rcu() is used to free - * oldconf until no one uses it anymore. - */ - oldconf = rcu_dereference_protected(mddev->private, - lockdep_is_held(&mddev->reconfig_mutex)); - mddev->raid_disks++; - WARN_ONCE(mddev->raid_disks != newconf->raid_disks, - "copied raid_disks doesn't match mddev->raid_disks"); - rcu_assign_pointer(mddev->private, newconf); - md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); - kfree_rcu(oldconf, rcu); - return 0; -} - -static void linear_free(struct mddev *mddev, void *priv) -{ - struct linear_conf *conf = priv; - - kfree(conf); -} - -static bool linear_make_request(struct mddev *mddev, struct bio *bio) -{ - struct dev_info *tmp_dev; - sector_t start_sector, end_sector, data_offset; - sector_t bio_sector = bio->bi_iter.bi_sector; - - if (unlikely(bio->bi_opf & REQ_PREFLUSH) - && md_flush_request(mddev, bio)) - return true; - - tmp_dev = which_dev(mddev, bio_sector); - start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; - end_sector = tmp_dev->end_sector; - data_offset = tmp_dev->rdev->data_offset; - - if (unlikely(bio_sector >= end_sector || - bio_sector < start_sector)) - goto out_of_bounds; - - if (unlikely(is_rdev_broken(tmp_dev->rdev))) { - md_error(mddev, tmp_dev->rdev); - bio_io_error(bio); - return true; - } - - if (unlikely(bio_end_sector(bio) > end_sector)) { - /* This bio crosses a device boundary, so we have to split it */ - struct bio *split = bio_split(bio, end_sector - bio_sector, - GFP_NOIO, &mddev->bio_set); - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; - } - - md_account_bio(mddev, &bio); - bio_set_dev(bio, tmp_dev->rdev->bdev); - bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - - start_sector + data_offset; - - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) { - /* Just ignore it */ - bio_endio(bio); - } else { - if (mddev->gendisk) - trace_block_bio_remap(bio, disk_devt(mddev->gendisk), - bio_sector); - mddev_check_write_zeroes(mddev, bio); - submit_bio_noacct(bio); - } - return true; - -out_of_bounds: - pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n", - mdname(mddev), - (unsigned long long)bio->bi_iter.bi_sector, - tmp_dev->rdev->bdev, - (unsigned long long)tmp_dev->rdev->sectors, - (unsigned long long)start_sector); - bio_io_error(bio); - return true; -} - -static void linear_status (struct seq_file *seq, struct mddev *mddev) -{ - seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); -} - -static void linear_error(struct mddev *mddev, struct md_rdev *rdev) -{ - if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) { - char *md_name = mdname(mddev); - - pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n", - md_name, 
rdev->bdev); - } -} - -static void linear_quiesce(struct mddev *mddev, int state) -{ -} - -static struct md_personality linear_personality = -{ - .name = "linear", - .level = LEVEL_LINEAR, - .owner = THIS_MODULE, - .make_request = linear_make_request, - .run = linear_run, - .free = linear_free, - .status = linear_status, - .hot_add_disk = linear_add, - .size = linear_size, - .quiesce = linear_quiesce, - .error_handler = linear_error, -}; - -static int __init linear_init (void) -{ - return register_md_personality (&linear_personality); -} - -static void linear_exit (void) -{ - unregister_md_personality (&linear_personality); -} - -module_init(linear_init); -module_exit(linear_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)"); -MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ -MODULE_ALIAS("md-linear"); -MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c deleted file mode 100644 index 19c8625ea642..000000000000 --- a/drivers/md/md-multipath.c +++ /dev/null @@ -1,463 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * multipath.c : Multiple Devices driver for Linux - * - * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat - * - * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * - * MULTIPATH management functions. - * - * derived from raid1.c. - */ - -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/raid/md_u.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include "md.h" -#include "md-multipath.h" - -#define MAX_WORK_PER_DISK 128 - -#define NR_RESERVED_BUFS 32 - -static int multipath_map (struct mpconf *conf) -{ - int i, disks = conf->raid_disks; - - /* - * Later we do read balancing on the read side - * now we use the first available disk. - */ - - for (i = 0; i < disks; i++) { - struct md_rdev *rdev = conf->multipaths[i].rdev; - - if (rdev && test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - return i; - } - } - - pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n"); - return (-1); -} - -static void multipath_reschedule_retry (struct multipath_bh *mp_bh) -{ - unsigned long flags; - struct mddev *mddev = mp_bh->mddev; - struct mpconf *conf = mddev->private; - - spin_lock_irqsave(&conf->device_lock, flags); - list_add(&mp_bh->retry_list, &conf->retry_list); - spin_unlock_irqrestore(&conf->device_lock, flags); - md_wakeup_thread(mddev->thread); -} - -/* - * multipath_end_bh_io() is called when we have finished servicing a multipathed - * operation and are ready to return a success/failure code to the buffer - * cache layer. 
- */ -static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status) -{ - struct bio *bio = mp_bh->master_bio; - struct mpconf *conf = mp_bh->mddev->private; - - bio->bi_status = status; - bio_endio(bio); - mempool_free(mp_bh, &conf->pool); -} - -static void multipath_end_request(struct bio *bio) -{ - struct multipath_bh *mp_bh = bio->bi_private; - struct mpconf *conf = mp_bh->mddev->private; - struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; - - if (!bio->bi_status) - multipath_end_bh_io(mp_bh, 0); - else if (!(bio->bi_opf & REQ_RAHEAD)) { - /* - * oops, IO error: - */ - md_error (mp_bh->mddev, rdev); - pr_info("multipath: %pg: rescheduling sector %llu\n", - rdev->bdev, - (unsigned long long)bio->bi_iter.bi_sector); - multipath_reschedule_retry(mp_bh); - } else - multipath_end_bh_io(mp_bh, bio->bi_status); - rdev_dec_pending(rdev, conf->mddev); -} - -static bool multipath_make_request(struct mddev *mddev, struct bio * bio) -{ - struct mpconf *conf = mddev->private; - struct multipath_bh * mp_bh; - struct multipath_info *multipath; - - if (unlikely(bio->bi_opf & REQ_PREFLUSH) - && md_flush_request(mddev, bio)) - return true; - - md_account_bio(mddev, &bio); - mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); - - mp_bh->master_bio = bio; - mp_bh->mddev = mddev; - - mp_bh->path = multipath_map(conf); - if (mp_bh->path < 0) { - bio_io_error(bio); - mempool_free(mp_bh, &conf->pool); - return true; - } - multipath = conf->multipaths + mp_bh->path; - - bio_init_clone(multipath->rdev->bdev, &mp_bh->bio, bio, GFP_NOIO); - - mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; - mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; - mp_bh->bio.bi_end_io = multipath_end_request; - mp_bh->bio.bi_private = mp_bh; - mddev_check_write_zeroes(mddev, &mp_bh->bio); - submit_bio_noacct(&mp_bh->bio); - return true; -} - -static void multipath_status(struct seq_file *seq, struct mddev *mddev) -{ - struct mpconf *conf = mddev->private; - int i; - - lockdep_assert_held(&mddev->lock); - - seq_printf (seq, " [%d/%d] [", conf->raid_disks, - conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = READ_ONCE(conf->multipaths[i].rdev); - - seq_printf(seq, "%s", - rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); - } - seq_putc(seq, ']'); -} - -/* - * Careful, this can execute in IRQ contexts as well! - */ -static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - - if (conf->raid_disks - mddev->degraded <= 1) { - /* - * Uh oh, we can do nothing if this is our last path, but - * first check if this is a queued request for a device - * which has just failed. - */ - pr_warn("multipath: only one IO path left and IO error.\n"); - /* leave it active... 
it's all we have */ - return; - } - /* - * Mark disk as unusable - */ - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - } - set_bit(Faulty, &rdev->flags); - set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - pr_err("multipath: IO failure on %pg, disabling IO path.\n" - "multipath: Operation continuing on %d IO paths.\n", - rdev->bdev, - conf->raid_disks - mddev->degraded); -} - -static void print_multipath_conf(struct mpconf *conf) -{ - int i; - struct multipath_info *tmp; - - pr_debug("MULTIPATH conf printout:\n"); - if (!conf) { - pr_debug("(conf==NULL)\n"); - return; - } - pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); - - lockdep_assert_held(&conf->mddev->reconfig_mutex); - for (i = 0; i < conf->raid_disks; i++) { - tmp = conf->multipaths + i; - if (tmp->rdev) - pr_debug(" disk%d, o:%d, dev:%pg\n", - i,!test_bit(Faulty, &tmp->rdev->flags), - tmp->rdev->bdev); - } -} - -static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - int err = -EEXIST; - int path; - struct multipath_info *p; - int first = 0; - int last = mddev->raid_disks - 1; - - if (rdev->raid_disk >= 0) - first = last = rdev->raid_disk; - - print_multipath_conf(conf); - - for (path = first; path <= last; path++) - if ((p=conf->multipaths+path)->rdev == NULL) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - err = md_integrity_add_rdev(rdev, mddev); - if (err) - break; - spin_lock_irq(&conf->device_lock); - mddev->degraded--; - rdev->raid_disk = path; - set_bit(In_sync, &rdev->flags); - spin_unlock_irq(&conf->device_lock); - WRITE_ONCE(p->rdev, rdev); - err = 0; - break; - } - - print_multipath_conf(conf); - - return err; -} - -static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - int err = 0; - int number = rdev->raid_disk; - struct multipath_info *p = conf->multipaths + number; - - print_multipath_conf(conf); - - if (rdev == p->rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number); - err = -EBUSY; - goto abort; - } - WRITE_ONCE(p->rdev, NULL); - err = md_integrity_register(mddev); - } -abort: - - print_multipath_conf(conf); - return err; -} - -/* - * This is a kernel thread which: - * - * 1. Retries failed read operations on working multipaths. - * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. 
- */ - -static void multipathd(struct md_thread *thread) -{ - struct mddev *mddev = thread->mddev; - struct multipath_bh *mp_bh; - struct bio *bio; - unsigned long flags; - struct mpconf *conf = mddev->private; - struct list_head *head = &conf->retry_list; - - md_check_recovery(mddev); - for (;;) { - spin_lock_irqsave(&conf->device_lock, flags); - if (list_empty(head)) - break; - mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); - list_del(head->prev); - spin_unlock_irqrestore(&conf->device_lock, flags); - - bio = &mp_bh->bio; - bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; - - if ((mp_bh->path = multipath_map (conf))<0) { - pr_err("multipath: %pg: unrecoverable IO read error for block %llu\n", - bio->bi_bdev, - (unsigned long long)bio->bi_iter.bi_sector); - multipath_end_bh_io(mp_bh, BLK_STS_IOERR); - } else { - pr_err("multipath: %pg: redirecting sector %llu to another IO path\n", - bio->bi_bdev, - (unsigned long long)bio->bi_iter.bi_sector); - *bio = *(mp_bh->master_bio); - bio->bi_iter.bi_sector += - conf->multipaths[mp_bh->path].rdev->data_offset; - bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev); - bio->bi_opf |= REQ_FAILFAST_TRANSPORT; - bio->bi_end_io = multipath_end_request; - bio->bi_private = mp_bh; - submit_bio_noacct(bio); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -} - -static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - WARN_ONCE(sectors || raid_disks, - "%s does not support generic reshape\n", __func__); - - return mddev->dev_sectors; -} - -static int multipath_run (struct mddev *mddev) -{ - struct mpconf *conf; - int disk_idx; - struct multipath_info *disk; - struct md_rdev *rdev; - int working_disks; - int ret; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - - if (mddev->level != LEVEL_MULTIPATH) { - pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n", - mdname(mddev), mddev->level); - goto out; - } - /* - * copy the already verified devices into our private MULTIPATH - * bookkeeping area. 
[whatever we allocate in multipath_run(), - * should be freed in multipath_free()] - */ - - conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); - mddev->private = conf; - if (!conf) - goto out; - - conf->multipaths = kcalloc(mddev->raid_disks, - sizeof(struct multipath_info), - GFP_KERNEL); - if (!conf->multipaths) - goto out_free_conf; - - working_disks = 0; - rdev_for_each(rdev, mddev) { - disk_idx = rdev->raid_disk; - if (disk_idx < 0 || - disk_idx >= mddev->raid_disks) - continue; - - disk = conf->multipaths + disk_idx; - disk->rdev = rdev; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - if (!test_bit(Faulty, &rdev->flags)) - working_disks++; - } - - conf->raid_disks = mddev->raid_disks; - conf->mddev = mddev; - spin_lock_init(&conf->device_lock); - INIT_LIST_HEAD(&conf->retry_list); - - if (!working_disks) { - pr_warn("multipath: no operational IO paths for %s\n", - mdname(mddev)); - goto out_free_conf; - } - mddev->degraded = conf->raid_disks - working_disks; - - ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS, - sizeof(struct multipath_bh)); - if (ret) - goto out_free_conf; - - rcu_assign_pointer(mddev->thread, - md_register_thread(multipathd, mddev, "multipath")); - if (!mddev->thread) - goto out_free_conf; - - pr_info("multipath: array %s active with %d out of %d IO paths\n", - mdname(mddev), conf->raid_disks - mddev->degraded, - mddev->raid_disks); - /* - * Ok, everything is just fine now - */ - md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - - if (md_integrity_register(mddev)) - goto out_free_conf; - - return 0; - -out_free_conf: - mempool_exit(&conf->pool); - kfree(conf->multipaths); - kfree(conf); - mddev->private = NULL; -out: - return -EIO; -} - -static void multipath_free(struct mddev *mddev, void *priv) -{ - struct mpconf *conf = priv; - - mempool_exit(&conf->pool); - kfree(conf->multipaths); - kfree(conf); -} - -static struct md_personality multipath_personality = -{ - .name = "multipath", - .level = LEVEL_MULTIPATH, - .owner = THIS_MODULE, - .make_request = multipath_make_request, - .run = multipath_run, - .free = multipath_free, - .status = multipath_status, - .error_handler = multipath_error, - .hot_add_disk = multipath_add_disk, - .hot_remove_disk= multipath_remove_disk, - .size = multipath_size, -}; - -static int __init multipath_init (void) -{ - return register_md_personality (&multipath_personality); -} - -static void __exit multipath_exit (void) -{ - unregister_md_personality (&multipath_personality); -} - -module_init(multipath_init); -module_exit(multipath_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)"); -MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ -MODULE_ALIAS("md-multipath"); -MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e9fe5cbeedc..e351e6c51cc7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1206,6 +1206,7 @@ struct super_type { struct md_rdev *refdev, int minor_version); int (*validate_super)(struct mddev *mddev, + struct md_rdev *freshest, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); @@ -1286,17 +1287,11 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor rdev->sb_size = MD_SB_BYTES; rdev->badblocks.shift = -1; - if (sb->level == LEVEL_MULTIPATH) - rdev->desc_nr = -1; - else - rdev->desc_nr = sb->this_disk.number; - - /* not spare disk, or LEVEL_MULTIPATH */ - if (sb->level == LEVEL_MULTIPATH || - (rdev->desc_nr >= 0 && - 
rdev->desc_nr < MD_SB_DISKS && - sb->disks[rdev->desc_nr].state & - ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) + rdev->desc_nr = sb->this_disk.number; + + /* not spare disk */ + if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && + sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) spare_disk = false; if (!refdev) { @@ -1343,8 +1338,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor /* * validate_super for 0.90.0 + * note: we are not using "freshest" for 0.9 superblock */ -static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { mdp_disk_t *desc; mdp_super_t *sb = page_address(rdev->sb_page); @@ -1442,31 +1438,28 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) return 0; } - if (mddev->level != LEVEL_MULTIPATH) { - desc = sb->disks + rdev->desc_nr; + desc = sb->disks + rdev->desc_nr; - if (desc->state & (1<<MD_DISK_FAULTY)) - set_bit(Faulty, &rdev->flags); - else if (desc->state & (1<<MD_DISK_SYNC) /* && - desc->raid_disk < mddev->raid_disks */) { - set_bit(In_sync, &rdev->flags); + if (desc->state & (1<<MD_DISK_FAULTY)) + set_bit(Faulty, &rdev->flags); + else if (desc->state & (1<<MD_DISK_SYNC)) { + set_bit(In_sync, &rdev->flags); + rdev->raid_disk = desc->raid_disk; + rdev->saved_raid_disk = desc->raid_disk; + } else if (desc->state & (1<<MD_DISK_ACTIVE)) { + /* active but not in sync implies recovery up to + * reshape position. We don't know exactly where + * that is, so set to zero for now + */ + if (mddev->minor_version >= 91) { + rdev->recovery_offset = 0; rdev->raid_disk = desc->raid_disk; - rdev->saved_raid_disk = desc->raid_disk; - } else if (desc->state & (1<<MD_DISK_ACTIVE)) { - /* active but not in sync implies recovery up to - * reshape position. 
We don't know exactly where - * that is, so set to zero for now */ - if (mddev->minor_version >= 91) { - rdev->recovery_offset = 0; - rdev->raid_disk = desc->raid_disk; - } } - if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) - set_bit(WriteMostly, &rdev->flags); - if (desc->state & (1<<MD_DISK_FAILFAST)) - set_bit(FailFast, &rdev->flags); - } else /* MULTIPATH are always insync */ - set_bit(In_sync, &rdev->flags); + } + if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + if (desc->state & (1<<MD_DISK_FAILFAST)) + set_bit(FailFast, &rdev->flags); return 0; } @@ -1756,10 +1749,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; - if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) - rdev->desc_nr = -1; - else - rdev->desc_nr = le32_to_cpu(sb->dev_number); + rdev->desc_nr = le32_to_cpu(sb->dev_number); if (!rdev->bb_page) { rdev->bb_page = alloc_page(GFP_KERNEL); @@ -1812,12 +1802,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sb->level != 0) return -EINVAL; - /* not spare disk, or LEVEL_MULTIPATH */ - if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || - (rdev->desc_nr >= 0 && - rdev->desc_nr < le32_to_cpu(sb->max_dev) && - (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) + /* not spare disk */ + if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) spare_disk = false; if (!refdev) { @@ -1856,10 +1844,11 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return ret; } -static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) +static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); + int role; rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); @@ -1952,13 +1941,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for - * spares (which don't need an event count) */ - ++ev1; + * spares (which don't need an event count). + * Similar to mdadm, we allow event counter difference of 1 + * from the freshest device. 
+ */ if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) - if (ev1 < mddev->events) + if (ev1 + 1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an @@ -1973,58 +1964,85 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0; } - if (mddev->level != LEVEL_MULTIPATH) { - int role; - if (rdev->desc_nr < 0 || - rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { - role = MD_DISK_ROLE_SPARE; - rdev->desc_nr = -1; - } else - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); - switch(role) { - case MD_DISK_ROLE_SPARE: /* spare */ - break; - case MD_DISK_ROLE_FAULTY: /* faulty */ - set_bit(Faulty, &rdev->flags); - break; - case MD_DISK_ROLE_JOURNAL: /* journal device */ - if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { - /* journal device without journal feature */ - pr_warn("md: journal device provided without journal feature, ignoring the device\n"); - return -EINVAL; - } - set_bit(Journal, &rdev->flags); - rdev->journal_tail = le64_to_cpu(sb->journal_tail); - rdev->raid_disk = 0; - break; - default: - rdev->saved_raid_disk = role; - if ((le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_OFFSET)) { - rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); - if (!(le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_BITMAP)) - rdev->saved_raid_disk = -1; - } else { - /* - * If the array is FROZEN, then the device can't - * be in_sync with rest of array. - */ - if (!test_bit(MD_RECOVERY_FROZEN, - &mddev->recovery)) - set_bit(In_sync, &rdev->flags); - } - rdev->raid_disk = role; - break; + + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = MD_DISK_ROLE_SPARE; + rdev->desc_nr = -1; + } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { + /* + * If we are assembling, and our event counter is smaller than the + * highest event counter, we cannot trust our superblock about the role. + * It could happen that our rdev was marked as Faulty, and all other + * superblocks were updated with +1 event counter. + * Then, before the next superblock update, which typically happens when + * remove_and_add_spares() removes the device from the array, there was + * a crash or reboot. + * If we allow current rdev without consulting the freshest superblock, + * we could cause data corruption. + * Note that in this case our event counter is smaller by 1 than the + * highest, otherwise, this rdev would not be allowed into array; + * both kernel and mdadm allow event counter difference of 1. 
+ */ + struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); + u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); + + if (rdev->desc_nr >= freshest_max_dev) { + /* this is unexpected, better not proceed */ + pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", + mdname(mddev), rdev->bdev, rdev->desc_nr, + freshest->bdev, freshest_max_dev); + return -EUCLEAN; } - if (sb->devflags & WriteMostly1) - set_bit(WriteMostly, &rdev->flags); - if (sb->devflags & FailFast1) - set_bit(FailFast, &rdev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) - set_bit(Replacement, &rdev->flags); - } else /* MULTIPATH are always insync */ - set_bit(In_sync, &rdev->flags); + + role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); + pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", + mdname(mddev), rdev->bdev, role, role, freshest->bdev); + } else { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + } + switch (role) { + case MD_DISK_ROLE_SPARE: /* spare */ + break; + case MD_DISK_ROLE_FAULTY: /* faulty */ + set_bit(Faulty, &rdev->flags); + break; + case MD_DISK_ROLE_JOURNAL: /* journal device */ + if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { + /* journal device without journal feature */ + pr_warn("md: journal device provided without journal feature, ignoring the device\n"); + return -EINVAL; + } + set_bit(Journal, &rdev->flags); + rdev->journal_tail = le64_to_cpu(sb->journal_tail); + rdev->raid_disk = 0; + break; + default: + rdev->saved_raid_disk = role; + if ((le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_OFFSET)) { + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + if (!(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_BITMAP)) + rdev->saved_raid_disk = -1; + } else { + /* + * If the array is FROZEN, then the device can't + * be in_sync with rest of array. + */ + if (!test_bit(MD_RECOVERY_FROZEN, + &mddev->recovery)) + set_bit(In_sync, &rdev->flags); + } + rdev->raid_disk = role; + break; + } + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); + if (sb->devflags & FailFast1) + set_bit(FailFast, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); return 0; } @@ -2842,10 +2860,6 @@ rewrite: } else pr_debug("md: %pg (skipping faulty)\n", rdev->bdev); - - if (mddev->level == LEVEL_MULTIPATH) - /* only need to write one superblock... */ - break; } if (md_super_wait(mddev) < 0) goto rewrite; @@ -2887,7 +2901,7 @@ static int add_bound_rdev(struct md_rdev *rdev) * and should be added immediately. */ super_types[mddev->major_version]. - validate_super(mddev, rdev); + validate_super(mddev, NULL/*freshest*/, rdev); err = mddev->pers->hot_add_disk(mddev, rdev); if (err) { md_kick_rdev_from_array(rdev); @@ -3824,7 +3838,7 @@ static int analyze_sbs(struct mddev *mddev) } super_types[mddev->major_version]. - validate_super(mddev, freshest); + validate_super(mddev, NULL/*freshest*/, freshest); i = 0; rdev_for_each_safe(rdev, tmp, mddev) { @@ -3839,20 +3853,15 @@ static int analyze_sbs(struct mddev *mddev) } if (rdev != freshest) { if (super_types[mddev->major_version]. 
- validate_super(mddev, rdev)) { + validate_super(mddev, freshest, rdev)) { pr_warn("md: kicking non-fresh %pg from array!\n", rdev->bdev); md_kick_rdev_from_array(rdev); continue; } } - if (mddev->level == LEVEL_MULTIPATH) { - rdev->desc_nr = i++; - rdev->raid_disk = rdev->desc_nr; - set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= - (mddev->raid_disks - min(0, mddev->delta_disks)) && - !test_bit(Journal, &rdev->flags)) { + if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && + !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } @@ -6847,7 +6856,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. - validate_super(mddev, rdev); + validate_super(mddev, NULL/*freshest*/, rdev); if ((info->state & (1<<MD_DISK_SYNC)) && rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't @@ -8090,7 +8099,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); - if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) + if (mddev->pers->level == 0) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 3f22edec70e7..512746551f36 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -173,3 +173,57 @@ static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) else md_bitmap_unplug(bitmap); } + +/* + * Used by fix_read_error() to decay the per rdev read_errors. + * We halve the read error count for every hour that has elapsed + * since the last recorded read error. + */ +static inline void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) +{ + long cur_time_mon; + unsigned long hours_since_last; + unsigned int read_errors = atomic_read(&rdev->read_errors); + + cur_time_mon = ktime_get_seconds(); + + if (rdev->last_read_error == 0) { + /* first time we've seen a read error */ + rdev->last_read_error = cur_time_mon; + return; + } + + hours_since_last = (long)(cur_time_mon - + rdev->last_read_error) / 3600; + + rdev->last_read_error = cur_time_mon; + + /* + * if hours_since_last is > the number of bits in read_errors + * just set read errors to 0. We do this to avoid + * overflowing the shift of read_errors by hours_since_last. + */ + if (hours_since_last >= 8 * sizeof(read_errors)) + atomic_set(&rdev->read_errors, 0); + else + atomic_set(&rdev->read_errors, read_errors >> hours_since_last); +} + +static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev) +{ + int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + int read_errors; + + check_decay_read_errors(mddev, rdev); + read_errors = atomic_inc_return(&rdev->read_errors); + if (read_errors > max_read_errors) { + pr_notice("md/"RAID_1_10_NAME":%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", + mdname(mddev), rdev->bdev, read_errors, max_read_errors); + pr_notice("md/"RAID_1_10_NAME":%s: %pg: Failing raid device\n", + mdname(mddev), rdev->bdev); + md_error(mddev, rdev); + return true; + } + + return false; +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9348f1709512..aaa434f0c175 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -49,6 +49,7 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #define raid1_log(md, fmt, args...) 
\ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) +#define RAID_1_10_NAME "raid1" #include "raid1-10.c" #define START(node) ((node)->start) @@ -1124,8 +1125,6 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO, &r1_bio->mddev->bio_set); - if (!behind_bio) - return; /* discard op, we don't support writezero/writesame yet */ if (!bio_has_data(bio)) { @@ -2257,16 +2256,24 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) * 3. Performs writes following reads for array synchronising. */ -static void fix_read_error(struct r1conf *conf, int read_disk, - sector_t sect, int sectors) +static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) { + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + int read_disk = r1_bio->read_disk; struct mddev *mddev = conf->mddev; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[read_disk].rdev); + + if (exceed_read_errors(mddev, rdev)) { + r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; + return; + } + while(sectors) { int s = sectors; int d = read_disk; int success = 0; int start; - struct md_rdev *rdev; if (s > (PAGE_SIZE>>9)) s = PAGE_SIZE >> 9; @@ -2507,8 +2514,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) if (mddev->ro == 0 && !test_bit(FailFast, &rdev->flags)) { freeze_array(conf, 1); - fix_read_error(conf, r1_bio->read_disk, - r1_bio->sector, r1_bio->sectors); + fix_read_error(conf, r1_bio); unfreeze_array(conf); } else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) { md_error(mddev, rdev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 375c11d6159f..7412066ea22c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -19,6 +19,8 @@ #include <linux/raid/md_p.h> #include <trace/events/block.h> #include "md.h" + +#define RAID_1_10_NAME "raid10" #include "raid10.h" #include "raid0.h" #include "md-bitmap.h" @@ -2592,42 +2594,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) } } -/* - * Used by fix_read_error() to decay the per rdev read_errors. - * We halve the read error count for every hour that has elapsed - * since the last recorded read error. - * - */ -static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) -{ - long cur_time_mon; - unsigned long hours_since_last; - unsigned int read_errors = atomic_read(&rdev->read_errors); - - cur_time_mon = ktime_get_seconds(); - - if (rdev->last_read_error == 0) { - /* first time we've seen a read error */ - rdev->last_read_error = cur_time_mon; - return; - } - - hours_since_last = (long)(cur_time_mon - - rdev->last_read_error) / 3600; - - rdev->last_read_error = cur_time_mon; - - /* - * if hours_since_last is > the number of bits in read_errors - * just set read errors to 0. We do this to avoid - * overflowing the shift of read_errors by hours_since_last. 
- */ - if (hours_since_last >= 8 * sizeof(read_errors)) - atomic_set(&rdev->read_errors, 0); - else - atomic_set(&rdev->read_errors, read_errors >> hours_since_last); -} - static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, int sectors, struct page *page, enum req_op op) { @@ -2665,7 +2631,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 int sect = 0; /* Offset from r10_bio->sector */ int sectors = r10_bio->sectors, slot = r10_bio->read_slot; struct md_rdev *rdev; - int max_read_errors = atomic_read(&mddev->max_corr_read_errors); int d = r10_bio->devs[slot].devnum; /* still own a reference to this rdev, so it cannot @@ -2678,15 +2643,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 more fix_read_error() attempts */ return; - check_decay_read_errors(mddev, rdev); - atomic_inc(&rdev->read_errors); - if (atomic_read(&rdev->read_errors) > max_read_errors) { - pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", - mdname(mddev), rdev->bdev, - atomic_read(&rdev->read_errors), max_read_errors); - pr_notice("md/raid10:%s: %pg: Failing raid device\n", - mdname(mddev), rdev->bdev); - md_error(mddev, rdev); + if (exceed_read_errors(mddev, rdev)) { r10_bio->devs[slot].bio = IO_BLOCKED; return; } diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 6c0aa577730f..5a43c23f53bf 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -2,15 +2,11 @@ /* md_p.h : physical layout of Linux RAID devices Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef _MD_P_H @@ -237,7 +233,7 @@ struct mdp_superblock_1 { char set_name[32]; /* set and interpreted by user-space */ __le64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ - __le32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ + __le32 level; /* 0,1,4,5 */ __le32 layout; /* only for raid5 and raid10 currently */ __le64 size; /* used size of component devices, in 512byte sectors */ diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 105307244961..7be89a4906e7 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h @@ -2,15 +2,11 @@ /* md_u.h : user <=> kernel API between Linux raidtools and RAID drivers Copyright (C) 1998 Ingo Molnar - + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef _UAPI_MD_U_H @@ -107,11 +103,6 @@ typedef struct mdu_array_info_s { } mdu_array_info_t; -/* non-obvious values for 'level' */ -#define LEVEL_MULTIPATH (-4) -#define LEVEL_LINEAR (-1) -#define LEVEL_FAULTY (-5) - /* we need a value for 'no level specified' and 0 * means 'raid0', so we need something else. 
This is * for internal use only |