diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/btree.c | 242 | ||||
-rw-r--r-- | drivers/md/bcache/btree.h | 84 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 7 | ||||
-rw-r--r-- | drivers/md/bcache/request.h | 3 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 11 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 164 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.h | 19 | ||||
-rw-r--r-- | drivers/md/dm-clone-metadata.c | 15 | ||||
-rw-r--r-- | drivers/md/dm-clone-metadata.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-clone-target.c | 66 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-integrity.c | 304 | ||||
-rw-r--r-- | drivers/md/dm-verity-fec.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-writecache.c | 138 | ||||
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 1 | ||||
-rw-r--r-- | drivers/md/dm.c | 10 | ||||
-rw-r--r-- | drivers/md/md.c | 11 |
18 files changed, 889 insertions, 197 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fa872df4e770..72856e5f23a3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -101,64 +101,6 @@ #define insert_lock(s, b) ((b)->level <= (s)->lock) -/* - * These macros are for recursing down the btree - they handle the details of - * locking and looking up nodes in the cache for you. They're best treated as - * mere syntax when reading code that uses them. - * - * op->lock determines whether we take a read or a write lock at a given depth. - * If you've got a read lock and find that you need a write lock (i.e. you're - * going to have to split), set op->lock and return -EINTR; btree_root() will - * call you again and you'll have the correct lock. - */ - -/** - * btree - recurse down the btree on a specified key - * @fn: function to call, which will be passed the child node - * @key: key to recurse on - * @b: parent btree node - * @op: pointer to struct btree_op - */ -#define btree(fn, key, b, op, ...) \ -({ \ - int _r, l = (b)->level - 1; \ - bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ - _w, b); \ - if (!IS_ERR(_child)) { \ - _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ - rw_unlock(_w, _child); \ - } else \ - _r = PTR_ERR(_child); \ - _r; \ -}) - -/** - * btree_root - call a function on the root of the btree - * @fn: function to call, which will be passed the child node - * @c: cache set - * @op: pointer to struct btree_op - */ -#define btree_root(fn, c, op, ...) \ -({ \ - int _r = -EINTR; \ - do { \ - struct btree *_b = (c)->root; \ - bool _w = insert_lock(op, _b); \ - rw_lock(_w, _b, _b->level); \ - if (_b == (c)->root && \ - _w == insert_lock(op, _b)) { \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - } \ - rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c); \ - if (_r == -EINTR) \ - schedule(); \ - } while (_r == -EINTR); \ - \ - finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ - _r; \ -}) static inline struct bset *write_block(struct btree *b) { @@ -1848,7 +1790,7 @@ static void bch_btree_gc(struct cache_set *c) /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ do { - ret = btree_root(gc_root, c, &op, &writes, &stats); + ret = bcache_btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); cond_resched(); @@ -1946,7 +1888,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) } if (p) - ret = btree(check_recurse, p, b, op); + ret = bcache_btree(check_recurse, p, b, op); p = k; } while (p && !ret); @@ -1955,13 +1897,176 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) return ret; } + +static int bch_btree_check_thread(void *arg) +{ + int ret; + struct btree_check_info *info = arg; + struct btree_check_state *check_state = info->state; + struct cache_set *c = check_state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + int i, n; + + k = p = NULL; + i = n = 0; + cur_idx = prev_idx = 0; + ret = 0; + + /* root node keys are checked before thread created */ + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + while (k) { + /* + * Fetch a root node key index, skip the keys which + * should be fetched by other threads, then check the + * sub-tree indexed by the fetched key. + */ + spin_lock(&check_state->idx_lock); + cur_idx = check_state->key_idx; + check_state->key_idx++; + spin_unlock(&check_state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + /* + * No more keys to check in root node, + * current checking threads are enough, + * stop creating more. + */ + atomic_set(&check_state->enough, 1); + /* Update check_state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + struct btree_op op; + + btree_node_prefetch(c->root, p); + c->gc_stats.nodes++; + bch_btree_op_init(&op, 0); + ret = bcache_btree(check_recurse, p, c->root, &op); + if (ret) + goto out; + } + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + info->result = ret; + /* update check_state->started among all CPUs */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&check_state->started)) + wake_up(&check_state->wait); + + return ret; +} + + + +static int bch_btree_chkthread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_BTR_CHKTHREAD_MAX) + n = BCH_BTR_CHKTHREAD_MAX; + + return n; +} + int bch_btree_check(struct cache_set *c) { - struct btree_op op; + int ret = 0; + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct btree_check_state *check_state; + char name[32]; - bch_btree_op_init(&op, SHRT_MAX); + /* check and mark root node keys */ + for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(c, c->root->level, k); + + bch_initial_mark_key(c, c->root->level + 1, &c->root->key); + + if (c->root->level == 0) + return 0; + + check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL); + if (!check_state) + return -ENOMEM; - return btree_root(check_recurse, c, &op); + check_state->c = c; + check_state->total_threads = bch_btree_chkthread_nr(); + check_state->key_idx = 0; + spin_lock_init(&check_state->idx_lock); + atomic_set(&check_state->started, 0); + atomic_set(&check_state->enough, 0); + init_waitqueue_head(&check_state->wait); + + /* + * Run multiple threads to check btree nodes in parallel, + * if check_state->enough is non-zero, it means current + * running check threads are enough, unncessary to create + * more. + */ + for (i = 0; i < check_state->total_threads; i++) { + /* fetch latest check_state->enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&check_state->enough)) + break; + + check_state->infos[i].result = 0; + check_state->infos[i].state = check_state; + snprintf(name, sizeof(name), "bch_btrchk[%u]", i); + atomic_inc(&check_state->started); + + check_state->infos[i].thread = + kthread_run(bch_btree_check_thread, + &check_state->infos[i], + name); + if (IS_ERR(check_state->infos[i].thread)) { + pr_err("fails to run thread bch_btrchk[%d]", i); + for (--i; i >= 0; i--) + kthread_stop(check_state->infos[i].thread); + ret = -ENOMEM; + goto out; + } + } + + wait_event_interruptible(check_state->wait, + atomic_read(&check_state->started) == 0 || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + for (i = 0; i < check_state->total_threads; i++) { + if (check_state->infos[i].result) { + ret = check_state->infos[i].result; + goto out; + } + } + +out: + kfree(check_state); + return ret; } void bch_initial_gc_finish(struct cache_set *c) @@ -2401,7 +2506,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { - ret = btree(map_nodes_recurse, k, b, + ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); from = NULL; @@ -2419,10 +2524,10 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_nodes_fn *fn, int flags) { - return btree_root(map_nodes_recurse, c, op, from, fn, flags); + return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags); } -static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, struct bkey *from, btree_map_keys_fn *fn, int flags) { @@ -2435,7 +2540,8 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) - : btree(map_keys_recurse, k, b, op, from, fn, flags); + : bcache_btree(map_keys_recurse, k, + b, op, from, fn, flags); from = NULL; if (ret != MAP_CONTINUE) @@ -2452,7 +2558,7 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags) { - return btree_root(map_keys_recurse, c, op, from, fn, flags); + return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags); } /* Keybuf code */ diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index f4dcca449391..257969980c49 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -145,6 +145,9 @@ struct btree { struct bio *bio; }; + + + #define BTREE_FLAG(flag) \ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ @@ -216,6 +219,25 @@ struct btree_op { unsigned int insert_collision:1; }; +struct btree_check_state; +struct btree_check_info { + struct btree_check_state *state; + struct task_struct *thread; + int result; +}; + +#define BCH_BTR_CHKTHREAD_MAX 64 +struct btree_check_state { + struct cache_set *c; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct btree_check_info infos[BCH_BTR_CHKTHREAD_MAX]; +}; + static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) { memset(op, 0, sizeof(struct btree_op)); @@ -284,6 +306,65 @@ static inline void force_wake_up_gc(struct cache_set *c) wake_up_gc(c); } +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define bcache_btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ + _w, b); \ + if (!IS_ERR(_child)) { \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define bcache_btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -EINTR) \ + schedule(); \ + } while (_r == -EINTR); \ + \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ + _r; \ +}) + #define MAP_DONE 0 #define MAP_CONTINUE 1 @@ -314,6 +395,9 @@ typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, struct bkey *k); int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags); +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags); typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 820d8402a1dc..71a90fbec314 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1161,8 +1161,7 @@ static void quit_max_writeback_rate(struct cache_set *c, /* Cached devices - read & write stuff */ -static blk_qc_t cached_dev_make_request(struct request_queue *q, - struct bio *bio) +blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; struct bcache_device *d = bio->bi_disk->private_data; @@ -1266,7 +1265,6 @@ void bch_cached_dev_request_init(struct cached_dev *dc) { struct gendisk *g = dc->disk.disk; - g->queue->make_request_fn = cached_dev_make_request; g->queue->backing_dev_info->congested_fn = cached_dev_congested; dc->disk.cache_miss = cached_dev_cache_miss; dc->disk.ioctl = cached_dev_ioctl; @@ -1301,8 +1299,7 @@ static void flash_dev_nodata(struct closure *cl) continue_at(cl, search_free, NULL); } -static blk_qc_t flash_dev_make_request(struct request_queue *q, - struct bio *bio) +blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; struct closure *cl; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index c64dbd7a91aa..bb005c93dd72 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -37,7 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); +blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio); + void bch_flash_dev_request_init(struct bcache_device *d); +blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio); extern struct kmem_cache *bch_search_cache; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0c3c5419c52b..d98354fa28e3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -816,7 +816,7 @@ static void bcache_device_free(struct bcache_device *d) } static int bcache_device_init(struct bcache_device *d, unsigned int block_size, - sector_t sectors) + sector_t sectors, make_request_fn make_request_fn) { struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, @@ -866,11 +866,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->fops = &bcache_ops; d->disk->private_data = d; - q = blk_alloc_queue(GFP_KERNEL); + q = blk_alloc_queue(make_request_fn, NUMA_NO_NODE); if (!q) return -ENOMEM; - blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; q->backing_dev_info->congested_data = d; @@ -1339,7 +1338,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) q->limits.raid_partial_stripes_expensive; ret = bcache_device_init(&dc->disk, block_size, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + dc->bdev->bd_part->nr_sects - dc->sb.data_offset, + cached_dev_make_request); if (ret) return ret; @@ -1451,7 +1451,8 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); - if (bcache_device_init(d, block_bytes(c), u->sectors)) + if (bcache_device_init(d, block_bytes(c), u->sectors, + flash_dev_make_request)) goto err; bcache_device_attach(d, c, u - c->uuids); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 3470fae4eabc..323276994aab 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -154,7 +154,7 @@ static ssize_t bch_snprint_string_list(char *buf, size_t i; for (i = 0; list[i]; i++) - out += snprintf(out, buf + size - out, + out += scnprintf(out, buf + size - out, i == selected ? "[%s] " : "%s ", list[i]); out[-1] = '\n'; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4a40f9eadeaf..3f7641fb28d5 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -183,7 +183,7 @@ static void update_writeback_rate(struct work_struct *work) */ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); /* * CACHE_SET_IO_DISABLE might be set via sysfs interface, @@ -193,7 +193,7 @@ static void update_writeback_rate(struct work_struct *work) test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); return; } @@ -229,7 +229,7 @@ static void update_writeback_rate(struct work_struct *work) */ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ - smp_mb(); + smp_mb__after_atomic(); } static unsigned int writeback_delay(struct cached_dev *dc, @@ -785,7 +785,9 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct bcache_device *d) +static int bch_root_node_dirty_init(struct cache_set *c, + struct bcache_device *d, + struct bkey *k) { struct sectors_dirty_init op; int ret; @@ -796,8 +798,13 @@ void bch_sectors_dirty_init(struct bcache_device *d) op.start = KEY(op.inode, 0, 0); do { - ret = bch_btree_map_keys(&op.op, d->c, &op.start, - sectors_dirty_init_fn, 0); + ret = bcache_btree(map_keys_recurse, + k, + c->root, + &op.op, + &op.start, + sectors_dirty_init_fn, + 0); if (ret == -EAGAIN) schedule_timeout_interruptible( msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); @@ -806,6 +813,151 @@ void bch_sectors_dirty_init(struct bcache_device *d) break; } } while (ret == -EAGAIN); + + return ret; +} + +static int bch_dirty_init_thread(void *arg) +{ + struct dirty_init_thrd_info *info = arg; + struct bch_dirty_init_state *state = info->state; + struct cache_set *c = state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + int i; + + k = p = NULL; + i = 0; + cur_idx = prev_idx = 0; + + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + + while (k) { + spin_lock(&state->idx_lock); + cur_idx = state->key_idx; + state->key_idx++; + spin_unlock(&state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + atomic_set(&state->enough, 1); + /* Update state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + if (bch_root_node_dirty_init(c, state->d, p) < 0) + goto out; + } + + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + /* In order to wake up state->wait in time */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&state->started)) + wake_up(&state->wait); + + return 0; +} + +static int bch_btre_dirty_init_thread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_DIRTY_INIT_THRD_MAX) + n = BCH_DIRTY_INIT_THRD_MAX; + + return n; +} + +void bch_sectors_dirty_init(struct bcache_device *d) +{ + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct sectors_dirty_init op; + struct cache_set *c = d->c; + struct bch_dirty_init_state *state; + char name[32]; + + /* Just count root keys if no leaf node */ + if (c->root->level == 0) { + bch_btree_op_init(&op.op, -1); + op.inode = d->id; + op.count = 0; + op.start = KEY(op.inode, 0, 0); + + for_each_key_filter(&c->root->keys, + k, &iter, bch_ptr_invalid) + sectors_dirty_init_fn(&op.op, c->root, k); + return; + } + + state = kzalloc(sizeof(struct bch_dirty_init_state), GFP_KERNEL); + if (!state) { + pr_warn("sectors dirty init failed: cannot allocate memory"); + return; + } + + state->c = c; + state->d = d; + state->total_threads = bch_btre_dirty_init_thread_nr(); + state->key_idx = 0; + spin_lock_init(&state->idx_lock); + atomic_set(&state->started, 0); + atomic_set(&state->enough, 0); + init_waitqueue_head(&state->wait); + + for (i = 0; i < state->total_threads; i++) { + /* Fetch latest state->enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&state->enough)) + break; + + state->infos[i].state = state; + atomic_inc(&state->started); + snprintf(name, sizeof(name), "bch_dirty_init[%d]", i); + + state->infos[i].thread = + kthread_run(bch_dirty_init_thread, + &state->infos[i], + name); + if (IS_ERR(state->infos[i].thread)) { + pr_err("fails to run thread bch_dirty_init[%d]", i); + for (--i; i >= 0; i--) + kthread_stop(state->infos[i].thread); + goto out; + } + } + + wait_event_interruptible(state->wait, + atomic_read(&state->started) == 0 || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + +out: + kfree(state); } void bch_cached_dev_writeback_init(struct cached_dev *dc) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 4e4c6810dc3c..b029843ce5b6 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -16,6 +16,7 @@ #define BCH_AUTO_GC_DIRTY_THRESHOLD 50 +#define BCH_DIRTY_INIT_THRD_MAX 64 /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up @@ -23,6 +24,24 @@ */ #define WRITEBACK_SHARE_SHIFT 14 +struct bch_dirty_init_state; +struct dirty_init_thrd_info { + struct bch_dirty_init_state *state; + struct task_struct *thread; +}; + +struct bch_dirty_init_state { + struct cache_set *c; + struct bcache_device *d; + int total_threads; + int key_idx; + spinlock_t idx_lock; + atomic_t started; + atomic_t enough; + wait_queue_head_t wait; + struct dirty_init_thrd_info infos[BCH_DIRTY_INIT_THRD_MAX]; +}; + static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) { uint64_t i, ret = 0; diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index c05b12110456..17712456fa63 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -656,7 +656,7 @@ bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd, return (bit >= (start + nr_regions)); } -unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd) +unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd) { return bitmap_weight(cmd->region_map, cmd->nr_regions); } @@ -850,6 +850,12 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re struct dirty_map *dmap; unsigned long word, flags; + if (unlikely(region_nr >= cmd->nr_regions)) { + DMERR("Region %lu out of range (total number of regions %lu)", + region_nr, cmd->nr_regions); + return -ERANGE; + } + word = region_nr / BITS_PER_LONG; spin_lock_irqsave(&cmd->bitmap_lock, flags); @@ -879,6 +885,13 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, struct dirty_map *dmap; unsigned long word, region_nr; + if (unlikely(start >= cmd->nr_regions || (start + nr_regions) < start || + (start + nr_regions) > cmd->nr_regions)) { + DMERR("Invalid region range: start %lu, nr_regions %lu (total number of regions %lu)", + start, nr_regions, cmd->nr_regions); + return -ERANGE; + } + spin_lock_irq(&cmd->bitmap_lock); if (cmd->read_only) { diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h index 14af1ebd853f..d848b8799c07 100644 --- a/drivers/md/dm-clone-metadata.h +++ b/drivers/md/dm-clone-metadata.h @@ -156,7 +156,7 @@ bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd, /* * Returns the number of hydrated regions. */ -unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd); +unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd); /* * Returns the first unhydrated region with region_nr >= @start diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index d1e1b5b56b1b..5ce96ddf1ce1 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -282,7 +282,7 @@ static bool bio_triggers_commit(struct clone *clone, struct bio *bio) /* Get the address of the region in sectors */ static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr) { - return (region_nr << clone->region_shift); + return ((sector_t)region_nr << clone->region_shift); } /* Get the region number of the bio */ @@ -293,10 +293,17 @@ static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio) /* Get the region range covered by the bio */ static void bio_region_range(struct clone *clone, struct bio *bio, - unsigned long *rs, unsigned long *re) + unsigned long *rs, unsigned long *nr_regions) { + unsigned long end; + *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size); - *re = bio_end_sector(bio) >> clone->region_shift; + end = bio_end_sector(bio) >> clone->region_shift; + + if (*rs >= end) + *nr_regions = 0; + else + *nr_regions = end - *rs; } /* Check whether a bio overwrites a region */ @@ -454,7 +461,7 @@ static void trim_bio(struct bio *bio, sector_t sector, unsigned int len) static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success) { - unsigned long rs, re; + unsigned long rs, nr_regions; /* * If the destination device supports discards, remap and trim the @@ -463,9 +470,9 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ */ if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) { remap_to_dest(clone, bio); - bio_region_range(clone, bio, &rs, &re); - trim_bio(bio, rs << clone->region_shift, - (re - rs) << clone->region_shift); + bio_region_range(clone, bio, &rs, &nr_regions); + trim_bio(bio, region_to_sector(clone, rs), + nr_regions << clone->region_shift); generic_make_request(bio); } else bio_endio(bio); @@ -473,12 +480,21 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ static void process_discard_bio(struct clone *clone, struct bio *bio) { - unsigned long rs, re; + unsigned long rs, nr_regions; - bio_region_range(clone, bio, &rs, &re); - BUG_ON(re > clone->nr_regions); + bio_region_range(clone, bio, &rs, &nr_regions); + if (!nr_regions) { + bio_endio(bio); + return; + } - if (unlikely(rs == re)) { + if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs || + (rs + nr_regions) > clone->nr_regions)) { + DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)", + clone_device_name(clone), rs, nr_regions, + clone->nr_regions, + (unsigned long long)bio->bi_iter.bi_sector, + bio_sectors(bio)); bio_endio(bio); return; } @@ -487,7 +503,7 @@ static void process_discard_bio(struct clone *clone, struct bio *bio) * The covered regions are already hydrated so we just need to pass * down the discard. */ - if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) { + if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) { complete_discard_bio(clone, bio, true); return; } @@ -788,11 +804,14 @@ static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr struct dm_io_region from, to; struct clone *clone = hd->clone; + if (WARN_ON(!nr_regions)) + return; + region_size = clone->region_size; region_start = hd->region_nr; region_end = region_start + nr_regions - 1; - total_size = (nr_regions - 1) << clone->region_shift; + total_size = region_to_sector(clone, nr_regions - 1); if (region_end == clone->nr_regions - 1) { /* @@ -1169,7 +1188,7 @@ static void process_deferred_discards(struct clone *clone) int r = -EPERM; struct bio *bio; struct blk_plug plug; - unsigned long rs, re; + unsigned long rs, nr_regions; struct bio_list discards = BIO_EMPTY_LIST; spin_lock_irq(&clone->lock); @@ -1185,14 +1204,13 @@ static void process_deferred_discards(struct clone *clone) /* Update the metadata */ bio_list_for_each(bio, &discards) { - bio_region_range(clone, bio, &rs, &re); + bio_region_range(clone, bio, &rs, &nr_regions); /* * A discard request might cover regions that have been already * hydrated. There is no need to update the metadata for these * regions. */ - r = dm_clone_cond_set_range(clone->cmd, rs, re - rs); - + r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions); if (unlikely(r)) break; } @@ -1455,7 +1473,7 @@ static void clone_status(struct dm_target *ti, status_type_t type, goto error; } - DMEMIT("%u %llu/%llu %llu %lu/%lu %u ", + DMEMIT("%u %llu/%llu %llu %u/%lu %u ", DM_CLONE_METADATA_BLOCK_SIZE, (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks), (unsigned long long)nr_metadata_blocks, @@ -1775,6 +1793,7 @@ error: static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int r; + sector_t nr_regions; struct clone *clone; struct dm_arg_set as; @@ -1816,7 +1835,16 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto out_with_source_dev; clone->region_shift = __ffs(clone->region_size); - clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size); + nr_regions = dm_sector_div_up(ti->len, clone->region_size); + + /* Check for overflow */ + if (nr_regions != (unsigned long)nr_regions) { + ti->error = "Too many regions. Consider increasing the region size"; + r = -EOVERFLOW; + goto out_with_source_dev; + } + + clone->nr_regions = nr_regions; r = validate_nr_regions(clone->nr_regions, &ti->error); if (r) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index c6a529873d0f..3df90daba89e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -230,6 +230,8 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, struct scatterlist *sg); +static bool crypt_integrity_aead(struct crypt_config *cc); + /* * Use this to access cipher attributes that are independent of the key. */ @@ -346,7 +348,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, unsigned bs; int log; - if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) + if (crypt_integrity_aead(cc)) bs = crypto_aead_blocksize(any_tfm_aead(cc)); else bs = crypto_skcipher_blocksize(any_tfm(cc)); @@ -712,7 +714,7 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) { + if (crypt_integrity_aead(cc)) { ti->error = "AEAD transforms not supported for EBOIV"; return -EINVAL; } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 2f03fecd312d..b989d109d55d 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -39,6 +39,7 @@ #define RECALC_WRITE_SUPER 16 #define BITMAP_BLOCK_SIZE 4096 /* don't change it */ #define BITMAP_FLUSH_INTERVAL (10 * HZ) +#define DISCARD_FILLER 0xf6 /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -257,6 +258,7 @@ struct dm_integrity_c { bool just_formatted; bool recalculate_flag; bool fix_padding; + bool discard; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; @@ -284,7 +286,7 @@ struct dm_integrity_io { struct work_struct work; struct dm_integrity_c *ic; - bool write; + enum req_opf op; bool fua; struct dm_integrity_range range; @@ -510,8 +512,8 @@ static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) { DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)", - (unsigned long long)sector, - (unsigned long long)n_sectors, + sector, + n_sectors, ic->sb->log2_sectors_per_block, ic->log2_blocks_per_bitmap_bit, mode); @@ -1299,6 +1301,11 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, unsigned *metadata_offset, unsigned total_size, int op) { +#define MAY_BE_FILLER 1 +#define MAY_BE_HASH 2 + unsigned hash_offset = 0; + unsigned may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + do { unsigned char *data, *dp; struct dm_buffer *b; @@ -1320,18 +1327,35 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se } else if (op == TAG_WRITE) { memcpy(dp, tag, to_copy); dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); - } else { + } else { /* e.g.: op == TAG_CMP */ - if (unlikely(memcmp(dp, tag, to_copy))) { - unsigned i; - for (i = 0; i < to_copy; i++) { - if (dp[i] != tag[i]) - break; - total_size--; + if (likely(is_power_of_2(ic->tag_size))) { + if (unlikely(memcmp(dp, tag, to_copy))) + if (unlikely(!ic->discard) || + unlikely(!memchr_inv(dp, DISCARD_FILLER, to_copy))) { + goto thorough_test; + } + } else { + unsigned i, ts; +thorough_test: + ts = total_size; + + for (i = 0; i < to_copy; i++, ts--) { + if (unlikely(dp[i] != tag[i])) + may_be &= ~MAY_BE_HASH; + if (likely(dp[i] != DISCARD_FILLER)) + may_be &= ~MAY_BE_FILLER; + hash_offset++; + if (unlikely(hash_offset == ic->tag_size)) { + if (unlikely(!may_be)) { + dm_bufio_release(b); + return ts; + } + hash_offset = 0; + may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + } } - dm_bufio_release(b); - return total_size; } } dm_bufio_release(b); @@ -1342,10 +1366,17 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se (*metadata_block)++; *metadata_offset = 0; } + + if (unlikely(!is_power_of_2(ic->tag_size))) { + hash_offset = (hash_offset + to_copy) % ic->tag_size; + } + total_size -= to_copy; } while (unlikely(total_size)); return 0; +#undef MAY_BE_FILLER +#undef MAY_BE_HASH } static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) @@ -1428,7 +1459,7 @@ static void dec_in_flight(struct dm_integrity_io *dio) remove_range(ic, &dio->range); - if (unlikely(dio->write)) + if (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD)) schedule_autocommit(ic); bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); @@ -1519,15 +1550,20 @@ static void integrity_metadata(struct work_struct *w) struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; - char checksums_onstack[HASH_MAX_DIGESTSIZE]; - unsigned sectors_to_process = dio->range.n_sectors; - sector_t sector = dio->range.logical_sector; + char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + sector_t sector; + unsigned sectors_to_process; + sector_t save_metadata_block; + unsigned save_metadata_offset; if (unlikely(ic->mode == 'R')) goto skip_io; - checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space, - GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + if (likely(dio->op != REQ_OP_DISCARD)) + checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + else + checksums = kmalloc(PAGE_SIZE, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); if (!checksums) { checksums = checksums_onstack; if (WARN_ON(extra_space && @@ -1537,6 +1573,43 @@ static void integrity_metadata(struct work_struct *w) } } + if (unlikely(dio->op == REQ_OP_DISCARD)) { + sector_t bi_sector = dio->bio_details.bi_iter.bi_sector; + unsigned bi_size = dio->bio_details.bi_iter.bi_size; + unsigned max_size = likely(checksums != checksums_onstack) ? PAGE_SIZE : HASH_MAX_DIGESTSIZE; + unsigned max_blocks = max_size / ic->tag_size; + memset(checksums, DISCARD_FILLER, max_size); + + while (bi_size) { + unsigned this_step_blocks = bi_size >> (SECTOR_SHIFT + ic->sb->log2_sectors_per_block); + this_step_blocks = min(this_step_blocks, max_blocks); + r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, + this_step_blocks * ic->tag_size, TAG_WRITE); + if (unlikely(r)) { + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto error; + } + + /*if (bi_size < this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block)) { + printk("BUGG: bi_sector: %llx, bi_size: %u\n", bi_sector, bi_size); + printk("BUGG: this_step_blocks: %u\n", this_step_blocks); + BUG(); + }*/ + bi_size -= this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block); + bi_sector += this_step_blocks << ic->sb->log2_sectors_per_block; + } + + if (likely(checksums != checksums_onstack)) + kfree(checksums); + goto skip_io; + } + + save_metadata_block = dio->metadata_block; + save_metadata_offset = dio->metadata_offset; + sector = dio->range.logical_sector; + sectors_to_process = dio->range.n_sectors; + __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { unsigned pos; char *mem, *checksums_ptr; @@ -1555,11 +1628,12 @@ again: kunmap_atomic(mem); r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, - checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE); + checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE); if (unlikely(r)) { if (r > 0) { - DMERR_LIMIT("Checksum failed at sector 0x%llx", - (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size))); + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: Checksum failed at sector 0x%llx", bio_devname(bio, b), + (sector - ((r + ic->tag_size - 1) / ic->tag_size))); r = -EILSEQ; atomic64_inc(&ic->number_of_mismatches); } @@ -1598,7 +1672,7 @@ again: tag = lowmem_page_address(biv.bv_page) + biv.bv_offset; this_len = min(biv.bv_len, data_to_process); r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, - this_len, !dio->write ? TAG_READ : TAG_WRITE); + this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE); if (unlikely(r)) goto error; data_to_process -= this_len; @@ -1625,6 +1699,20 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) dio->ic = ic; dio->bi_status = 0; + dio->op = bio_op(bio); + + if (unlikely(dio->op == REQ_OP_DISCARD)) { + if (ti->max_io_len) { + sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector); + unsigned log2_max_io_len = __fls(ti->max_io_len); + sector_t start_boundary = sec >> log2_max_io_len; + sector_t end_boundary = (sec + bio_sectors(bio) - 1) >> log2_max_io_len; + if (start_boundary < end_boundary) { + sector_t len = ti->max_io_len - (sec & (ti->max_io_len - 1)); + dm_accept_partial_bio(bio, len); + } + } + } if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { submit_flush_bio(ic, dio); @@ -1632,8 +1720,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) } dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); - dio->write = bio_op(bio) == REQ_OP_WRITE; - dio->fua = dio->write && bio->bi_opf & REQ_FUA; + dio->fua = dio->op == REQ_OP_WRITE && bio->bi_opf & REQ_FUA; if (unlikely(dio->fua)) { /* * Don't pass down the FUA flag because we have to flush @@ -1643,18 +1730,18 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) } if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) { DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx", - (unsigned long long)dio->range.logical_sector, bio_sectors(bio), - (unsigned long long)ic->provided_data_sectors); + dio->range.logical_sector, bio_sectors(bio), + ic->provided_data_sectors); return DM_MAPIO_KILL; } if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) { DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x", ic->sectors_per_block, - (unsigned long long)dio->range.logical_sector, bio_sectors(bio)); + dio->range.logical_sector, bio_sectors(bio)); return DM_MAPIO_KILL; } - if (ic->sectors_per_block > 1) { + if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) { struct bvec_iter iter; struct bio_vec bv; bio_for_each_segment(bv, bio, iter) { @@ -1687,7 +1774,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) } } - if (unlikely(ic->mode == 'R') && unlikely(dio->write)) + if (unlikely(ic->mode == 'R') && unlikely(dio->op != REQ_OP_READ)) return DM_MAPIO_KILL; get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); @@ -1717,13 +1804,13 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio, bio_advance_iter(bio, &bio->bi_iter, bv.bv_len); retry_kmap: mem = kmap_atomic(bv.bv_page); - if (likely(dio->write)) + if (likely(dio->op == REQ_OP_WRITE)) flush_dcache_page(bv.bv_page); do { struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry); - if (unlikely(!dio->write)) { + if (unlikely(dio->op == REQ_OP_READ)) { struct journal_sector *js; char *mem_ptr; unsigned s; @@ -1748,12 +1835,12 @@ retry_kmap: } while (++s < ic->sectors_per_block); #ifdef INTERNAL_VERIFY if (ic->internal_hash) { - char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", - (unsigned long long)logical_sector); + logical_sector); } } #endif @@ -1770,7 +1857,7 @@ retry_kmap: char *tag_addr; BUG_ON(PageHighMem(biv.bv_page)); tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset; - if (likely(dio->write)) + if (likely(dio->op == REQ_OP_WRITE)) memcpy(tag_ptr, tag_addr, tag_now); else memcpy(tag_addr, tag_ptr, tag_now); @@ -1778,12 +1865,12 @@ retry_kmap: tag_ptr += tag_now; tag_todo -= tag_now; } while (unlikely(tag_todo)); else { - if (likely(dio->write)) + if (likely(dio->op == REQ_OP_WRITE)) memset(tag_ptr, 0, tag_todo); } } - if (likely(dio->write)) { + if (likely(dio->op == REQ_OP_WRITE)) { struct journal_sector *js; unsigned s; @@ -1819,12 +1906,12 @@ retry_kmap: bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT; } while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT); - if (unlikely(!dio->write)) + if (unlikely(dio->op == REQ_OP_READ)) flush_dcache_page(bv.bv_page); kunmap_atomic(mem); } while (n_sectors); - if (likely(dio->write)) { + if (likely(dio->op == REQ_OP_WRITE)) { smp_mb(); if (unlikely(waitqueue_active(&ic->copy_to_journal_wait))) wake_up(&ic->copy_to_journal_wait); @@ -1856,7 +1943,10 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map unsigned journal_section, journal_entry; unsigned journal_read_pos; struct completion read_comp; - bool need_sync_io = ic->internal_hash && !dio->write; + bool discard_retried = false; + bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ; + if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D') + need_sync_io = true; if (need_sync_io && from_map) { INIT_WORK(&dio->work, integrity_bio_wait); @@ -1874,8 +1964,8 @@ retry: } dio->range.n_sectors = bio_sectors(bio); journal_read_pos = NOT_FOUND; - if (likely(ic->mode == 'J')) { - if (dio->write) { + if (ic->mode == 'J' && likely(dio->op != REQ_OP_DISCARD)) { + if (dio->op == REQ_OP_WRITE) { unsigned next_entry, i, pos; unsigned ws, we, range_sectors; @@ -1970,6 +2060,21 @@ offload_to_thread: } } } + if (ic->mode == 'J' && likely(dio->op == REQ_OP_DISCARD) && !discard_retried) { + sector_t next_sector; + unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (unlikely(new_pos != NOT_FOUND) || + unlikely(next_sector < dio->range.logical_sector - dio->range.n_sectors)) { + remove_range_unlocked(ic, &dio->range); + spin_unlock_irq(&ic->endio_wait.lock); + queue_work(ic->commit_wq, &ic->commit_work); + flush_workqueue(ic->commit_wq); + queue_work(ic->writer_wq, &ic->writer_work); + flush_workqueue(ic->writer_wq); + discard_retried = true; + goto lock_retry; + } + } spin_unlock_irq(&ic->endio_wait.lock); if (unlikely(journal_read_pos != NOT_FOUND)) { @@ -1978,7 +2083,7 @@ offload_to_thread: goto journal_read_write; } - if (ic->mode == 'B' && dio->write) { + if (ic->mode == 'B' && (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))) { if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { struct bitmap_block_status *bbs; @@ -2007,6 +2112,18 @@ offload_to_thread: bio->bi_end_io = integrity_end_io; bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; + if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) { + integrity_metadata(&dio->work); + dm_integrity_flush_buffers(ic); + + dio->in_flight = (atomic_t)ATOMIC_INIT(1); + dio->completion = NULL; + + generic_make_request(bio); + + return; + } + generic_make_request(bio); if (need_sync_io) { @@ -2193,6 +2310,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, sec &= ~(sector_t)(ic->sectors_per_block - 1); } } + if (unlikely(sec >= ic->provided_data_sectors)) + continue; get_area_and_offset(ic, sec, &area, &offset); restore_last_bytes(ic, access_journal_data(ic, i, j), je); for (k = j + 1; k < ic->journal_section_entries; k++) { @@ -2202,6 +2321,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, break; BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay); sec2 = journal_entry_get_sector(je2); + if (unlikely(sec2 >= ic->provided_data_sectors)) + break; get_area_and_offset(ic, sec2, &area2, &offset2); if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block)) break; @@ -2404,7 +2525,7 @@ next_chunk: get_area_and_offset(ic, logical_sector, &area, &offset); } - DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors); + DEBUG_print("recalculating: %llx, %llx\n", logical_sector, n_sectors); if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { recalc_write_super(ic); @@ -2828,9 +2949,29 @@ static void dm_integrity_postsuspend(struct dm_target *ti) static void dm_integrity_resume(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + __u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); int r; + DEBUG_print("resume\n"); + if (ic->provided_data_sectors != old_provided_data_sectors) { + if (ic->provided_data_sectors > old_provided_data_sectors && + ic->mode == 'B' && + ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { + rw_journal_sectors(ic, REQ_OP_READ, 0, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + block_bitmap_op(ic, ic->journal, old_provided_data_sectors, + ic->provided_data_sectors - old_provided_data_sectors, BITMAP_OP_SET); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + } + + ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors); + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + } + if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) { DEBUG_print("resume dirty_bitmap\n"); rw_journal_sectors(ic, REQ_OP_READ, 0, 0, @@ -2898,7 +3039,7 @@ static void dm_integrity_resume(struct dm_target *ti) DEBUG_print("testing recalc: %x\n", ic->sb->flags); if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); - DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors); + DEBUG_print("recalc pos: %llx / %llx\n", recalc_pos, ic->provided_data_sectors); if (recalc_pos < ic->provided_data_sectors) { queue_work(ic->recalc_wq, &ic->recalc_work); } else if (recalc_pos > ic->provided_data_sectors) { @@ -2928,10 +3069,10 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: DMEMIT("%llu %llu", - (unsigned long long)atomic64_read(&ic->number_of_mismatches), - (unsigned long long)ic->provided_data_sectors); + atomic64_read(&ic->number_of_mismatches), + ic->provided_data_sectors); if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) - DMEMIT(" %llu", (unsigned long long)le64_to_cpu(ic->sb->recalc_sector)); + DMEMIT(" %llu", le64_to_cpu(ic->sb->recalc_sector)); else DMEMIT(" -"); break; @@ -2944,6 +3085,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); + arg_count += ic->discard; arg_count += ic->mode == 'J'; arg_count += ic->mode == 'J'; arg_count += ic->mode == 'B'; @@ -2952,7 +3094,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0; - DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, + DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start, ic->tag_size, ic->mode, arg_count); if (ic->meta_dev) DMEMIT(" meta_device:%s", ic->meta_dev->name); @@ -2960,6 +3102,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) DMEMIT(" recalculate"); + if (ic->discard) + DMEMIT(" allow_discards"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); @@ -2968,7 +3112,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" commit_time:%u", ic->autocommit_msec); } if (ic->mode == 'B') { - DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); + DMEMIT(" sectors_per_bit:%llu", (sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); } if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) @@ -3073,6 +3217,24 @@ static int calculate_device_limits(struct dm_integrity_c *ic) return 0; } +static void get_provided_data_sectors(struct dm_integrity_c *ic) +{ + if (!ic->meta_dev) { + int test_bit; + ic->provided_data_sectors = 0; + for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) { + __u64 prev_data_sectors = ic->provided_data_sectors; + + ic->provided_data_sectors |= (sector_t)1 << test_bit; + if (calculate_device_limits(ic)) + ic->provided_data_sectors = prev_data_sectors; + } + } else { + ic->provided_data_sectors = ic->data_device_sectors; + ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1); + } +} + static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors) { unsigned journal_sections; @@ -3100,20 +3262,15 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors); - ic->provided_data_sectors = 0; - for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) { - __u64 prev_data_sectors = ic->provided_data_sectors; - - ic->provided_data_sectors |= (sector_t)1 << test_bit; - if (calculate_device_limits(ic)) - ic->provided_data_sectors = prev_data_sectors; - } + get_provided_data_sectors(ic); if (!ic->provided_data_sectors) return -EINVAL; } else { ic->sb->log2_interleave_sectors = 0; - ic->provided_data_sectors = ic->data_device_sectors; - ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1); + + get_provided_data_sectors(ic); + if (!ic->provided_data_sectors) + return -EINVAL; try_smaller_buffer: ic->sb->journal_sections = cpu_to_le32(0); @@ -3733,6 +3890,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } else if (!strcmp(opt_string, "recalculate")) { ic->recalculate_flag = true; + } else if (!strcmp(opt_string, "allow_discards")) { + ic->discard = true; } else if (!strcmp(opt_string, "fix_padding")) { ic->fix_padding = true; } else { @@ -3791,6 +3950,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + if (ic->discard && !ic->internal_hash) { + r = -EINVAL; + ti->error = "Discard can be only used with internal hash"; + goto bad; + } + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); ic->autocommit_msec = sync_msec; timer_setup(&ic->autocommit_timer, autocommit_fn, 0); @@ -3920,16 +4085,16 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } } - ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors); - if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) { - /* test for overflow */ + if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) { r = -EINVAL; - ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors"; + ti->error = "Journal mac mismatch"; goto bad; } - if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) { + + get_provided_data_sectors(ic); + if (!ic->provided_data_sectors) { r = -EINVAL; - ti->error = "Journal mac mismatch"; + ti->error = "The device is too small"; goto bad; } @@ -3994,10 +4159,9 @@ try_smaller_buffer: DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); - DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, - (unsigned long long)ic->provided_data_sectors); + DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", ic->provided_data_sectors, ic->provided_data_sectors); DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); - DEBUG_print(" bits_in_journal %llu\n", (unsigned long long)bits_in_journal); + DEBUG_print(" bits_in_journal %llu\n", bits_in_journal); if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); @@ -4121,6 +4285,8 @@ try_smaller_buffer: ti->num_flush_bios = 1; ti->flush_supported = true; + if (ic->discard) + ti->num_discard_bios = 1; return 0; @@ -4202,7 +4368,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 3ceeb6b404ed..49147e634046 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -551,6 +551,7 @@ void verity_fec_dtr(struct dm_verity *v) mempool_exit(&f->rs_pool); mempool_exit(&f->prealloc_pool); mempool_exit(&f->extra_pool); + mempool_exit(&f->output_pool); kmem_cache_destroy(f->cache); if (f->data_bufio) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index a09bdc000e64..114927da9cc9 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -26,6 +26,8 @@ #define AUTOCOMMIT_BLOCKS_SSD 65536 #define AUTOCOMMIT_BLOCKS_PMEM 64 #define AUTOCOMMIT_MSEC 1000 +#define MAX_AGE_DIV 16 +#define MAX_AGE_UNSPECIFIED -1UL #define BITMAP_GRANULARITY 65536 #if BITMAP_GRANULARITY < PAGE_SIZE @@ -88,6 +90,7 @@ struct wc_entry { :47 #endif ; + unsigned long age; #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS uint64_t original_sector; uint64_t seq_count; @@ -119,6 +122,7 @@ struct dm_writecache { size_t writeback_size; size_t freelist_high_watermark; size_t freelist_low_watermark; + unsigned long max_age; unsigned uncommitted_blocks; unsigned autocommit_blocks; @@ -130,6 +134,8 @@ struct dm_writecache { struct timer_list autocommit_timer; struct wait_queue_head freelist_wait; + struct timer_list max_age_timer; + atomic_t bio_in_progress[2]; struct wait_queue_head bio_in_progress_wait[2]; @@ -160,6 +166,7 @@ struct dm_writecache { bool autocommit_time_set:1; bool writeback_fua_set:1; bool flush_on_suspend:1; + bool cleaner:1; unsigned writeback_all; struct workqueue_struct *writeback_wq; @@ -502,6 +509,34 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); } +static void ssd_commit_superblock(struct dm_writecache *wc) +{ + int r; + struct dm_io_region region; + struct dm_io_request req; + + region.bdev = wc->ssd_dev->bdev; + region.sector = 0; + region.count = PAGE_SIZE; + + if (unlikely(region.sector + region.count > wc->metadata_sectors)) + region.count = wc->metadata_sectors - region.sector; + + region.sector += wc->start_sector; + + req.bi_op = REQ_OP_WRITE; + req.bi_op_flags = REQ_SYNC | REQ_FUA; + req.mem.type = DM_IO_VMA; + req.mem.ptr.vma = (char *)wc->memory_map; + req.client = wc->dm_io; + req.notify.fn = NULL; + req.notify.context = NULL; + + r = dm_io(&req, 1, ®ion, NULL); + if (unlikely(r)) + writecache_error(wc, r, "error writing superblock"); +} + static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { if (WC_MODE_PMEM(wc)) @@ -596,6 +631,7 @@ static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *i rb_link_node(&ins->rb_node, parent, node); rb_insert_color(&ins->rb_node, &wc->tree); list_add(&ins->lru, &wc->lru); + ins->age = jiffies; } static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) @@ -631,6 +667,16 @@ static inline void writecache_verify_watermark(struct dm_writecache *wc) queue_work(wc->writeback_wq, &wc->writeback_work); } +static void writecache_max_age_timer(struct timer_list *t) +{ + struct dm_writecache *wc = from_timer(wc, t, max_age_timer); + + if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { + queue_work(wc->writeback_wq, &wc->writeback_work); + mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); + } +} + static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) { struct wc_entry *e; @@ -741,8 +787,10 @@ static void writecache_flush(struct dm_writecache *wc) wc->seq_count++; pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); - writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count); - writecache_commit_flushed(wc, false); + if (WC_MODE_PMEM(wc)) + writecache_commit_flushed(wc, false); + else + ssd_commit_superblock(wc); wc->overwrote_committed = false; @@ -837,6 +885,7 @@ static void writecache_suspend(struct dm_target *ti) bool flush_on_suspend; del_timer_sync(&wc->autocommit_timer); + del_timer_sync(&wc->max_age_timer); wc_lock(wc); writecache_flush(wc); @@ -876,6 +925,7 @@ static int writecache_alloc_entries(struct dm_writecache *wc) struct wc_entry *e = &wc->entries[b]; e->index = b; e->write_in_progress = false; + cond_resched(); } return 0; @@ -930,6 +980,7 @@ static void writecache_resume(struct dm_target *ti) e->original_sector = le64_to_cpu(wme.original_sector); e->seq_count = le64_to_cpu(wme.seq_count); } + cond_resched(); } #endif for (b = 0; b < wc->n_blocks; b++) { @@ -973,6 +1024,9 @@ erase_this: writecache_verify_watermark(wc); + if (wc->max_age != MAX_AGE_UNSPECIFIED) + mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); + wc_unlock(wc); } @@ -1021,6 +1075,28 @@ static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_w return 0; } +static void activate_cleaner(struct dm_writecache *wc) +{ + wc->flush_on_suspend = true; + wc->cleaner = true; + wc->freelist_high_watermark = wc->n_blocks; + wc->freelist_low_watermark = wc->n_blocks; +} + +static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc) +{ + if (argc != 1) + return -EINVAL; + + wc_lock(wc); + activate_cleaner(wc); + if (!dm_suspended(wc->ti)) + writecache_verify_watermark(wc); + wc_unlock(wc); + + return 0; +} + static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, char *result, unsigned maxlen) { @@ -1031,6 +1107,8 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, r = process_flush_mesg(argc, argv, wc); else if (!strcasecmp(argv[0], "flush_on_suspend")) r = process_flush_on_suspend_mesg(argc, argv, wc); + else if (!strcasecmp(argv[0], "cleaner")) + r = process_cleaner_mesg(argc, argv, wc); else DMERR("unrecognised message received: %s", argv[0]); @@ -1194,6 +1272,7 @@ read_next_block: } } else { do { + bool found_entry = false; if (writecache_has_error(wc)) goto unlock_error; e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); @@ -1204,9 +1283,25 @@ read_next_block: wc->overwrote_committed = true; goto bio_copy; } + found_entry = true; + } else { + if (unlikely(wc->cleaner)) + goto direct_write; } e = writecache_pop_from_freelist(wc, (sector_t)-1); if (unlikely(!e)) { + if (!found_entry) { +direct_write: + e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); + if (e) { + sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector; + BUG_ON(!next_boundary); + if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) { + dm_accept_partial_bio(bio, next_boundary); + } + } + goto unlock_remap_origin; + } writecache_wait_on_freelist(wc); continue; } @@ -1619,7 +1714,9 @@ restart: wbl.size = 0; while (!list_empty(&wc->lru) && (wc->writeback_all || - wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) { + wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark || + (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >= + wc->max_age - wc->max_age / MAX_AGE_DIV))) { n_walked++; if (unlikely(n_walked > WRITEBACK_LATENCY) && @@ -1791,8 +1888,10 @@ static int init_memory(struct dm_writecache *wc) pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); - for (b = 0; b < wc->n_blocks; b++) + for (b = 0; b < wc->n_blocks; b++) { write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); + cond_resched(); + } writecache_flush_all_metadata(wc); writecache_commit_flushed(wc, false); @@ -1882,9 +1981,11 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) wc->ti = ti; mutex_init(&wc->lock); + wc->max_age = MAX_AGE_UNSPECIFIED; writecache_poison_lists(wc); init_waitqueue_head(&wc->freelist_wait); timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0); + timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0); for (i = 0; i < 2; i++) { atomic_set(&wc->bio_in_progress[i], 0); @@ -2058,6 +2159,16 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto invalid_optional; wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); wc->autocommit_time_set = true; + } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { + unsigned max_age_msecs; + string = dm_shift_arg(&as), opt_params--; + if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1) + goto invalid_optional; + if (max_age_msecs > 86400000) + goto invalid_optional; + wc->max_age = msecs_to_jiffies(max_age_msecs); + } else if (!strcasecmp(string, "cleaner")) { + wc->cleaner = true; } else if (!strcasecmp(string, "fua")) { if (WC_MODE_PMEM(wc)) { wc->writeback_fua = true; @@ -2235,6 +2346,9 @@ overflow: do_div(x, 100); wc->freelist_low_watermark = x; + if (wc->cleaner) + activate_cleaner(wc); + r = writecache_alloc_entries(wc); if (r) { ti->error = "Cannot allocate memory"; @@ -2278,9 +2392,9 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args = 0; if (wc->start_sector) extra_args += 2; - if (wc->high_wm_percent_set) + if (wc->high_wm_percent_set && !wc->cleaner) extra_args += 2; - if (wc->low_wm_percent_set) + if (wc->low_wm_percent_set && !wc->cleaner) extra_args += 2; if (wc->max_writeback_jobs_set) extra_args += 2; @@ -2288,19 +2402,21 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args += 2; if (wc->autocommit_time_set) extra_args += 2; + if (wc->cleaner) + extra_args++; if (wc->writeback_fua_set) extra_args++; DMEMIT("%u", extra_args); if (wc->start_sector) DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); - if (wc->high_wm_percent_set) { + if (wc->high_wm_percent_set && !wc->cleaner) { x = (uint64_t)wc->freelist_high_watermark * 100; x += wc->n_blocks / 2; do_div(x, (size_t)wc->n_blocks); DMEMIT(" high_watermark %u", 100 - (unsigned)x); } - if (wc->low_wm_percent_set) { + if (wc->low_wm_percent_set && !wc->cleaner) { x = (uint64_t)wc->freelist_low_watermark * 100; x += wc->n_blocks / 2; do_div(x, (size_t)wc->n_blocks); @@ -2312,6 +2428,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); if (wc->autocommit_time_set) DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies)); + if (wc->max_age != MAX_AGE_UNSPECIFIED) + DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age)); + if (wc->cleaner) + DMEMIT(" cleaner"); if (wc->writeback_fua_set) DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); break; @@ -2320,7 +2440,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, static struct target_type writecache_target = { .name = "writecache", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 516c7b671d25..369de15c4e80 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1109,7 +1109,6 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) switch (blkz->type) { case BLK_ZONE_TYPE_CONVENTIONAL: set_bit(DMZ_RND, &zone->flags); - zmd->nr_rnd_zones++; break; case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0413018c8305..753302e83910 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -25,6 +25,7 @@ #include <linux/wait.h> #include <linux/pr.h> #include <linux/refcount.h> +#include <linux/part_stat.h> #define DM_MSG_PREFIX "core" @@ -1938,16 +1939,15 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); - if (!md->queue) - goto bad; - md->queue->queuedata = md; /* * default to bio-based required ->make_request_fn until DM * table is loaded and md->type established. If request-based * table is loaded: blk-mq will override accordingly. */ - blk_queue_make_request(md->queue, dm_make_request); + md->queue = blk_alloc_queue(dm_make_request, numa_node_id); + if (!md->queue) + goto bad; + md->queue->queuedata = md; md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) diff --git a/drivers/md/md.c b/drivers/md/md.c index 469f551863be..271e8a587354 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -58,8 +58,10 @@ #include <linux/delay.h> #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> +#include <linux/raid/detect.h> #include <linux/slab.h> #include <linux/percpu-refcount.h> +#include <linux/part_stat.h> #include <trace/events/block.h> #include "md.h" @@ -2491,12 +2493,12 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, shared ? (struct md_rdev *)lock_rdev : rdev); if (IS_ERR(bdev)) { - pr_warn("md: could not open %s.\n", __bdevname(dev, b)); + pr_warn("md: could not open device unknown-block(%u,%u).\n", + MAJOR(dev), MINOR(dev)); return PTR_ERR(bdev); } rdev->bdev = bdev; @@ -5621,12 +5623,11 @@ static int md_alloc(dev_t dev, char *name) mddev->hold_active = UNTIL_STOP; error = -ENOMEM; - mddev->queue = blk_alloc_queue(GFP_KERNEL); + mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE); if (!mddev->queue) goto abort; mddev->queue->queuedata = mddev; - blk_queue_make_request(mddev->queue, md_make_request); blk_set_stacking_limits(&mddev->queue->limits); disk = alloc_disk(1 << shift); @@ -6184,7 +6185,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { md_bitmap_wait_behind_writes(mddev); - if (mddev->pers && mddev->pers->quiesce) { + if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } |