Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r-- | drivers/md/dm.c | 556
1 file changed, 321 insertions(+), 235 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 39081338ca61..d62f1354ecbf 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -71,6 +71,10 @@ void dm_issue_global_event(void)
 	wake_up(&dm_global_eventq);
 }
 
+DEFINE_STATIC_KEY_FALSE(stats_enabled);
+DEFINE_STATIC_KEY_FALSE(swap_bios_enabled);
+DEFINE_STATIC_KEY_FALSE(zoned_enabled);
+
 /*
  * One of these is allocated (on-stack) per original bio.
  */
@@ -80,7 +84,8 @@ struct clone_info {
 	struct dm_io *io;
 	sector_t sector;
 	unsigned sector_count;
-	bool submit_as_polled;
+	bool is_abnormal_io:1;
+	bool submit_as_polled:1;
 };
 
 #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
@@ -498,69 +503,76 @@ static bool bio_is_flush_with_data(struct bio *bio)
 	return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
 }
 
-static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
-		       unsigned long start_time, struct dm_stats_aux *stats_aux)
+static void dm_io_acct(struct dm_io *io, bool end)
 {
-	bool is_flush_with_data;
-	unsigned int bi_size;
+	struct dm_stats_aux *stats_aux = &io->stats_aux;
+	unsigned long start_time = io->start_time;
+	struct mapped_device *md = io->md;
+	struct bio *bio = io->orig_bio;
+	unsigned int sectors;
 
-	/* If REQ_PREFLUSH set save any payload but do not account it */
-	is_flush_with_data = bio_is_flush_with_data(bio);
-	if (is_flush_with_data) {
-		bi_size = bio->bi_iter.bi_size;
-		bio->bi_iter.bi_size = 0;
-	}
+	/*
+	 * If REQ_PREFLUSH set, don't account payload, it will be
+	 * submitted (and accounted) after this flush completes.
	 */
+	if (bio_is_flush_with_data(bio))
+		sectors = 0;
+	else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
+		sectors = bio_sectors(bio);
+	else
+		sectors = io->sectors;
 
 	if (!end)
-		bio_start_io_acct_time(bio, start_time);
+		bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
+				   start_time);
 	else
-		bio_end_io_acct(bio, start_time);
+		bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
+
+	if (static_branch_unlikely(&stats_enabled) &&
+	    unlikely(dm_stats_used(&md->stats))) {
+		sector_t sector;
+
+		if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
+			sector = bio->bi_iter.bi_sector;
+		else
+			sector = bio_end_sector(bio) - io->sector_offset;
 
-	if (unlikely(dm_stats_used(&md->stats)))
 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
-				    bio->bi_iter.bi_sector, bio_sectors(bio),
+				    sector, sectors,
 				    end, start_time, stats_aux);
-
-	/* Restore bio's payload so it does get accounted upon requeue */
-	if (is_flush_with_data)
-		bio->bi_iter.bi_size = bi_size;
+	}
 }
 
-static void __dm_start_io_acct(struct dm_io *io, struct bio *bio)
+static void __dm_start_io_acct(struct dm_io *io)
 {
-	dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux);
+	dm_io_acct(io, false);
 }
 
 static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
 {
-	/* Must account IO to DM device in terms of orig_bio */
-	struct bio *bio = io->orig_bio;
-
 	/*
 	 * Ensure IO accounting is only ever started once.
-	 * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO.
 	 */
-	if (!clone ||
-	    likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) {
-		if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED)))
-			return;
+	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
+		return;
+
+	/* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */
+	if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) {
 		dm_io_set_flag(io, DM_IO_ACCOUNTED);
 	} else {
 		unsigned long flags;
-		if (dm_io_flagged(io, DM_IO_ACCOUNTED))
-			return;
 		/* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
 		spin_lock_irqsave(&io->lock, flags);
 		dm_io_set_flag(io, DM_IO_ACCOUNTED);
 		spin_unlock_irqrestore(&io->lock, flags);
 	}
 
-	__dm_start_io_acct(io, bio);
+	__dm_start_io_acct(io);
 }
 
-static void dm_end_io_acct(struct dm_io *io, struct bio *bio)
+static void dm_end_io_acct(struct dm_io *io)
 {
-	dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux);
+	dm_io_acct(io, true);
 }
 
 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
@@ -569,7 +581,9 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	struct dm_target_io *tio;
 	struct bio *clone;
 
-	clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs);
+	clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->io_bs);
+	/* Set default bdev, but target must bio_set_dev() before issuing IO */
+	clone->bi_bdev = md->disk->part0;
 
 	tio = clone_to_tio(clone);
 	tio->flags = 0;
@@ -578,17 +592,19 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 
 	io = container_of(tio, struct dm_io, tio);
 	io->magic = DM_IO_MAGIC;
-	io->status = 0;
-	atomic_set(&io->io_count, 1);
+	io->status = BLK_STS_OK;
+
+	/* one ref is for submission, the other is for completion */
+	atomic_set(&io->io_count, 2);
 	this_cpu_inc(*md->pending_io);
-	io->orig_bio = NULL;
+	io->orig_bio = bio;
 	io->md = md;
-	io->map_task = current;
 	spin_lock_init(&io->lock);
 	io->start_time = jiffies;
 	io->flags = 0;
 
-	dm_stats_record_start(&md->stats, &io->stats_aux);
+	if (static_branch_unlikely(&stats_enabled))
+		dm_stats_record_start(&md->stats, &io->stats_aux);
 
 	return io;
 }
@@ -599,7 +615,7 @@ static void free_io(struct dm_io *io)
 }
 
 static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
-		unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
+			     unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
 {
 	struct dm_target_io *tio;
 	struct bio *clone;
@@ -610,10 +626,13 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 		/* alloc_io() already initialized embedded clone */
 		clone = &tio->clone;
 	} else {
-		clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio,
-					gfp_mask, &ci->io->md->bs);
+		struct mapped_device *md = ci->io->md;
+
+		clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->bs);
 		if (!clone)
 			return NULL;
+		/* Set default bdev, but target must bio_set_dev() before issuing IO */
+		clone->bi_bdev = md->disk->part0;
 
 		/* REQ_DM_POLL_LIST shouldn't be inherited */
 		clone->bi_opf &= ~REQ_DM_POLL_LIST;
@@ -663,14 +682,16 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
  * function to access the md->map field, and make sure they call
  * dm_put_live_table() when finished.
  */
-struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
+struct dm_table *dm_get_live_table(struct mapped_device *md,
+				   int *srcu_idx) __acquires(md->io_barrier)
 {
 	*srcu_idx = srcu_read_lock(&md->io_barrier);
 
 	return srcu_dereference(md->map, &md->io_barrier);
 }
 
-void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
+void dm_put_live_table(struct mapped_device *md,
+		       int srcu_idx) __releases(md->io_barrier)
 {
 	srcu_read_unlock(&md->io_barrier, srcu_idx);
 }
@@ -696,6 +717,24 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 	rcu_read_unlock();
 }
 
+static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md,
+						     int *srcu_idx, struct bio *bio)
+{
+	if (bio->bi_opf & REQ_NOWAIT)
+		return dm_get_live_table_fast(md);
+	else
+		return dm_get_live_table(md, srcu_idx);
+}
+
+static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx,
+					 struct bio *bio)
+{
+	if (bio->bi_opf & REQ_NOWAIT)
+		dm_put_live_table_fast(md);
+	else
+		dm_put_live_table(md, srcu_idx);
+}
+
 static char *_dm_claim_ptr = "I belong to device-mapper";
 
 /*
@@ -875,14 +914,14 @@ static void dm_io_complete(struct dm_io *io)
 
 	io_error = io->status;
 	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
-		dm_end_io_acct(io, bio);
+		dm_end_io_acct(io);
 	else if (!io_error) {
 		/*
 		 * Must handle target that DM_MAPIO_SUBMITTED only to
 		 * then bio_endio() rather than dm_submit_bio_remap()
 		 */
-		__dm_start_io_acct(io, bio);
-		dm_end_io_acct(io, bio);
+		__dm_start_io_acct(io);
+		dm_end_io_acct(io);
 	}
 	free_io(io);
 	smp_wmb();
@@ -899,7 +938,7 @@ static void dm_io_complete(struct dm_io *io)
 		 * may only reflect a subset of the pre-split original)
 		 * so clear REQ_POLLED in case of requeue.
 		 */
-		bio->bi_opf &= ~REQ_POLLED;
+		bio_clear_polled(bio);
 		if (io_error == BLK_STS_AGAIN) {
 			/* io_uring doesn't handle BLK_STS_AGAIN (yet) */
 			queue_io(md, bio);
@@ -923,30 +962,35 @@ static void dm_io_complete(struct dm_io *io)
 	}
 }
 
-static inline bool dm_tio_is_normal(struct dm_target_io *tio)
-{
-	return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) &&
-		!dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
-}
-
 /*
  * Decrements the number of outstanding ios that a bio has been
  * cloned into, completing the original io if necc.
  */
-void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
+static inline void __dm_io_dec_pending(struct dm_io *io)
 {
+	if (atomic_dec_and_test(&io->io_count))
+		dm_io_complete(io);
+}
+
+static void dm_io_set_error(struct dm_io *io, blk_status_t error)
+{
+	unsigned long flags;
+
 	/* Push-back supersedes any I/O errors */
-	if (unlikely(error)) {
-		unsigned long flags;
-		spin_lock_irqsave(&io->lock, flags);
-		if (!(io->status == BLK_STS_DM_REQUEUE &&
-		      __noflush_suspending(io->md)))
-			io->status = error;
-		spin_unlock_irqrestore(&io->lock, flags);
+	spin_lock_irqsave(&io->lock, flags);
+	if (!(io->status == BLK_STS_DM_REQUEUE &&
+	      __noflush_suspending(io->md))) {
+		io->status = error;
 	}
+	spin_unlock_irqrestore(&io->lock, flags);
+}
 
-	if (atomic_dec_and_test(&io->io_count))
-		dm_io_complete(io);
+static void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
+{
+	if (unlikely(error))
+		dm_io_set_error(io, error);
+
+	__dm_io_dec_pending(io);
 }
 
 void disable_discard(struct mapped_device *md)
@@ -974,35 +1018,43 @@ static void clone_endio(struct bio *bio)
 {
 	blk_status_t error = bio->bi_status;
 	struct dm_target_io *tio = clone_to_tio(bio);
+	struct dm_target *ti = tio->ti;
+	dm_endio_fn endio = ti->type->end_io;
 	struct dm_io *io = tio->io;
-	struct mapped_device *md = tio->io->md;
-	dm_endio_fn endio = tio->ti->type->end_io;
-	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+	struct mapped_device *md = io->md;
 
-	if (unlikely(error == BLK_STS_TARGET)) {
-		if (bio_op(bio) == REQ_OP_DISCARD &&
-		    !bdev_max_discard_sectors(bio->bi_bdev))
-			disable_discard(md);
-		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
-			 !q->limits.max_write_zeroes_sectors)
-			disable_write_zeroes(md);
-	}
+	if (likely(bio->bi_bdev != md->disk->part0)) {
+		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+		if (unlikely(error == BLK_STS_TARGET)) {
+			if (bio_op(bio) == REQ_OP_DISCARD &&
+			    !bdev_max_discard_sectors(bio->bi_bdev))
+				disable_discard(md);
+			else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+				 !q->limits.max_write_zeroes_sectors)
+				disable_write_zeroes(md);
+		}
 
-	if (blk_queue_is_zoned(q))
-		dm_zone_endio(io, bio);
+		if (static_branch_unlikely(&zoned_enabled) &&
+		    unlikely(blk_queue_is_zoned(q)))
+			dm_zone_endio(io, bio);
+	}
 
 	if (endio) {
-		int r = endio(tio->ti, bio, &error);
+		int r = endio(ti, bio, &error);
 		switch (r) {
 		case DM_ENDIO_REQUEUE:
-			/*
-			 * Requeuing writes to a sequential zone of a zoned
-			 * target will break the sequential write pattern:
-			 * fail such IO.
-			 */
-			if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
-				error = BLK_STS_IOERR;
-			else
+			if (static_branch_unlikely(&zoned_enabled)) {
+				/*
+				 * Requeuing writes to a sequential zone of a zoned
+				 * target will break the sequential write pattern:
+				 * fail such IO.
+				 */
+				if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
+					error = BLK_STS_IOERR;
+				else
+					error = BLK_STS_DM_REQUEUE;
+			} else
 				error = BLK_STS_DM_REQUEUE;
 			fallthrough;
 		case DM_ENDIO_DONE:
@@ -1016,10 +1068,9 @@ static void clone_endio(struct bio *bio)
 		}
 	}
 
-	if (unlikely(swap_bios_limit(tio->ti, bio))) {
-		struct mapped_device *md = io->md;
+	if (static_branch_unlikely(&swap_bios_enabled) &&
+	    unlikely(swap_bios_limit(ti, bio)))
 		up(&md->swap_bios_semaphore);
-	}
 
 	free_tio(bio);
 	dm_io_dec_pending(io, error);
@@ -1161,7 +1212,7 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 *	+--------------------+---------------+-------+
 *
 *	<-------------- *tio->len_ptr --------------->
- *			<------- bi_size ------->
+ *			<----- bio_sectors ----->
 *			<-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function.
@@ -1178,25 +1229,25 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 {
 	struct dm_target_io *tio = clone_to_tio(bio);
-	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+	unsigned bio_sectors = bio_sectors(bio);
 
 	BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
 	BUG_ON(op_is_zone_mgmt(bio_op(bio)));
 	BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
-	BUG_ON(bi_size > *tio->len_ptr);
-	BUG_ON(n_sectors > bi_size);
+	BUG_ON(bio_sectors > *tio->len_ptr);
+	BUG_ON(n_sectors > bio_sectors);
 
-	*tio->len_ptr -= bi_size - n_sectors;
+	*tio->len_ptr -= bio_sectors - n_sectors;
 	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
-}
-EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
-
-static inline void __dm_submit_bio_remap(struct bio *clone,
-					 dev_t dev, sector_t old_sector)
-{
-	trace_block_bio_remap(clone, dev, old_sector);
-	submit_bio_noacct(clone);
+
+	/*
+	 * __split_and_process_bio() may have already saved mapped part
+	 * for accounting but it is being reduced so update accordingly.
+	 */
+	dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT);
+	tio->io->sectors = n_sectors;
 }
+EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
 
 /*
  * @clone: clone bio that DM core passed to target's .map function
@@ -1212,8 +1263,6 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
 	struct dm_target_io *tio = clone_to_tio(clone);
 	struct dm_io *io = tio->io;
 
-	WARN_ON_ONCE(!tio->ti->accounts_remapped_io);
-
 	/* establish bio that will get submitted */
 	if (!tgt_clone)
 		tgt_clone = clone;
@@ -1222,22 +1271,11 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
 	 * Account io->origin_bio to DM dev on behalf of target
 	 * that took ownership of IO with DM_MAPIO_SUBMITTED.
 	 */
-	if (io->map_task == current) {
-		/* Still in target's map function */
-		dm_io_set_flag(io, DM_IO_START_ACCT);
-	} else {
-		/*
-		 * Called by another thread, managed by DM target,
-		 * wait for dm_split_and_process_bio() to store
-		 * io->orig_bio
-		 */
-		while (unlikely(!smp_load_acquire(&io->orig_bio)))
-			msleep(1);
-		dm_start_io_acct(io, clone);
-	}
+	dm_start_io_acct(io, clone);
 
-	__dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk),
+	trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk),
 			      tio->old_sector);
+	submit_bio_noacct(tgt_clone);
 }
 EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
 
@@ -1260,55 +1298,53 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
 static void __map_bio(struct bio *clone)
 {
 	struct dm_target_io *tio = clone_to_tio(clone);
-	int r;
-	struct dm_io *io = tio->io;
 	struct dm_target *ti = tio->ti;
+	struct dm_io *io = tio->io;
+	struct mapped_device *md = io->md;
+	int r;
 
 	clone->bi_end_io = clone_endio;
 
 	/*
 	 * Map the clone.
 	 */
-	dm_io_inc_pending(io);
 	tio->old_sector = clone->bi_iter.bi_sector;
 
-	if (unlikely(swap_bios_limit(ti, clone))) {
-		struct mapped_device *md = io->md;
+	if (static_branch_unlikely(&swap_bios_enabled) &&
+	    unlikely(swap_bios_limit(ti, clone))) {
 		int latch = get_swap_bios();
 		if (unlikely(latch != md->swap_bios))
 			__set_swap_bios_limit(md, latch);
 		down(&md->swap_bios_semaphore);
 	}
 
-	/*
-	 * Check if the IO needs a special mapping due to zone append emulation
-	 * on zoned target. In this case, dm_zone_map_bio() calls the target
-	 * map operation.
-	 */
-	if (dm_emulate_zone_append(io->md))
-		r = dm_zone_map_bio(tio);
-	else
+	if (static_branch_unlikely(&zoned_enabled)) {
+		/*
+		 * Check if the IO needs a special mapping due to zone append
+		 * emulation on zoned target. In this case, dm_zone_map_bio()
		 * calls the target map operation.
+		 */
+		if (unlikely(dm_emulate_zone_append(md)))
+			r = dm_zone_map_bio(tio);
+		else
+			r = ti->type->map(ti, clone);
+	} else
 		r = ti->type->map(ti, clone);
 
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
 		/* target has assumed ownership of this io */
 		if (!ti->accounts_remapped_io)
-			dm_io_set_flag(io, DM_IO_START_ACCT);
+			dm_start_io_acct(io, clone);
 		break;
 	case DM_MAPIO_REMAPPED:
-		/*
-		 * the bio has been remapped so dispatch it, but defer
-		 * dm_start_io_acct() until after possible bio_split().
-		 */
-		__dm_submit_bio_remap(clone, disk_devt(io->md->disk),
-				      tio->old_sector);
-		dm_io_set_flag(io, DM_IO_START_ACCT);
+		dm_submit_bio_remap(clone, NULL);
 		break;
 	case DM_MAPIO_KILL:
 	case DM_MAPIO_REQUEUE:
-		if (unlikely(swap_bios_limit(ti, clone)))
-			up(&io->md->swap_bios_semaphore);
+		if (static_branch_unlikely(&swap_bios_enabled) &&
+		    unlikely(swap_bios_limit(ti, clone)))
+			up(&md->swap_bios_semaphore);
 		free_tio(clone);
 		if (r == DM_MAPIO_KILL)
 			dm_io_dec_pending(io, BLK_STS_IOERR);
@@ -1321,6 +1357,31 @@ static void __map_bio(struct bio *clone)
 	}
 }
 
+static void setup_split_accounting(struct clone_info *ci, unsigned len)
+{
+	struct dm_io *io = ci->io;
+
+	if (ci->sector_count > len) {
+		/*
+		 * Split needed, save the mapped part for accounting.
+		 * NOTE: dm_accept_partial_bio() will update accordingly.
+		 */
+		dm_io_set_flag(io, DM_IO_WAS_SPLIT);
+		io->sectors = len;
+	}
+
+	if (static_branch_unlikely(&stats_enabled) &&
+	    unlikely(dm_stats_used(&io->md->stats))) {
+		/*
+		 * Save bi_sector in terms of its offset from end of
+		 * original bio, only needed for DM-stats' benefit.
+		 * - saved regardless of whether split needed so that
+		 *   dm_accept_partial_bio() doesn't need to.
+		 */
+		io->sector_offset = bio_end_sector(ci->bio) - ci->sector;
+	}
+}
+
 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
 				struct dm_target *ti, unsigned num_bios)
 {
@@ -1350,18 +1411,22 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
 	}
 }
 
-static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
+static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
 				  unsigned num_bios, unsigned *len)
 {
 	struct bio_list blist = BIO_EMPTY_LIST;
 	struct bio *clone;
+	int ret = 0;
 
 	switch (num_bios) {
 	case 0:
 		break;
 	case 1:
+		if (len)
+			setup_split_accounting(ci, *len);
 		clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
 		__map_bio(clone);
+		ret = 1;
 		break;
 	default:
 		/* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
@@ -1369,9 +1434,12 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
 		while ((clone = bio_list_pop(&blist))) {
 			dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
 			__map_bio(clone);
+			ret += 1;
 		}
 		break;
 	}
+
+	return ret;
 }
 
 static void __send_empty_flush(struct clone_info *ci)
@@ -1392,8 +1460,19 @@ static void __send_empty_flush(struct clone_info *ci)
 	ci->sector_count = 0;
 	ci->io->tio.clone.bi_iter.bi_size = 0;
 
-	while ((ti = dm_table_get_target(ci->map, target_nr++)))
-		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+	while ((ti = dm_table_get_target(ci->map, target_nr++))) {
+		int bios;
+
+		atomic_add(ti->num_flush_bios, &ci->io->io_count);
+		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+	}
+
+	/*
+	 * alloc_io() takes one extra reference for submission, so the
+	 * reference won't reach 0 without the following subtraction
+	 */
+	atomic_sub(1, &ci->io->io_count);
 
 	bio_uninit(ci->bio);
 }
@@ -1402,11 +1481,18 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
 					unsigned num_bios)
 {
 	unsigned len;
+	int bios;
 
 	len = min_t(sector_t, ci->sector_count,
 		    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
 
-	__send_duplicate_bios(ci, ti, num_bios, &len);
+	atomic_add(num_bios, &ci->io->io_count);
+	bios = __send_duplicate_bios(ci, ti, num_bios, &len);
+	/*
+	 * alloc_io() takes one extra reference for submission, so the
+	 * reference won't reach 0 without the following (+1) subtraction
+	 */
+	atomic_sub(num_bios - bios + 1, &ci->io->io_count);
 
 	ci->sector += len;
 	ci->sector_count -= len;
@@ -1414,21 +1500,24 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
 
 static bool is_abnormal_io(struct bio *bio)
 {
-	bool r = false;
+	unsigned int op = bio_op(bio);
 
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		r = true;
-		break;
+	if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) {
+		switch (op) {
+		case REQ_OP_DISCARD:
+		case REQ_OP_SECURE_ERASE:
+		case REQ_OP_WRITE_ZEROES:
+			return true;
+		default:
+			break;
+		}
 	}
 
-	return r;
+	return false;
 }
 
-static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
-				  int *result)
+static blk_status_t __process_abnormal_io(struct clone_info *ci,
+					  struct dm_target *ti)
 {
 	unsigned num_bios = 0;
 
@@ -1442,8 +1531,6 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
 	case REQ_OP_WRITE_ZEROES:
 		num_bios = ti->num_write_zeroes_bios;
 		break;
-	default:
-		return false;
 	}
 
 	/*
@@ -1452,17 +1539,15 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
 	 * reconfiguration might also have changed that since the
 	 * check was performed.
 	 */
-	if (!num_bios)
-		*result = -EOPNOTSUPP;
-	else {
-		__send_changing_extent_only(ci, ti, num_bios);
-		*result = 0;
-	}
-	return true;
+	if (unlikely(!num_bios))
+		return BLK_STS_NOTSUPP;
+
+	__send_changing_extent_only(ci, ti, num_bios);
+	return BLK_STS_OK;
 }
 
 /*
- * Reuse ->bi_private as hlist head for storing all dm_io instances
+ * Reuse ->bi_private as dm_io list head for storing all dm_io instances
  * associated with this bio, and this bio's bi_private needs to be
  * stored in dm_io->data before the reuse.
 *
@@ -1470,54 +1555,53 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
 * touch it after splitting. Meantime it won't be changed by anyone after
 * bio is submitted. So this reuse is safe.
 */
-static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio)
+static inline struct dm_io **dm_poll_list_head(struct bio *bio)
 {
-	return (struct hlist_head *)&bio->bi_private;
+	return (struct dm_io **)&bio->bi_private;
 }
 
 static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
 {
-	struct hlist_head *head = dm_get_bio_hlist_head(bio);
+	struct dm_io **head = dm_poll_list_head(bio);
 
 	if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
 		bio->bi_opf |= REQ_DM_POLL_LIST;
 		/*
 		 * Save .bi_private into dm_io, so that we can reuse
-		 * .bi_private as hlist head for storing dm_io list
+		 * .bi_private as dm_io list head for storing dm_io list
 		 */
 		io->data = bio->bi_private;
 
-		INIT_HLIST_HEAD(head);
-
 		/* tell block layer to poll for completion */
 		bio->bi_cookie = ~BLK_QC_T_NONE;
+
+		io->next = NULL;
 	} else {
 		/*
 		 * bio recursed due to split, reuse original poll list,
 		 * and save bio->bi_private too.
 		 */
-		io->data = hlist_entry(head->first, struct dm_io, node)->data;
+		io->data = (*head)->data;
+		io->next = *head;
 	}
 
-	hlist_add_head(&io->node, head);
+	*head = io;
 }
 
 /*
 * Select the correct strategy for processing a non-flush bio.
 */
-static int __split_and_process_bio(struct clone_info *ci)
+static blk_status_t __split_and_process_bio(struct clone_info *ci)
 {
 	struct bio *clone;
 	struct dm_target *ti;
 	unsigned len;
-	int r;
 
 	ti = dm_table_find_target(ci->map, ci->sector);
-	if (!ti)
-		return -EIO;
-
-	if (__process_abnormal_io(ci, ti, &r))
-		return r;
+	if (unlikely(!ti))
+		return BLK_STS_IOERR;
+	else if (unlikely(ci->is_abnormal_io))
+		return __process_abnormal_io(ci, ti);
 
 	/*
 	 * Only support bio polling for normal IO, and the target io is
@@ -1526,27 +1610,30 @@ static int __split_and_process_bio(struct clone_info *ci)
 	ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;
 
 	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
+	setup_split_accounting(ci, len);
 	clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
 	__map_bio(clone);
 
 	ci->sector += len;
 	ci->sector_count -= len;
 
-	return 0;
+	return BLK_STS_OK;
 }
 
 static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
-			    struct dm_table *map, struct bio *bio)
+			    struct dm_table *map, struct bio *bio, bool is_abnormal)
 {
 	ci->map = map;
 	ci->io = alloc_io(md, bio);
 	ci->bio = bio;
+	ci->is_abnormal_io = is_abnormal;
 	ci->submit_as_polled = false;
 	ci->sector = bio->bi_iter.bi_sector;
 	ci->sector_count = bio_sectors(bio);
 
 	/* Shouldn't happen but sector_count was being set to 0 so... */
-	if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
+	if (static_branch_unlikely(&zoned_enabled) &&
+	    WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
 		ci->sector_count = 0;
 }
 
@@ -1557,10 +1644,21 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 				     struct dm_table *map, struct bio *bio)
 {
 	struct clone_info ci;
-	struct bio *orig_bio = NULL;
-	int error = 0;
+	struct dm_io *io;
+	blk_status_t error = BLK_STS_OK;
+	bool is_abnormal;
 
-	init_clone_info(&ci, md, map, bio);
+	is_abnormal = is_abnormal_io(bio);
+	if (unlikely(is_abnormal)) {
+		/*
+		 * Use blk_queue_split() for abnormal IO (e.g. discard, etc)
+		 * otherwise associated queue_limits won't be imposed.
+		 */
+		blk_queue_split(&bio);
+	}
+
+	init_clone_info(&ci, md, map, bio, is_abnormal);
+	io = ci.io;
 
 	if (bio->bi_opf & REQ_PREFLUSH) {
 		__send_empty_flush(&ci);
@@ -1569,40 +1667,34 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	}
 
 	error = __split_and_process_bio(&ci);
-	ci.io->map_task = NULL;
 	if (error || !ci.sector_count)
 		goto out;
-
 	/*
 	 * Remainder must be passed to submit_bio_noacct() so it gets handled
 	 * *after* bios already submitted have been completely processed.
-	 * We take a clone of the original to store in ci.io->orig_bio to be
-	 * used by dm_end_io_acct() and for dm_io_complete() to use for
-	 * completion handling.
 	 */
-	orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count,
-			     GFP_NOIO, &md->queue->bio_split);
-	bio_chain(orig_bio, bio);
-	trace_block_split(orig_bio, bio->bi_iter.bi_sector);
+	bio_trim(bio, io->sectors, ci.sector_count);
+	trace_block_split(bio, bio->bi_iter.bi_sector);
+	bio_inc_remaining(bio);
 	submit_bio_noacct(bio);
 out:
-	if (!orig_bio)
-		orig_bio = bio;
-	smp_store_release(&ci.io->orig_bio, orig_bio);
-	if (dm_io_flagged(ci.io, DM_IO_START_ACCT))
-		dm_start_io_acct(ci.io, NULL);
-
 	/*
 	 * Drop the extra reference count for non-POLLED bio, and hold one
 	 * reference for POLLED bio, which will be released in dm_poll_bio
 	 *
-	 * Add every dm_io instance into the hlist_head which is stored in
-	 * bio->bi_private, so that dm_poll_bio can poll them all.
+	 * Add every dm_io instance into the dm_io list head which is stored
+	 * in bio->bi_private, so that dm_poll_bio can poll them all.
 	 */
-	if (error || !ci.submit_as_polled)
-		dm_io_dec_pending(ci.io, errno_to_blk_status(error));
-	else
-		dm_queue_poll_io(bio, ci.io);
+	if (error || !ci.submit_as_polled) {
+		/*
+		 * In case of submission failure, the extra reference for
		 * submitting io isn't consumed yet
+		 */
+		if (error)
+			atomic_dec(&io->io_count);
+		dm_io_dec_pending(io, error);
+	} else
+		dm_queue_poll_io(bio, io);
 }
 
 static void dm_submit_bio(struct bio *bio)
@@ -1611,7 +1703,7 @@ static void dm_submit_bio(struct bio *bio)
 	int srcu_idx;
 	struct dm_table *map;
 
-	map = dm_get_live_table(md, &srcu_idx);
+	map = dm_get_live_table_bio(md, &srcu_idx, bio);
 
 	/* If suspended, or map not yet available, queue this IO for later */
 	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
@@ -1625,16 +1717,9 @@ static void dm_submit_bio(struct bio *bio)
 		goto out;
 	}
 
-	/*
-	 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
-	 * otherwise associated queue_limits won't be imposed.
-	 */
-	if (is_abnormal_io(bio))
-		blk_queue_split(&bio);
-
 	dm_split_and_process_bio(md, map, bio);
 out:
-	dm_put_live_table(md, srcu_idx);
+	dm_put_live_table_bio(md, srcu_idx, bio);
 }
 
 static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
@@ -1653,18 +1738,16 @@ static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
 static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
 		       unsigned int flags)
 {
-	struct hlist_head *head = dm_get_bio_hlist_head(bio);
-	struct hlist_head tmp = HLIST_HEAD_INIT;
-	struct hlist_node *next;
-	struct dm_io *io;
+	struct dm_io **head = dm_poll_list_head(bio);
+	struct dm_io *list = *head;
+	struct dm_io *tmp = NULL;
+	struct dm_io *curr, *next;
 
 	/* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
 	if (!(bio->bi_opf & REQ_DM_POLL_LIST))
 		return 0;
 
-	WARN_ON_ONCE(hlist_empty(head));
-
-	hlist_move_list(head, &tmp);
+	WARN_ON_ONCE(!list);
 
 	/*
 	 * Restore .bi_private before possibly completing dm_io.
@@ -1675,24 +1758,27 @@ static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
 	 * clearing REQ_DM_POLL_LIST here.
 	 */
 	bio->bi_opf &= ~REQ_DM_POLL_LIST;
-	bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data;
+	bio->bi_private = list->data;
 
-	hlist_for_each_entry_safe(io, next, &tmp, node) {
-		if (dm_poll_dm_io(io, iob, flags)) {
-			hlist_del_init(&io->node);
+	for (curr = list, next = curr->next; curr; curr = next, next =
+			curr ? curr->next : NULL) {
+		if (dm_poll_dm_io(curr, iob, flags)) {
 			/*
-			 * clone_endio() has already occurred, so passing
-			 * error as 0 here doesn't override io->status
+			 * clone_endio() has already occurred, so no
			 * error handling is needed here.
 			 */
-			dm_io_dec_pending(io, 0);
+			__dm_io_dec_pending(curr);
+		} else {
+			curr->next = tmp;
+			tmp = curr;
 		}
 	}
 
 	/* Not done? */
-	if (!hlist_empty(&tmp)) {
+	if (tmp) {
 		bio->bi_opf |= REQ_DM_POLL_LIST;
 		/* Reset bio->bi_private to dm_io list head */
-		hlist_move_list(&tmp, head);
+		*head = tmp;
 		return 0;
 	}
 	return 1;
 }
@@ -2901,8 +2987,8 @@ int dm_noflush_suspending(struct dm_target *ti)
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
-					    unsigned integrity, unsigned per_io_data_size,
-					    unsigned min_pool_size)
+					    unsigned per_io_data_size, unsigned min_pool_size,
+					    bool integrity, bool poll)
 {
 	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
 	unsigned int pool_size = 0;
@@ -2918,7 +3004,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
 	pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
 	front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
 	io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
-	ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
+	ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? BIOSET_PERCPU_CACHE : 0);
 	if (ret)
 		goto out;
 	if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))