diff options
author | Mike Snitzer <snitzer@redhat.com> | 2015-06-26 10:01:13 -0400 |
---|---|---|
committer | Mike Snitzer <snitzer@redhat.com> | 2015-06-26 10:11:58 -0400 |
commit | 78d8e58a086b214dddf1fd463e20a7e1d82d7866 (patch) | |
tree | 2868b9a09eb82b8ecf37ec96fc494838dfdee63a /drivers/md | |
parent | 4e6e36c3714364b65f2bfea8c73691c663891726 (diff) |
Revert "block, dm: don't copy bios for request clones"
This reverts commit 5f1b670d0bef508a5554d92525f5f6d00d640b38.
Justification for revert as reported in this dm-devel post:
https://www.redhat.com/archives/dm-devel/2015-June/msg00160.html
this change should not be pushed to mainline yet.
Firstly, Christoph has a newer version of the patch that fixes silent
data corruption problem:
https://www.redhat.com/archives/dm-devel/2015-May/msg00229.html
And the new version still depends on LLDDs to always complete requests
to the end when error happens, while block API doesn't enforce such a
requirement. If the assumption is ever broken, the inconsistency between
request and bio (e.g. rq->__sector and rq->bio) will cause silent data
corruption:
https://www.redhat.com/archives/dm-devel/2015-June/msg00022.html
Reported-by: Junichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-table.c | 25 | ||||
-rw-r--r-- | drivers/md/dm.c | 171 | ||||
-rw-r--r-- | drivers/md/dm.h | 5 |
3 files changed, 142 insertions, 59 deletions
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a5f94125ad01..16ba55ad7089 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -942,28 +942,21 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * { unsigned type = dm_table_get_type(t); unsigned per_bio_data_size = 0; + struct dm_target *tgt; unsigned i; - switch (type) { - case DM_TYPE_BIO_BASED: - for (i = 0; i < t->num_targets; i++) { - struct dm_target *tgt = t->targets + i; - - per_bio_data_size = max(per_bio_data_size, - tgt->per_bio_data_size); - } - t->mempools = dm_alloc_bio_mempools(t->integrity_supported, - per_bio_data_size); - break; - case DM_TYPE_REQUEST_BASED: - case DM_TYPE_MQ_REQUEST_BASED: - t->mempools = dm_alloc_rq_mempools(md, type); - break; - default: + if (unlikely(type == DM_TYPE_NONE)) { DMWARN("no table type is set, can't allocate mempools"); return -EINVAL; } + if (type == DM_TYPE_BIO_BASED) + for (i = 0; i < t->num_targets; i++) { + tgt = t->targets + i; + per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size); + } + + t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size); if (!t->mempools) return -ENOMEM; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 492181e16c69..9d942aef0f75 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -993,6 +993,57 @@ static void clone_endio(struct bio *bio, int error) dec_pending(io, error); } +/* + * Partial completion handling for request-based dm + */ +static void end_clone_bio(struct bio *clone, int error) +{ + struct dm_rq_clone_bio_info *info = + container_of(clone, struct dm_rq_clone_bio_info, clone); + struct dm_rq_target_io *tio = info->tio; + struct bio *bio = info->orig; + unsigned int nr_bytes = info->orig->bi_iter.bi_size; + + bio_put(clone); + + if (tio->error) + /* + * An error has already been detected on the request. + * Once error occurred, just let clone->end_io() handle + * the remainder. + */ + return; + else if (error) { + /* + * Don't notice the error to the upper layer yet. + * The error handling decision is made by the target driver, + * when the request is completed. + */ + tio->error = error; + return; + } + + /* + * I/O for the bio successfully completed. + * Notice the data completion to the upper layer. + */ + + /* + * bios are processed from the head of the list. + * So the completing bio should always be rq->bio. + * If it's not, something wrong is happening. + */ + if (tio->orig->bio != bio) + DMERR("bio completion is going in the middle of the request"); + + /* + * Update the original request. + * Do not use blk_end_request() here, because it may complete + * the original request before the clone, and break the ordering. + */ + blk_update_request(tio->orig, 0, nr_bytes); +} + static struct dm_rq_target_io *tio_from_request(struct request *rq) { return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special); @@ -1050,6 +1101,8 @@ static void free_rq_clone(struct request *clone) struct dm_rq_target_io *tio = clone->end_io_data; struct mapped_device *md = tio->md; + blk_rq_unprep_clone(clone); + if (md->type == DM_TYPE_MQ_REQUEST_BASED) /* stacked on blk-mq queue(s) */ tio->ti->type->release_clone_rq(clone); @@ -1784,13 +1837,39 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq) dm_complete_request(rq, r); } -static void setup_clone(struct request *clone, struct request *rq, - struct dm_rq_target_io *tio) +static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, + void *data) { - blk_rq_prep_clone(clone, rq); + struct dm_rq_target_io *tio = data; + struct dm_rq_clone_bio_info *info = + container_of(bio, struct dm_rq_clone_bio_info, clone); + + info->orig = bio_orig; + info->tio = tio; + bio->bi_end_io = end_clone_bio; + + return 0; +} + +static int setup_clone(struct request *clone, struct request *rq, + struct dm_rq_target_io *tio, gfp_t gfp_mask) +{ + int r; + + r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, + dm_rq_bio_constructor, tio); + if (r) + return r; + + clone->cmd = rq->cmd; + clone->cmd_len = rq->cmd_len; + clone->sense = rq->sense; clone->end_io = end_clone_request; clone->end_io_data = tio; + tio->clone = clone; + + return 0; } static struct request *clone_rq(struct request *rq, struct mapped_device *md, @@ -1811,7 +1890,12 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md, clone = tio->clone; blk_rq_init(NULL, clone); - setup_clone(clone, rq, tio); + if (setup_clone(clone, rq, tio, gfp_mask)) { + /* -ENOMEM */ + if (alloc_clone) + free_clone_request(md, clone); + return NULL; + } return clone; } @@ -1905,7 +1989,11 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq, } if (r != DM_MAPIO_REMAPPED) return r; - setup_clone(clone, rq, tio); + if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { + /* -ENOMEM */ + ti->type->release_clone_rq(clone); + return DM_MAPIO_REQUEUE; + } } switch (r) { @@ -2375,6 +2463,8 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) goto out; } + BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); + md->io_pool = p->io_pool; p->io_pool = NULL; md->rq_pool = p->rq_pool; @@ -3481,23 +3571,48 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity, - unsigned per_bio_data_size) +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, + unsigned integrity, unsigned per_bio_data_size) { - struct dm_md_mempools *pools; - unsigned int pool_size = dm_get_reserved_bio_based_ios(); + struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); + struct kmem_cache *cachep = NULL; + unsigned int pool_size = 0; unsigned int front_pad; - pools = kzalloc(sizeof(*pools), GFP_KERNEL); if (!pools) return NULL; - front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + - offsetof(struct dm_target_io, clone); + type = filter_md_type(type, md); - pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache); - if (!pools->io_pool) - goto out; + switch (type) { + case DM_TYPE_BIO_BASED: + cachep = _io_cache; + pool_size = dm_get_reserved_bio_based_ios(); + front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); + break; + case DM_TYPE_REQUEST_BASED: + cachep = _rq_tio_cache; + pool_size = dm_get_reserved_rq_based_ios(); + pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); + if (!pools->rq_pool) + goto out; + /* fall through to setup remaining rq-based pools */ + case DM_TYPE_MQ_REQUEST_BASED: + if (!pool_size) + pool_size = dm_get_reserved_rq_based_ios(); + front_pad = offsetof(struct dm_rq_clone_bio_info, clone); + /* per_bio_data_size is not used. See __bind_mempools(). */ + WARN_ON(per_bio_data_size != 0); + break; + default: + BUG(); + } + + if (cachep) { + pools->io_pool = mempool_create_slab_pool(pool_size, cachep); + if (!pools->io_pool) + goto out; + } pools->bs = bioset_create_nobvec(pool_size, front_pad); if (!pools->bs) @@ -3507,34 +3622,10 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity, goto out; return pools; -out: - dm_free_md_mempools(pools); - return NULL; -} - -struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, - unsigned type) -{ - unsigned int pool_size = dm_get_reserved_rq_based_ios(); - struct dm_md_mempools *pools; - - pools = kzalloc(sizeof(*pools), GFP_KERNEL); - if (!pools) - return NULL; - if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) { - pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); - if (!pools->rq_pool) - goto out; - } - - pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache); - if (!pools->io_pool) - goto out; - - return pools; out: dm_free_md_mempools(pools); + return NULL; } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index e6e66d087b26..6123c2bf9150 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -222,9 +222,8 @@ void dm_kcopyd_exit(void); /* * Mempool operations */ -struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity, - unsigned per_bio_data_size); -struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type); +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, + unsigned integrity, unsigned per_bio_data_size); void dm_free_md_mempools(struct dm_md_mempools *pools); /* |