From 5054e778fcd9cd29ddaa8109077cd235527e4f94 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 1 May 2023 09:19:17 -0400 Subject: dm crypt: allocate compound pages if possible It was reported that allocating pages for the write buffer in dm-crypt causes measurable overhead [1]. Change dm-crypt to allocate compound pages if they are available. If not, fall back to the mempool. [1] https://listman.redhat.com/archives/dm-devel/2023-February/053284.html Suggested-by: Matthew Wilcox Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 49 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 09e37ebf7cc8..dbf13bd1d219 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1661,6 +1661,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone); * In order to not degrade performance with excessive locking, we try * non-blocking allocations without a mutex first but on failure we fallback * to blocking allocations with a mutex. + * + * In order to reduce allocation overhead, we try to allocate compound pages in + * the first pass. If they are not available, we fall back to the mempool. */ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size) { @@ -1668,8 +1671,8 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size) struct bio *clone; unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; - unsigned int i, len, remaining_size; - struct page *page; + unsigned int remaining_size; + unsigned int order = MAX_ORDER - 1; retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -1682,19 +1685,34 @@ retry: remaining_size = size; - for (i = 0; i < nr_iovecs; i++) { - page = mempool_alloc(&cc->page_pool, gfp_mask); - if (!page) { + while (remaining_size) { + struct page *pages; + unsigned size_to_add; + unsigned remaining_order = __fls((remaining_size + PAGE_SIZE - 1) >> PAGE_SHIFT); + order = min(order, remaining_order); + + while (order > 0) { + pages = alloc_pages(gfp_mask + | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | __GFP_COMP, + order); + if (likely(pages != NULL)) + goto have_pages; + order--; + } + + pages = mempool_alloc(&cc->page_pool, gfp_mask); + if (!pages) { crypt_free_buffer_pages(cc, clone); bio_put(clone); gfp_mask |= __GFP_DIRECT_RECLAIM; + order = 0; goto retry; } - len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size; - - __bio_add_page(clone, page, len, 0); - remaining_size -= len; +have_pages: + size_to_add = min((unsigned)PAGE_SIZE << order, remaining_size); + __bio_add_page(clone, pages, size_to_add, 0); + remaining_size -= size_to_add; } /* Allocate space for integrity tags */ @@ -1712,12 +1730,15 @@ retry: static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, clone, iter_all) { - BUG_ON(!bv->bv_page); - mempool_free(bv->bv_page, &cc->page_pool); + if (clone->bi_vcnt > 0) { /* bio_for_each_folio_all crashes with an empty bio */ + bio_for_each_folio_all(fi, clone) { + if (folio_test_large(fi.folio)) + folio_put(fi.folio); + else + mempool_free(&fi.folio->page, &cc->page_pool); + } } } -- cgit v1.2.3-58-ga151 From 1d9a943898533e83f20370c0e1448d606627522e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 1 May 2023 09:19:45 -0400 Subject: dm flakey: clone pages on write bio before corrupting them dm-flakey has an option to corrupt write bios. It corrupts the memory that is being written. This can cause system crashes or security bugs - for example, if the user writes a shared library code with O_DIRECT flag to a dm-flakey device, it corrupts the memory for all users that have the shared library mapped. Fix this bug by cloning the bio and corrupting the clone rather than the original. Also drop the test for ZERO_PAGE(0) - it can't happen because we write the cloned pages. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-flakey.c | 104 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index bd80bcafbe50..079d48b29628 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -322,11 +322,7 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) */ bio_for_each_segment(bvec, bio, iter) { if (bio_iter_len(bio, iter) > corrupt_bio_byte) { - char *segment; - struct page *page = bio_iter_page(bio, iter); - if (unlikely(page == ZERO_PAGE(0))) - break; - segment = bvec_kmap_local(&bvec); + char *segment = bvec_kmap_local(&bvec); segment[corrupt_bio_byte] = fc->corrupt_bio_value; kunmap_local(segment); DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " @@ -340,6 +336,92 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) } } +static void clone_free(struct bio *clone) +{ + struct folio_iter fi; + + if (clone->bi_vcnt > 0) { /* bio_for_each_folio_all crashes with an empty bio */ + bio_for_each_folio_all(fi, clone) + folio_put(fi.folio); + } + + bio_uninit(clone); + kfree(clone); +} + +static void clone_endio(struct bio *clone) +{ + struct bio *bio = clone->bi_private; + bio->bi_status = clone->bi_status; + clone_free(clone); + bio_endio(bio); +} + +static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct bio *bio) +{ + struct bio *clone; + unsigned size, remaining_size, nr_iovecs, order; + struct bvec_iter iter = bio->bi_iter; + + if (unlikely(bio->bi_iter.bi_size > UIO_MAXIOV << PAGE_SHIFT)) + dm_accept_partial_bio(bio, UIO_MAXIOV << PAGE_SHIFT >> SECTOR_SHIFT); + + size = bio->bi_iter.bi_size; + nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + clone = bio_kmalloc(nr_iovecs, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); + if (!clone) + return NULL; + + bio_init(clone, fc->dev->bdev, bio->bi_inline_vecs, nr_iovecs, bio->bi_opf); + + clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); + clone->bi_private = bio; + clone->bi_end_io = clone_endio; + + remaining_size = size; + + order = MAX_ORDER - 1; + while (remaining_size) { + struct page *pages; + unsigned size_to_add, to_copy; + unsigned char *virt; + unsigned remaining_order = __fls((remaining_size + PAGE_SIZE - 1) >> PAGE_SHIFT); + order = min(order, remaining_order); + +retry_alloc_pages: + pages = alloc_pages(GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | __GFP_COMP, order); + if (unlikely(!pages)) { + if (order) { + order--; + goto retry_alloc_pages; + } + clone_free(clone); + return NULL; + } + size_to_add = min((unsigned)PAGE_SIZE << order, remaining_size); + + virt = page_to_virt(pages); + to_copy = size_to_add; + do { + struct bio_vec bvec = bvec_iter_bvec(bio->bi_io_vec, iter); + unsigned this_step = min(bvec.bv_len, to_copy); + void *map = bvec_kmap_local(&bvec); + memcpy(virt, map, this_step); + kunmap_local(map); + + bvec_iter_advance(bio->bi_io_vec, &iter, this_step); + to_copy -= this_step; + virt += this_step; + } while (to_copy); + + __bio_add_page(clone, pages, size_to_add, 0); + remaining_size -= size_to_add; + } + + return clone; +} + static int flakey_map(struct dm_target *ti, struct bio *bio) { struct flakey_c *fc = ti->private; @@ -383,10 +465,14 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) /* * Corrupt matching writes. */ - if (fc->corrupt_bio_byte) { - if (fc->corrupt_bio_rw == WRITE) { - if (all_corrupt_bio_flags_match(bio, fc)) - corrupt_bio_data(bio, fc); + if (fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) { + if (all_corrupt_bio_flags_match(bio, fc)) { + struct bio *clone = clone_bio(ti, fc, bio); + if (clone) { + corrupt_bio_data(clone, fc); + submit_bio(clone); + return DM_MAPIO_SUBMITTED; + } } goto map_bio; } -- cgit v1.2.3-58-ga151 From 4c2c845bdc9a3443ce805460a75242923b0c5ab5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 1 May 2023 09:20:08 -0400 Subject: dm flakey: introduce random_read_corrupt and random_write_corrupt options The random_read_corrupt and random_write_corrupt options corrupt a random byte in a bio with the provided probability. The corruption only happens in the "down" interval. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-flakey.rst | 10 ++ drivers/md/dm-flakey.c | 120 +++++++++++++++++---- 2 files changed, 110 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/Documentation/admin-guide/device-mapper/dm-flakey.rst b/Documentation/admin-guide/device-mapper/dm-flakey.rst index f7104c01b0f7..f967c5fea219 100644 --- a/Documentation/admin-guide/device-mapper/dm-flakey.rst +++ b/Documentation/admin-guide/device-mapper/dm-flakey.rst @@ -67,6 +67,16 @@ Optional feature parameters: Perform the replacement only if bio->bi_opf has all the selected flags set. + random_read_corrupt + During , replace random byte in a read bio + with a random value. probability is an integer between + 0 and 1000000000 meaning 0% to 100% probability of corruption. + + random_write_corrupt + During , replace random byte in a write bio + with a random value. probability is an integer between + 0 and 1000000000 meaning 0% to 100% probability of corruption. + Examples: Replaces the 32nd byte of READ bios with the value 1:: diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 079d48b29628..120153e44ae0 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -16,6 +16,8 @@ #define DM_MSG_PREFIX "flakey" +#define PROBABILITY_BASE 1000000000 + #define all_corrupt_bio_flags_match(bio, fc) \ (((bio)->bi_opf & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) @@ -34,6 +36,8 @@ struct flakey_c { unsigned int corrupt_bio_rw; unsigned int corrupt_bio_value; blk_opf_t corrupt_bio_flags; + unsigned int random_read_corrupt; + unsigned int random_write_corrupt; }; enum feature_flag_bits { @@ -54,10 +58,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, const char *arg_name; static const struct dm_arg _args[] = { - {0, 7, "Invalid number of feature args"}, + {0, 11, "Invalid number of feature args"}, {1, UINT_MAX, "Invalid corrupt bio byte"}, {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, {0, UINT_MAX, "Invalid corrupt bio flags mask"}, + {0, PROBABILITY_BASE, "Invalid random corrupt argument"}, }; /* No feature arguments supplied. */ @@ -170,6 +175,32 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, continue; } + if (!strcasecmp(arg_name, "random_read_corrupt")) { + if (!argc) { + ti->error = "Feature random_read_corrupt requires a parameter"; + return -EINVAL; + } + r = dm_read_arg(_args + 4, as, &fc->random_read_corrupt, &ti->error); + if (r) + return r; + argc--; + + continue; + } + + if (!strcasecmp(arg_name, "random_write_corrupt")) { + if (!argc) { + ti->error = "Feature random_write_corrupt requires a parameter"; + return -EINVAL; + } + r = dm_read_arg(_args + 4, as, &fc->random_write_corrupt, &ti->error); + if (r) + return r; + argc--; + + continue; + } + ti->error = "Unrecognised flakey feature requested"; return -EINVAL; } @@ -184,7 +215,8 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, } if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) && - !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags)) { + !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags) && + !fc->random_read_corrupt && !fc->random_write_corrupt) { set_bit(ERROR_WRITES, &fc->flags); set_bit(ERROR_READS, &fc->flags); } @@ -306,36 +338,57 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); } -static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, + unsigned char corrupt_bio_value) { - unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1; - struct bvec_iter iter; struct bio_vec bvec; - if (!bio_has_data(bio)) - return; - /* * Overwrite the Nth byte of the bio's data, on whichever page * it falls. */ bio_for_each_segment(bvec, bio, iter) { if (bio_iter_len(bio, iter) > corrupt_bio_byte) { - char *segment = bvec_kmap_local(&bvec); - segment[corrupt_bio_byte] = fc->corrupt_bio_value; + unsigned char *segment = bvec_kmap_local(&bvec); + segment[corrupt_bio_byte] = corrupt_bio_value; kunmap_local(segment); DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n", - bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, + bio, corrupt_bio_value, corrupt_bio_byte, (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, - (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size); + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); break; } corrupt_bio_byte -= bio_iter_len(bio, iter); } } +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +{ + unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1; + + if (!bio_has_data(bio)) + return; + + corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value); +} + +static void corrupt_bio_random(struct bio *bio) +{ + unsigned int corrupt_byte; + unsigned char corrupt_value; + + if (!bio_has_data(bio)) + return; + + corrupt_byte = get_random_u32() % bio->bi_iter.bi_size; + corrupt_value = get_random_u8(); + + corrupt_bio_common(bio, corrupt_byte, corrupt_value); +} + static void clone_free(struct bio *clone) { struct folio_iter fi; @@ -436,6 +489,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) /* Are we alive ? */ elapsed = (jiffies - fc->start_time) / HZ; if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { + bool corrupt_fixed, corrupt_random; /* * Flag this bio as submitted while down. */ @@ -465,16 +519,28 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) /* * Corrupt matching writes. */ + corrupt_fixed = false; + corrupt_random = false; if (fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) { - if (all_corrupt_bio_flags_match(bio, fc)) { - struct bio *clone = clone_bio(ti, fc, bio); - if (clone) { + if (all_corrupt_bio_flags_match(bio, fc)) + corrupt_fixed = true; + } + if (fc->random_write_corrupt) { + u64 rnd = get_random_u64(); + u32 rem = do_div(rnd, PROBABILITY_BASE); + if (rem < fc->random_write_corrupt) + corrupt_random = true; + } + if (corrupt_fixed || corrupt_random) { + struct bio *clone = clone_bio(ti, fc, bio); + if (clone) { + if (corrupt_fixed) corrupt_bio_data(clone, fc); - submit_bio(clone); - return DM_MAPIO_SUBMITTED; - } + if (corrupt_random) + corrupt_bio_random(clone); + submit_bio(clone); + return DM_MAPIO_SUBMITTED; } - goto map_bio; } } @@ -503,6 +569,12 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, corrupt_bio_data(bio, fc); } } + if (fc->random_read_corrupt) { + u64 rnd = get_random_u64(); + u32 rem = do_div(rnd, PROBABILITY_BASE); + if (rem < fc->random_read_corrupt) + corrupt_bio_random(bio); + } if (test_bit(ERROR_READS, &fc->flags)) { /* * Error read during the down_interval if drop_writes @@ -535,7 +607,10 @@ static void flakey_status(struct dm_target *ti, status_type_t type, error_reads = test_bit(ERROR_READS, &fc->flags); drop_writes = test_bit(DROP_WRITES, &fc->flags); error_writes = test_bit(ERROR_WRITES, &fc->flags); - DMEMIT(" %u", error_reads + drop_writes + error_writes + (fc->corrupt_bio_byte > 0) * 5); + DMEMIT(" %u", error_reads + drop_writes + error_writes + + (fc->corrupt_bio_byte > 0) * 5 + + (fc->random_read_corrupt > 0) * 2 + + (fc->random_write_corrupt > 0) * 2); if (error_reads) DMEMIT(" error_reads"); @@ -550,6 +625,11 @@ static void flakey_status(struct dm_target *ti, status_type_t type, (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', fc->corrupt_bio_value, fc->corrupt_bio_flags); + if (fc->random_read_corrupt > 0) + DMEMIT(" random_read_corrupt %u", fc->random_read_corrupt); + if (fc->random_write_corrupt > 0) + DMEMIT(" random_write_corrupt %u", fc->random_write_corrupt); + break; case STATUSTYPE_IMA: -- cgit v1.2.3-58-ga151 From c0a7a0ac0707a123f936daccf6639ce1c48840d5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 15 May 2023 16:21:38 -0400 Subject: dm thin: remove return code variable in pool_map Always returns DM_MAPIO_REMAPPED so no need for variable. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 464c6b678417..ed1b7f564481 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3447,7 +3447,6 @@ out_unlock: static int pool_map(struct dm_target *ti, struct bio *bio) { - int r; struct pool_c *pt = ti->private; struct pool *pool = pt->pool; @@ -3456,10 +3455,9 @@ static int pool_map(struct dm_target *ti, struct bio *bio) */ spin_lock_irq(&pool->lock); bio_set_dev(bio, pt->data_dev->bdev); - r = DM_MAPIO_REMAPPED; spin_unlock_irq(&pool->lock); - return r; + return DM_MAPIO_REMAPPED; } static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) -- cgit v1.2.3-58-ga151 From ef6953fb68fe52a13cd154509d1ac9f9748c6051 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 31 May 2023 13:49:47 -0400 Subject: dm thin: update .io_hints methods to not require handling discards last Removes assumptions about what might follow the discard setup code (previously the code would return early if discards not enabled). Makes it possible to add more capabilites to the end of each .io_hints method (which is the natural thing to do when adding new features). Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ed1b7f564481..ebcfd84e8b7f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -4098,21 +4098,20 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) * They get transferred to the live pool in bind_control_target() * called from pool_preresume(). */ - if (!pt->adjusted_pf.discard_enabled) { + + if (pt->adjusted_pf.discard_enabled) { + disable_passdown_if_not_supported(pt); + /* + * The pool uses the same discard limits as the underlying data + * device. DM core has already set this up. + */ + } else { /* * Must explicitly disallow stacking discard limits otherwise the * block layer will stack them if pool's data device has support. */ limits->discard_granularity = 0; - return; } - - disable_passdown_if_not_supported(pt); - - /* - * The pool uses the same discard limits as the underlying data - * device. DM core has already set this up. - */ } static struct target_type pool_target = { @@ -4496,11 +4495,10 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) struct thin_c *tc = ti->private; struct pool *pool = tc->pool; - if (!pool->pf.discard_enabled) - return; - - limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; - limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE; + if (pool->pf.discard_enabled) { + limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; + limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE; + } } static struct target_type thin_target = { -- cgit v1.2.3-58-ga151 From 2a32897c840be1c0a0525f4279b365781acfba24 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 24 May 2023 05:35:29 -0400 Subject: dm crypt: fix crypt_ctr_cipher_new return value on invalid AEAD cipher If the user specifies invalid AEAD cipher, dm-crypt should return the error returned from crypt_ctr_auth_spec, not -ENOMEM. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index dbf13bd1d219..98622a15df30 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2908,7 +2908,7 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key ret = crypt_ctr_auth_cipher(cc, cipher_api); if (ret < 0) { ti->error = "Invalid AEAD cipher spec"; - return -ENOMEM; + return ret; } } -- cgit v1.2.3-58-ga151 From d48300120627a1cb98914738fff38b424625b8ad Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Mon, 5 Jun 2023 15:03:16 +0800 Subject: dm thin metadata: Fix ABBA deadlock by resetting dm_bufio_client As described in commit 8111964f1b85 ("dm thin: Fix ABBA deadlock between shrink_slab and dm_pool_abort_metadata"), ABBA deadlocks will be triggered because shrinker_rwsem currently needs to held by dm_pool_abort_metadata() as a side-effect of thin-pool metadata operation failure. The following three problem scenarios have been noticed: 1) Described by commit 8111964f1b85 ("dm thin: Fix ABBA deadlock between shrink_slab and dm_pool_abort_metadata") 2) shrinker_rwsem and throttle->lock P1(drop cache) P2(kworker) drop_caches_sysctl_handler drop_slab shrink_slab down_read(&shrinker_rwsem) - LOCK A do_shrink_slab super_cache_scan prune_icache_sb dispose_list evict ext4_evict_inode ext4_clear_inode ext4_discard_preallocations ext4_mb_load_buddy_gfp ext4_mb_init_cache ext4_wait_block_bitmap __ext4_error ext4_handle_error ext4_commit_super ... dm_submit_bio do_worker throttle_work_update down_write(&t->lock) -- LOCK B process_deferred_bios commit metadata_operation_failed dm_pool_abort_metadata dm_block_manager_create dm_bufio_client_create register_shrinker down_write(&shrinker_rwsem) -- LOCK A thin_map thin_bio_map thin_defer_bio_with_throttle throttle_lock down_read(&t->lock) - LOCK B 3) shrinker_rwsem and wait_on_buffer P1(drop cache) P2(kworker) drop_caches_sysctl_handler drop_slab shrink_slab down_read(&shrinker_rwsem) - LOCK A do_shrink_slab ... ext4_wait_block_bitmap __ext4_error ext4_handle_error jbd2_journal_abort jbd2_journal_update_sb_errno jbd2_write_superblock submit_bh // LOCK B // RELEASE B do_worker throttle_work_update down_write(&t->lock) - LOCK B process_deferred_bios process_bio commit metadata_operation_failed dm_pool_abort_metadata dm_block_manager_create dm_bufio_client_create register_shrinker register_shrinker_prepared down_write(&shrinker_rwsem) - LOCK A bio_endio wait_on_buffer __wait_on_buffer Fix these by resetting dm_bufio_client without holding shrinker_rwsem. Fixes: 8111964f1b85 ("dm thin: Fix ABBA deadlock between shrink_slab and dm_pool_abort_metadata") Cc: stable@vger.kernel.org Signed-off-by: Li Lingfeng Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 7 +++ drivers/md/dm-thin-metadata.c | 58 ++++++++++------------ drivers/md/persistent-data/dm-block-manager.c | 6 +++ drivers/md/persistent-data/dm-block-manager.h | 1 + drivers/md/persistent-data/dm-space-map.h | 3 +- .../md/persistent-data/dm-transaction-manager.c | 3 ++ include/linux/dm-bufio.h | 2 + 7 files changed, 46 insertions(+), 34 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index eea977662e81..a7079b38756a 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -2592,6 +2592,13 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); +void dm_bufio_client_reset(struct dm_bufio_client *c) +{ + drop_buffers(c); + flush_work(&c->shrink_work); +} +EXPORT_SYMBOL_GPL(dm_bufio_client_reset); + void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start) { c->start = start; diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 9f5cb52c5763..63d92d388ee6 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -603,6 +603,8 @@ static int __format_metadata(struct dm_pool_metadata *pmd) r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION, &pmd->tm, &pmd->metadata_sm); if (r < 0) { + pmd->tm = NULL; + pmd->metadata_sm = NULL; DMERR("tm_create_with_sm failed"); return r; } @@ -611,6 +613,7 @@ static int __format_metadata(struct dm_pool_metadata *pmd) if (IS_ERR(pmd->data_sm)) { DMERR("sm_disk_create failed"); r = PTR_ERR(pmd->data_sm); + pmd->data_sm = NULL; goto bad_cleanup_tm; } @@ -641,11 +644,15 @@ static int __format_metadata(struct dm_pool_metadata *pmd) bad_cleanup_nb_tm: dm_tm_destroy(pmd->nb_tm); + pmd->nb_tm = NULL; bad_cleanup_data_sm: dm_sm_destroy(pmd->data_sm); + pmd->data_sm = NULL; bad_cleanup_tm: dm_tm_destroy(pmd->tm); + pmd->tm = NULL; dm_sm_destroy(pmd->metadata_sm); + pmd->metadata_sm = NULL; return r; } @@ -711,6 +718,8 @@ static int __open_metadata(struct dm_pool_metadata *pmd) sizeof(disk_super->metadata_space_map_root), &pmd->tm, &pmd->metadata_sm); if (r < 0) { + pmd->tm = NULL; + pmd->metadata_sm = NULL; DMERR("tm_open_with_sm failed"); goto bad_unlock_sblock; } @@ -720,6 +729,7 @@ static int __open_metadata(struct dm_pool_metadata *pmd) if (IS_ERR(pmd->data_sm)) { DMERR("sm_disk_open failed"); r = PTR_ERR(pmd->data_sm); + pmd->data_sm = NULL; goto bad_cleanup_tm; } @@ -746,9 +756,12 @@ static int __open_metadata(struct dm_pool_metadata *pmd) bad_cleanup_data_sm: dm_sm_destroy(pmd->data_sm); + pmd->data_sm = NULL; bad_cleanup_tm: dm_tm_destroy(pmd->tm); + pmd->tm = NULL; dm_sm_destroy(pmd->metadata_sm); + pmd->metadata_sm = NULL; bad_unlock_sblock: dm_bm_unlock(sblock); @@ -795,9 +808,13 @@ static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd, bool destroy_bm) { dm_sm_destroy(pmd->data_sm); + pmd->data_sm = NULL; dm_sm_destroy(pmd->metadata_sm); + pmd->metadata_sm = NULL; dm_tm_destroy(pmd->nb_tm); + pmd->nb_tm = NULL; dm_tm_destroy(pmd->tm); + pmd->tm = NULL; if (destroy_bm) dm_block_manager_destroy(pmd->bm); } @@ -1005,8 +1022,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) __func__, r); } pmd_write_unlock(pmd); - if (!pmd->fail_io) - __destroy_persistent_data_objects(pmd, true); + __destroy_persistent_data_objects(pmd, true); kfree(pmd); return 0; @@ -1877,53 +1893,29 @@ static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd) int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) { int r = -EINVAL; - struct dm_block_manager *old_bm = NULL, *new_bm = NULL; /* fail_io is double-checked with pmd->root_lock held below */ if (unlikely(pmd->fail_io)) return r; - /* - * Replacement block manager (new_bm) is created and old_bm destroyed outside of - * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of - * shrinker associated with the block manager's bufio client vs pmd root_lock). - * - must take shrinker_mutex without holding pmd->root_lock - */ - new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, - THIN_MAX_CONCURRENT_LOCKS); - pmd_write_lock(pmd); if (pmd->fail_io) { pmd_write_unlock(pmd); - goto out; + return r; } - __set_abort_with_changes_flags(pmd); + + /* destroy data_sm/metadata_sm/nb_tm/tm */ __destroy_persistent_data_objects(pmd, false); - old_bm = pmd->bm; - if (IS_ERR(new_bm)) { - DMERR("could not create block manager during abort"); - pmd->bm = NULL; - r = PTR_ERR(new_bm); - goto out_unlock; - } - pmd->bm = new_bm; + /* reset bm */ + dm_block_manager_reset(pmd->bm); + + /* rebuild data_sm/metadata_sm/nb_tm/tm */ r = __open_or_format_metadata(pmd, false); - if (r) { - pmd->bm = NULL; - goto out_unlock; - } - new_bm = NULL; -out_unlock: if (r) pmd->fail_io = true; pmd_write_unlock(pmd); - dm_block_manager_destroy(old_bm); -out: - if (new_bm && !IS_ERR(new_bm)) - dm_block_manager_destroy(new_bm); - return r; } diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 7bdfc23f758a..0e010e1204aa 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -421,6 +421,12 @@ void dm_block_manager_destroy(struct dm_block_manager *bm) } EXPORT_SYMBOL_GPL(dm_block_manager_destroy); +void dm_block_manager_reset(struct dm_block_manager *bm) +{ + dm_bufio_client_reset(bm->bufio); +} +EXPORT_SYMBOL_GPL(dm_block_manager_reset); + unsigned int dm_bm_block_size(struct dm_block_manager *bm) { return dm_bufio_get_block_size(bm->bufio); diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h index 5746b0f82a03..f706d3de8d5a 100644 --- a/drivers/md/persistent-data/dm-block-manager.h +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -36,6 +36,7 @@ struct dm_block_manager *dm_block_manager_create( struct block_device *bdev, unsigned int block_size, unsigned int max_held_per_thread); void dm_block_manager_destroy(struct dm_block_manager *bm); +void dm_block_manager_reset(struct dm_block_manager *bm); unsigned int dm_bm_block_size(struct dm_block_manager *bm); dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm); diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h index dab490353781..6bf69922b5ad 100644 --- a/drivers/md/persistent-data/dm-space-map.h +++ b/drivers/md/persistent-data/dm-space-map.h @@ -77,7 +77,8 @@ struct dm_space_map { static inline void dm_sm_destroy(struct dm_space_map *sm) { - sm->destroy(sm); + if (sm) + sm->destroy(sm); } static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks) diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 6dc016248baf..c88fa6266203 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -199,6 +199,9 @@ EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); void dm_tm_destroy(struct dm_transaction_manager *tm) { + if (!tm) + return; + if (!tm->is_clone) wipe_shadow_table(tm); diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 681656a1c03d..75e7d8cbb532 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -38,6 +38,8 @@ dm_bufio_client_create(struct block_device *bdev, unsigned int block_size, */ void dm_bufio_client_destroy(struct dm_bufio_client *c); +void dm_bufio_client_reset(struct dm_bufio_client *c); + /* * Set the sector range. * When this function is called, there must be no I/O in progress on the bufio -- cgit v1.2.3-58-ga151 From e118029cb7605a89bf334a83023fc0c102420954 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 7 Jun 2023 05:26:27 +0200 Subject: dm zone: Use the bitmap API to allocate bitmaps Use bitmap_zalloc()/bitmap_free() instead of hand-writing them. It is less verbose and it improves the semantic. Signed-off-by: Christophe JAILLET Signed-off-by: Mike Snitzer --- drivers/md/dm-zone.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 4b82b7798ce4..eb9832b22b14 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "dm-core.h" @@ -140,9 +141,9 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) void dm_cleanup_zoned_dev(struct mapped_device *md) { if (md->disk) { - kfree(md->disk->conv_zones_bitmap); + bitmap_free(md->disk->conv_zones_bitmap); md->disk->conv_zones_bitmap = NULL; - kfree(md->disk->seq_zones_wlock); + bitmap_free(md->disk->seq_zones_wlock); md->disk->seq_zones_wlock = NULL; } @@ -182,9 +183,8 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: if (!disk->conv_zones_bitmap) { - disk->conv_zones_bitmap = - kcalloc(BITS_TO_LONGS(disk->nr_zones), - sizeof(unsigned long), GFP_NOIO); + disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones, + GFP_NOIO); if (!disk->conv_zones_bitmap) return -ENOMEM; } @@ -193,9 +193,8 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: if (!disk->seq_zones_wlock) { - disk->seq_zones_wlock = - kcalloc(BITS_TO_LONGS(disk->nr_zones), - sizeof(unsigned long), GFP_NOIO); + disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones, + GFP_NOIO); if (!disk->seq_zones_wlock) return -ENOMEM; } -- cgit v1.2.3-58-ga151 From 526d10061bc29b314cc41f3b8322606df9172f14 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Tue, 13 Jun 2023 09:33:32 +0800 Subject: dm: support turning off block-core's io stats accounting Commit bc58ba9468d9 ("block: add sysfs file for controlling io stats accounting") allowed users to turn off disk stat accounting completely by checking if queue flag QUEUE_FLAG_IO_STAT is set. In dm, this flag is neither set nor checked: so block-core's io stats are continuously counted and cannot be turned off. Add support for turning off block-core's io stats accounting for dm. Set QUEUE_FLAG_IO_STAT for dm's request_queue. If QUEUE_FLAG_IO_STAT is set when an io starts, record the need for block core's io stats by setting the DM_IO_BLK_STAT dm_io flag to avoid io stats being disabled in the middle of the io. DM statistics (dm-stats) is independent of block-core's io stats and remains unchanged. Signed-off-by: Li Nan Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 3 ++- drivers/md/dm.c | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index ce913ad91a52..0d93661f88d3 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -306,7 +306,8 @@ struct dm_io { */ enum { DM_IO_ACCOUNTED, - DM_IO_WAS_SPLIT + DM_IO_WAS_SPLIT, + DM_IO_BLK_STAT }; static inline bool dm_io_flagged(struct dm_io *io, unsigned int bit) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ca2dc079c3f4..c8b3d686dc18 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -511,11 +511,14 @@ static void dm_io_acct(struct dm_io *io, bool end) else sectors = io->sectors; - if (!end) - bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time); - else - bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors, - start_time); + if (dm_io_flagged(io, DM_IO_BLK_STAT)) { + if (!end) + bdev_start_io_acct(bio->bi_bdev, bio_op(bio), + start_time); + else + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), + sectors, start_time); + } if (static_branch_unlikely(&stats_enabled) && unlikely(dm_stats_used(&md->stats))) { @@ -592,6 +595,8 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) spin_lock_init(&io->lock); io->start_time = jiffies; io->flags = 0; + if (blk_queue_io_stat(md->queue)) + dm_io_set_flag(io, DM_IO_BLK_STAT); if (static_branch_unlikely(&stats_enabled)) dm_stats_record_start(&md->stats, &io->stats_aux); @@ -2341,6 +2346,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) break; case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, md->queue); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); -- cgit v1.2.3-58-ga151 From 06eed768ea64c7a128582efda4f6107cb14ee962 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Jun 2023 15:19:42 -0400 Subject: dm: avoid needless dm_io access if all IO accounting is disabled Update dm_io_acct() to eliminate most dm_io struct accesses if both block core's IO stats and dm-stats are disabled. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c8b3d686dc18..658323aff676 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -487,51 +487,50 @@ u64 dm_start_time_ns_from_clone(struct bio *bio) } EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); -static bool bio_is_flush_with_data(struct bio *bio) +static inline bool bio_is_flush_with_data(struct bio *bio) { return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); } -static void dm_io_acct(struct dm_io *io, bool end) +static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio) { - struct dm_stats_aux *stats_aux = &io->stats_aux; - unsigned long start_time = io->start_time; - struct mapped_device *md = io->md; - struct bio *bio = io->orig_bio; - unsigned int sectors; - /* * If REQ_PREFLUSH set, don't account payload, it will be * submitted (and accounted) after this flush completes. */ if (bio_is_flush_with_data(bio)) - sectors = 0; - else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT)))) - sectors = bio_sectors(bio); - else - sectors = io->sectors; + return 0; + if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT))) + return io->sectors; + return bio_sectors(bio); +} + +static void dm_io_acct(struct dm_io *io, bool end) +{ + struct bio *bio = io->orig_bio; if (dm_io_flagged(io, DM_IO_BLK_STAT)) { if (!end) bdev_start_io_acct(bio->bi_bdev, bio_op(bio), - start_time); + io->start_time); else bdev_end_io_acct(bio->bi_bdev, bio_op(bio), - sectors, start_time); + dm_io_sectors(io, bio), + io->start_time); } if (static_branch_unlikely(&stats_enabled) && - unlikely(dm_stats_used(&md->stats))) { + unlikely(dm_stats_used(&io->md->stats))) { sector_t sector; - if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT))) - sector = bio->bi_iter.bi_sector; - else + if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT))) sector = bio_end_sector(bio) - io->sector_offset; + else + sector = bio->bi_iter.bi_sector; - dm_stats_account_io(&md->stats, bio_data_dir(bio), - sector, sectors, - end, start_time, stats_aux); + dm_stats_account_io(&io->md->stats, bio_data_dir(bio), + sector, dm_io_sectors(io, bio), + end, io->start_time, &io->stats_aux); } } -- cgit v1.2.3-58-ga151 From c4f512d255e3c4ade80a1d68ca816c1b11556a13 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 13 Jun 2023 16:02:17 -0400 Subject: dm: skip dm-stats work in alloc_io() unless needed Don't dm_stats_record_start() if dm_stats_used() is false. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 658323aff676..ea1671c39ba1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -597,7 +597,8 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) if (blk_queue_io_stat(md->queue)) dm_io_set_flag(io, DM_IO_BLK_STAT); - if (static_branch_unlikely(&stats_enabled)) + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&md->stats))) dm_stats_record_start(&md->stats, &io->stats_aux); return io; -- cgit v1.2.3-58-ga151 From 862c6663c12ba217e8e920dc6dd158383ea5cf76 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 15 Jun 2023 16:41:20 -0400 Subject: dm: remove stale/redundant dm_internal_{suspend,resume} prototypes in dm.h dm_internal_suspend() no longer exists. Signed-off-by: Mike Snitzer --- drivers/md/dm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 63d9010d8e61..f682295af91f 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -210,9 +210,6 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d); int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, unsigned int cookie, bool need_resize_uevent); -void dm_internal_suspend(struct mapped_device *md); -void dm_internal_resume(struct mapped_device *md); - int dm_io_init(void); void dm_io_exit(void); -- cgit v1.2.3-58-ga151 From fa375646241b5350f7326fd4d686891b95d9fbe5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 16 Jun 2023 17:21:24 -0400 Subject: dm thin: disable discards for thin-pool if no_discard_passdown Also rename disable_passdown_if_not_supported to disable_discard_passdown_if_not_supported. And fold passdown_enabled() into only caller. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ebcfd84e8b7f..5b0c2f004dbb 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2528,16 +2528,11 @@ static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) /*----------------------------------------------------------------*/ -static bool passdown_enabled(struct pool_c *pt) -{ - return pt->adjusted_pf.discard_passdown; -} - static void set_discard_callbacks(struct pool *pool) { struct pool_c *pt = pool->ti->private; - if (passdown_enabled(pt)) { + if (pt->adjusted_pf.discard_passdown) { pool->process_discard_cell = process_discard_cell_passdown; pool->process_prepared_discard = process_prepared_discard_passdown_pt1; pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2; @@ -2846,7 +2841,7 @@ static bool is_factor(sector_t block_size, uint32_t n) * If discard_passdown was enabled verify that the data device * supports discards. Disable discard_passdown if not. */ -static void disable_passdown_if_not_supported(struct pool_c *pt) +static void disable_discard_passdown_if_not_supported(struct pool_c *pt) { struct pool *pool = pt->pool; struct block_device *data_bdev = pt->data_dev->bdev; @@ -4100,7 +4095,9 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (pt->adjusted_pf.discard_enabled) { - disable_passdown_if_not_supported(pt); + disable_discard_passdown_if_not_supported(pt); + if (!pt->adjusted_pf.discard_passdown) + limits->max_discard_sectors = 0; /* * The pool uses the same discard limits as the underlying data * device. DM core has already set this up. -- cgit v1.2.3-58-ga151 From 25c9a4ab4d73d251886e6b317181cfc433e011f9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 13 Jun 2023 00:47:51 +0300 Subject: dm integrity: Use %*ph for printing hexdump of a small buffer The kernel already has a helper to print a hexdump of a small buffer via pointer extension. Use that instead of open coded variant. In long term it helps to kill pr_cont() or at least narrow down its use. Note, the format is slightly changed, i.e. the trailing space is always printed. Also the IV dump is limited by 64 bytes which seems fine. Signed-off-by: Andy Shevchenko Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 31838b13ea54..5e5f1c029b75 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -342,24 +342,9 @@ static struct kmem_cache *journal_io_cache; #define JOURNAL_IO_MEMPOOL 32 #ifdef DEBUG_PRINT -#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) -static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) -{ - va_list args; - - va_start(args, msg); - vprintk(msg, args); - va_end(args); - if (len) - pr_cont(":"); - while (len) { - pr_cont(" %02x", *bytes); - bytes++; - len--; - } - pr_cont("\n"); -} -#define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__) +#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) +#define DEBUG_bytes(bytes, len, msg, ...) printk(KERN_DEBUG msg "%s%*ph\n", ##__VA_ARGS__, \ + len ? ": " : "", len, bytes) #else #define DEBUG_print(x, ...) do { } while (0) #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) -- cgit v1.2.3-58-ga151 From b60528d9e68113e2c297c3a45102332cb1d3e608 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 3 Jun 2023 10:52:39 -0400 Subject: dm ioctl: Check dm_target_spec is sufficiently aligned Otherwise subsequent code, if given malformed input, could dereference a misaligned 'struct dm_target_spec *'. Signed-off-by: Demi Marie Obenour Signed-off-by: Stephen Rothwell # use %zu Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 8ba4cbb92351..3a6989b7817d 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1394,6 +1394,15 @@ static inline blk_mode_t get_mode(struct dm_ioctl *param) static int next_target(struct dm_target_spec *last, uint32_t next, void *end, struct dm_target_spec **spec, char **target_params) { + static_assert(__alignof__(struct dm_target_spec) <= 8, + "struct dm_target_spec must not require more than 8-byte alignment"); + + if (next % __alignof__(struct dm_target_spec)) { + DMERR("Next dm_target_spec (offset %u) is not %zu-byte aligned", + next, __alignof__(struct dm_target_spec)); + return -EINVAL; + } + *spec = (struct dm_target_spec *) ((unsigned char *) last + next); *target_params = (char *) (*spec + 1); -- cgit v1.2.3-58-ga151 From 13f4a697f8b4feb705569f9336127e9e2f9ac596 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 3 Jun 2023 10:52:40 -0400 Subject: dm ioctl: Avoid pointer arithmetic overflow Especially on 32-bit systems, it is possible for the pointer arithmetic to overflow and cause a userspace pointer to be dereferenced in the kernel. Signed-off-by: Demi Marie Obenour Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'drivers') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 3a6989b7817d..e322fd490634 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1397,6 +1397,22 @@ static int next_target(struct dm_target_spec *last, uint32_t next, void *end, static_assert(__alignof__(struct dm_target_spec) <= 8, "struct dm_target_spec must not require more than 8-byte alignment"); + /* + * Number of bytes remaining, starting with last. This is always + * sizeof(struct dm_target_spec) or more, as otherwise *last was + * out of bounds already. + */ + size_t remaining = (char *)end - (char *)last; + + /* + * There must be room for both the next target spec and the + * NUL-terminator of the target itself. + */ + if (remaining - sizeof(struct dm_target_spec) <= next) { + DMERR("Target spec extends beyond end of parameters"); + return -EINVAL; + } + if (next % __alignof__(struct dm_target_spec)) { DMERR("Next dm_target_spec (offset %u) is not %zu-byte aligned", next, __alignof__(struct dm_target_spec)); -- cgit v1.2.3-58-ga151 From 10655c7a48570315343fdd9cc6acb261d57c2c7a Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 3 Jun 2023 10:52:41 -0400 Subject: dm ioctl: structs and parameter strings must not overlap The NUL terminator for each target parameter string must precede the following 'struct dm_target_spec'. Otherwise, dm_split_args() might corrupt this struct. Furthermore, the first 'struct dm_target_spec' must come after the 'struct dm_ioctl', as if it overlaps too much dm_split_args() could corrupt the 'struct dm_ioctl'. Signed-off-by: Demi Marie Obenour Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index e322fd490634..a92abbe90981 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1391,7 +1391,7 @@ static inline blk_mode_t get_mode(struct dm_ioctl *param) return mode; } -static int next_target(struct dm_target_spec *last, uint32_t next, void *end, +static int next_target(struct dm_target_spec *last, uint32_t next, const char *end, struct dm_target_spec **spec, char **target_params) { static_assert(__alignof__(struct dm_target_spec) <= 8, @@ -1402,7 +1402,7 @@ static int next_target(struct dm_target_spec *last, uint32_t next, void *end, * sizeof(struct dm_target_spec) or more, as otherwise *last was * out of bounds already. */ - size_t remaining = (char *)end - (char *)last; + size_t remaining = end - (char *)last; /* * There must be room for both the next target spec and the @@ -1422,10 +1422,7 @@ static int next_target(struct dm_target_spec *last, uint32_t next, void *end, *spec = (struct dm_target_spec *) ((unsigned char *) last + next); *target_params = (char *) (*spec + 1); - if (*spec < (last + 1)) - return -EINVAL; - - return invalid_str(*target_params, end); + return 0; } static int populate_table(struct dm_table *table, @@ -1435,8 +1432,9 @@ static int populate_table(struct dm_table *table, unsigned int i = 0; struct dm_target_spec *spec = (struct dm_target_spec *) param; uint32_t next = param->data_start; - void *end = (void *) param + param_size; + const char *const end = (const char *) param + param_size; char *target_params; + size_t min_size = sizeof(struct dm_ioctl); if (!param->target_count) { DMERR("%s: no targets specified", __func__); @@ -1444,6 +1442,13 @@ static int populate_table(struct dm_table *table, } for (i = 0; i < param->target_count; i++) { + const char *nul_terminator; + + if (next < min_size) { + DMERR("%s: next target spec (offset %u) overlaps %s", + __func__, next, i ? "previous target" : "'struct dm_ioctl'"); + return -EINVAL; + } r = next_target(spec, next, end, &spec, &target_params); if (r) { @@ -1451,6 +1456,15 @@ static int populate_table(struct dm_table *table, return r; } + nul_terminator = memchr(target_params, 0, (size_t)(end - target_params)); + if (nul_terminator == NULL) { + DMERR("%s: target parameters not NUL-terminated", __func__); + return -EINVAL; + } + + /* Add 1 for NUL terminator */ + min_size = (size_t)(nul_terminator - (const char *)spec) + 1; + r = dm_table_add_target(table, spec->target_type, (sector_t) spec->sector_start, (sector_t) spec->length, -- cgit v1.2.3-58-ga151 From 249bed821b4db6d95a99160f7d6d236ea5fe6362 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 3 Jun 2023 10:52:42 -0400 Subject: dm ioctl: Avoid double-fetch of version The version is fetched once in check_version(), which then does some validation and then overwrites the version in userspace with the API version supported by the kernel. copy_params() then fetches the version from userspace *again*, and this time no validation is done. The result is that the kernel's version number is completely controllable by userspace, provided that userspace can win a race condition. Fix this flaw by not copying the version back to the kernel the second time. This is not exploitable as the version is not further used in the kernel. However, it could become a problem if future patches start relying on the version field. Cc: stable@vger.kernel.org Signed-off-by: Demi Marie Obenour Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a92abbe90981..bfaebc02833a 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1872,30 +1872,36 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) * As well as checking the version compatibility this always * copies the kernel interface version out. */ -static int check_version(unsigned int cmd, struct dm_ioctl __user *user) +static int check_version(unsigned int cmd, struct dm_ioctl __user *user, + struct dm_ioctl *kernel_params) { - uint32_t version[3]; int r = 0; - if (copy_from_user(version, user->version, sizeof(version))) + /* Make certain version is first member of dm_ioctl struct */ + BUILD_BUG_ON(offsetof(struct dm_ioctl, version) != 0); + + if (copy_from_user(kernel_params->version, user->version, sizeof(kernel_params->version))) return -EFAULT; - if ((version[0] != DM_VERSION_MAJOR) || - (version[1] > DM_VERSION_MINOR)) { + if ((kernel_params->version[0] != DM_VERSION_MAJOR) || + (kernel_params->version[1] > DM_VERSION_MINOR)) { DMERR("ioctl interface mismatch: kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", DM_VERSION_MAJOR, DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, - version[0], version[1], version[2], cmd); + kernel_params->version[0], + kernel_params->version[1], + kernel_params->version[2], + cmd); r = -EINVAL; } /* * Fill in the kernel version. */ - version[0] = DM_VERSION_MAJOR; - version[1] = DM_VERSION_MINOR; - version[2] = DM_VERSION_PATCHLEVEL; - if (copy_to_user(user->version, version, sizeof(version))) + kernel_params->version[0] = DM_VERSION_MAJOR; + kernel_params->version[1] = DM_VERSION_MINOR; + kernel_params->version[2] = DM_VERSION_PATCHLEVEL; + if (copy_to_user(user->version, kernel_params->version, sizeof(kernel_params->version))) return -EFAULT; return r; @@ -1921,7 +1927,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern const size_t minimum_data_size = offsetof(struct dm_ioctl, data); unsigned int noio_flag; - if (copy_from_user(param_kernel, user, minimum_data_size)) + /* check_version() already copied version from userspace, avoid TOCTOU */ + if (copy_from_user((char *)param_kernel + sizeof(param_kernel->version), + (char __user *)user + sizeof(param_kernel->version), + minimum_data_size - sizeof(param_kernel->version))) return -EFAULT; if (param_kernel->data_size < minimum_data_size) { @@ -2033,7 +2042,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us * Check the interface version passed in. This also * writes out the kernel's interface version. */ - r = check_version(cmd, user); + r = check_version(cmd, user, ¶m_kernel); if (r) return r; -- cgit v1.2.3-58-ga151 From a85f1a9de91a59cd9b12d60f631cbda9c56a1c3c Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 3 Jun 2023 10:52:43 -0400 Subject: dm ioctl: Refuse to create device named "control" Typical userspace setups create a symlink under /dev/mapper with the name of the device, but /dev/mapper/control is reserved for DM's control device. Therefore, trying to create such a device is almost certain to be a userspace bug. Signed-off-by: Demi Marie Obenour Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index bfaebc02833a..e172a91e88dc 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -767,7 +767,12 @@ static int get_target_version(struct file *filp, struct dm_ioctl *param, size_t static int check_name(const char *name) { if (strchr(name, '/')) { - DMERR("invalid device name"); + DMERR("device name cannot contain '/'"); + return -EINVAL; + } + + if (strcmp(name, DM_CONTROL_NODE) == 0) { + DMERR("device name cannot be \"%s\"", DM_CONTROL_NODE); return -EINVAL; } -- cgit v1.2.3-58-ga151 From 81ca2dbefaabe1a2ca1c7cfc84dfd45c072c82a6 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 3 Jun 2023 10:52:44 -0400 Subject: dm ioctl: Refuse to create device named "." or ".." Using either of these is going to greatly confuse userspace, as they are not valid symlink names and so creating the usual /dev/mapper/NAME symlink will not be possible. As creating a device with either of these names is almost certainly a userspace bug, just error out. Signed-off-by: Demi Marie Obenour Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index e172a91e88dc..16244a7b193c 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -771,8 +771,10 @@ static int check_name(const char *name) return -EINVAL; } - if (strcmp(name, DM_CONTROL_NODE) == 0) { - DMERR("device name cannot be \"%s\"", DM_CONTROL_NODE); + if (strcmp(name, DM_CONTROL_NODE) == 0 || + strcmp(name, ".") == 0 || + strcmp(name, "..") == 0) { + DMERR("device name cannot be \"%s\", \".\", or \"..\"", DM_CONTROL_NODE); return -EINVAL; } -- cgit v1.2.3-58-ga151 From 6d50eb4725934fd22f5eeccb401000687c790fd0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Jun 2023 16:44:34 +0200 Subject: dm integrity: reduce vmalloc space footprint on 32-bit architectures It was reported that dm-integrity runs out of vmalloc space on 32-bit architectures. On x86, there is only 128MiB vmalloc space and dm-integrity consumes it quickly because it has a 64MiB journal and 8MiB recalculate buffer. Fix this by reducing the size of the journal to 4MiB and the size of the recalculate buffer to 1MiB, so that multiple dm-integrity devices can be created and activated on 32-bit architectures. Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 5e5f1c029b75..0a910bb8db17 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -34,11 +34,11 @@ #define DEFAULT_BUFFER_SECTORS 128 #define DEFAULT_JOURNAL_WATERMARK 50 #define DEFAULT_SYNC_MSEC 10000 -#define DEFAULT_MAX_JOURNAL_SECTORS 131072 +#define DEFAULT_MAX_JOURNAL_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 131072 : 8192) #define MIN_LOG2_INTERLEAVE_SECTORS 3 #define MAX_LOG2_INTERLEAVE_SECTORS 31 #define METADATA_WORKQUEUE_MAX_ACTIVE 16 -#define RECALC_SECTORS 32768 +#define RECALC_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 32768 : 2048) #define RECALC_WRITE_SUPER 16 #define BITMAP_BLOCK_SIZE 4096 /* don't change it */ #define BITMAP_FLUSH_INTERVAL (10 * HZ) -- cgit v1.2.3-58-ga151 From da8b4fc1f63a01a0eca9338ae338b804c437b51f Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Jun 2023 16:46:00 +0200 Subject: dm integrity: only allocate recalculate buffer when needed dm-integrity preallocated 8MiB buffer for recalculating in the constructor and freed it in the destructor. This wastes memory when the user has many dm-integrity devices. Fix dm-integrity so that the buffer is only allocated when recalculation is in progress; allocate the buffer at the beginning of integrity_recalc() and free it at the end. Note that integrity_recalc() doesn't hold any locks when allocating the buffer, so it shouldn't cause low-memory deadlock. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 52 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 0a910bb8db17..16d1aa263066 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -251,8 +251,6 @@ struct dm_integrity_c { struct workqueue_struct *recalc_wq; struct work_struct recalc_work; - u8 *recalc_buffer; - u8 *recalc_tags; struct bio_list flush_bio_list; @@ -2646,6 +2644,9 @@ static void recalc_write_super(struct dm_integrity_c *ic) static void integrity_recalc(struct work_struct *w) { struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work); + size_t recalc_tags_size; + u8 *recalc_buffer = NULL; + u8 *recalc_tags = NULL; struct dm_integrity_range range; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -2658,6 +2659,20 @@ static void integrity_recalc(struct work_struct *w) int r; unsigned int super_counter = 0; + recalc_buffer = __vmalloc(RECALC_SECTORS << SECTOR_SHIFT, GFP_NOIO); + if (!recalc_buffer) { + DMCRIT("out of memory for recalculate buffer - recalculation disabled"); + goto free_ret; + } + recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size; + if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) + recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; + recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO); + if (!recalc_tags) { + DMCRIT("out of memory for recalculate buffer - recalculation disabled"); + goto free_ret; + } + DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector)); spin_lock_irq(&ic->endio_wait.lock); @@ -2720,7 +2735,7 @@ next_chunk: io_req.bi_opf = REQ_OP_READ; io_req.mem.type = DM_IO_VMA; - io_req.mem.ptr.addr = ic->recalc_buffer; + io_req.mem.ptr.addr = recalc_buffer; io_req.notify.fn = NULL; io_req.client = ic->io; io_loc.bdev = ic->dev->bdev; @@ -2733,15 +2748,15 @@ next_chunk: goto err; } - t = ic->recalc_tags; + t = recalc_tags; for (i = 0; i < n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); t += ic->tag_size; } metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset); - r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE); + r = dm_integrity_rw_tag(ic, recalc_tags, &metadata_block, &metadata_offset, t - recalc_tags, TAG_WRITE); if (unlikely(r)) { dm_integrity_io_error(ic, "writing tags", r); goto err; @@ -2769,12 +2784,16 @@ advance_and_next: err: remove_range(ic, &range); - return; + goto free_ret; unlock_ret: spin_unlock_irq(&ic->endio_wait.lock); recalc_write_super(ic); + +free_ret: + vfree(recalc_buffer); + kvfree(recalc_tags); } static void bitmap_block_work(struct work_struct *w) @@ -4439,8 +4458,6 @@ try_smaller_buffer: } if (ic->internal_hash) { - size_t recalc_tags_size; - ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1); if (!ic->recalc_wq) { ti->error = "Cannot allocate workqueue"; @@ -4448,21 +4465,6 @@ try_smaller_buffer: goto bad; } INIT_WORK(&ic->recalc_work, integrity_recalc); - ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT); - if (!ic->recalc_buffer) { - ti->error = "Cannot allocate buffer for recalculating"; - r = -ENOMEM; - goto bad; - } - recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size; - if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) - recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; - ic->recalc_tags = kvmalloc(recalc_tags_size, GFP_KERNEL); - if (!ic->recalc_tags) { - ti->error = "Cannot allocate tags for recalculating"; - r = -ENOMEM; - goto bad; - } } else { if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { ti->error = "Recalculate can only be specified with internal_hash"; @@ -4606,8 +4608,6 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->writer_wq); if (ic->recalc_wq) destroy_workqueue(ic->recalc_wq); - vfree(ic->recalc_buffer); - kvfree(ic->recalc_tags); kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); -- cgit v1.2.3-58-ga151 From 3be1622895af25101f7046ed0b2286bead2219d4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Jun 2023 16:46:57 +0200 Subject: dm integrity: scale down the recalculate buffer if memory allocation fails If memory allocation fails, try to reduce the size of the recalculate buffer and continue with that smaller buffer. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 16d1aa263066..5ca3fc62e8f3 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2658,19 +2658,25 @@ static void integrity_recalc(struct work_struct *w) unsigned int i; int r; unsigned int super_counter = 0; + unsigned recalc_sectors = RECALC_SECTORS; - recalc_buffer = __vmalloc(RECALC_SECTORS << SECTOR_SHIFT, GFP_NOIO); +retry: + recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO); if (!recalc_buffer) { +oom: + recalc_sectors >>= 1; + if (recalc_sectors >= 1U << ic->sb->log2_sectors_per_block) + goto retry; DMCRIT("out of memory for recalculate buffer - recalculation disabled"); goto free_ret; } - recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size; + recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO); if (!recalc_tags) { - DMCRIT("out of memory for recalculate buffer - recalculation disabled"); - goto free_ret; + vfree(recalc_buffer); + goto oom; } DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector)); @@ -2693,7 +2699,7 @@ next_chunk: } get_area_and_offset(ic, range.logical_sector, &area, &offset); - range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); + range.n_sectors = min((sector_t)recalc_sectors, ic->provided_data_sectors - range.logical_sector); if (!ic->meta_dev) range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned int)offset); -- cgit v1.2.3-58-ga151 From e2c789cab60a493a72b42cb53eb5fbf96d5f1ae3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 26 Jun 2023 16:48:40 +0200 Subject: dm: get rid of GFP_NOIO workarounds for __vmalloc and kvmalloc In the past, the function __vmalloc didn't respect the GFP flags - it allocated memory with the provided gfp flags, but it allocated page tables with GFP_KERNEL. This was fixed in commit 451769ebb7e7 ("mm/vmalloc: alloc GFP_NO{FS,IO} for vmalloc") so the memalloc_noio_{save,restore} workaround is no longer needed. The function kvmalloc didn't like flags different from GFP_KERNEL. This was fixed in commit a421ef303008 ("mm: allow !GFP_KERNEL allocations for kvmalloc"), so kvmalloc can now be called with GFP_NOIO. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 17 ----------------- drivers/md/dm-ioctl.c | 5 +---- 2 files changed, 1 insertion(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index a7079b38756a..bc309e41d074 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1157,23 +1157,6 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, *data_mode = DATA_MODE_VMALLOC; - /* - * __vmalloc allocates the data pages and auxiliary structures with - * gfp_flags that were specified, but pagetables are always allocated - * with GFP_KERNEL, no matter what was specified as gfp_mask. - * - * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that - * all allocations done by this process (including pagetables) are done - * as if GFP_NOIO was specified. - */ - if (gfp_mask & __GFP_NORETRY) { - unsigned int noio_flag = memalloc_noio_save(); - void *ptr = __vmalloc(c->block_size, gfp_mask); - - memalloc_noio_restore(noio_flag); - return ptr; - } - return __vmalloc(c->block_size, gfp_mask); } diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 16244a7b193c..8e14a4a0996d 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1932,7 +1932,6 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern struct dm_ioctl *dmi; int secure_data; const size_t minimum_data_size = offsetof(struct dm_ioctl, data); - unsigned int noio_flag; /* check_version() already copied version from userspace, avoid TOCTOU */ if (copy_from_user((char *)param_kernel + sizeof(param_kernel->version), @@ -1962,9 +1961,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern * Use kmalloc() rather than vmalloc() when we can. */ dmi = NULL; - noio_flag = memalloc_noio_save(); - dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH); - memalloc_noio_restore(noio_flag); + dmi = kvmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH); if (!dmi) { if (secure_data && clear_user(user, param_kernel->data_size)) -- cgit v1.2.3-58-ga151