summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-12-13 14:13:15 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-12-13 14:13:15 -0800
commit15da849c910da05622c75c5741dd6e176a8b6fe2 (patch)
tree62541d69d79a3327120b8e406709fd0749e7b069
parent22ff311af9c7d0eca4e9d276e95c4793a6ecf84f (diff)
parent7fc979f8204fb763e203d3e716c17d352eb96b35 (diff)
Merge tag 'for-5.5/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper fixes from Mike Snitzer: - Fix DM multipath by restoring full path selector functionality for bio-based configurations that don't have a SCSI device handler. - Fix dm-btree removal to ensure non-root btree nodes have at least (max_entries / 3) entries. This resolves userspace thin_check utility's report of "too few entries in btree_node". - Fix both the DM thin-provisioning and dm-clone targets to properly flush the data device prior to metadata commit. This resolves the potential for inconsistency across a power loss event when the data device has a volatile writeback cache. - Small documentation fixes to dm-clone and dm-integrity. * tag 'for-5.5/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: docs: dm-integrity: remove reference to ARC4 dm thin: Flush data device before committing metadata dm thin metadata: Add support for a pre-commit callback dm clone: Flush destination device before committing metadata dm clone metadata: Use a two phase commit dm clone metadata: Track exact changes per transaction dm btree: increase rebalance threshold in __rebalance2() dm: add dm-clone to the documentation index dm mpath: remove harmful bio-based optimization
-rw-r--r--Documentation/admin-guide/device-mapper/dm-integrity.rst2
-rw-r--r--Documentation/admin-guide/device-mapper/index.rst1
-rw-r--r--drivers/md/dm-clone-metadata.c136
-rw-r--r--drivers/md/dm-clone-metadata.h17
-rw-r--r--drivers/md/dm-clone-target.c53
-rw-r--r--drivers/md/dm-mpath.c37
-rw-r--r--drivers/md/dm-thin-metadata.c29
-rw-r--r--drivers/md/dm-thin-metadata.h7
-rw-r--r--drivers/md/dm-thin.c42
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c8
10 files changed, 248 insertions, 84 deletions
diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst
index 594095b54b29..c00f9f11e3f3 100644
--- a/Documentation/admin-guide/device-mapper/dm-integrity.rst
+++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst
@@ -144,7 +144,7 @@ journal_crypt:algorithm(:key) (the key is optional)
Encrypt the journal using given algorithm to make sure that the
attacker can't read the journal. You can use a block cipher here
(such as "cbc(aes)") or a stream cipher (for example "chacha20",
- "salsa20", "ctr(aes)" or "ecb(arc4)").
+ "salsa20" or "ctr(aes)").
The journal contains history of last writes to the block device,
an attacker reading the journal could see the last sector nubmers
diff --git a/Documentation/admin-guide/device-mapper/index.rst b/Documentation/admin-guide/device-mapper/index.rst
index 4872fb6d2952..ec62fcc8eece 100644
--- a/Documentation/admin-guide/device-mapper/index.rst
+++ b/Documentation/admin-guide/device-mapper/index.rst
@@ -8,6 +8,7 @@ Device Mapper
cache-policies
cache
delay
+ dm-clone
dm-crypt
dm-dust
dm-flakey
diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index 08c552e5e41b..c05b12110456 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -67,23 +67,34 @@ struct superblock_disk {
* To save constantly doing look ups on disk we keep an in core copy of the
* on-disk bitmap, the region_map.
*
- * To further reduce metadata I/O overhead we use a second bitmap, the dmap
- * (dirty bitmap), which tracks the dirty words, i.e. longs, of the region_map.
+ * In order to track which regions are hydrated during a metadata transaction,
+ * we use a second set of bitmaps, the dmap (dirty bitmap), which includes two
+ * bitmaps, namely dirty_regions and dirty_words. The dirty_regions bitmap
+ * tracks the regions that got hydrated during the current metadata
+ * transaction. The dirty_words bitmap tracks the dirty words, i.e. longs, of
+ * the dirty_regions bitmap.
+ *
+ * This allows us to precisely track the regions that were hydrated during the
+ * current metadata transaction and update the metadata accordingly, when we
+ * commit the current transaction. This is important because dm-clone should
+ * only commit the metadata of regions that were properly flushed to the
+ * destination device beforehand. Otherwise, in case of a crash, we could end
+ * up with a corrupted dm-clone device.
*
* When a region finishes hydrating dm-clone calls
* dm_clone_set_region_hydrated(), or for discard requests
* dm_clone_cond_set_range(), which sets the corresponding bits in region_map
* and dmap.
*
- * During a metadata commit we scan the dmap for dirty region_map words (longs)
- * and update accordingly the on-disk metadata. Thus, we don't have to flush to
- * disk the whole region_map. We can just flush the dirty region_map words.
+ * During a metadata commit we scan dmap->dirty_words and dmap->dirty_regions
+ * and update the on-disk metadata accordingly. Thus, we don't have to flush to
+ * disk the whole region_map. We can just flush the dirty region_map bits.
*
- * We use a dirty bitmap, which is smaller than the original region_map, to
- * reduce the amount of memory accesses during a metadata commit. As dm-bitset
- * accesses the on-disk bitmap in 64-bit word granularity, there is no
- * significant benefit in tracking the dirty region_map bits with a smaller
- * granularity.
+ * We use the helper dmap->dirty_words bitmap, which is smaller than the
+ * original region_map, to reduce the amount of memory accesses during a
+ * metadata commit. Moreover, as dm-bitset also accesses the on-disk bitmap in
+ * 64-bit word granularity, the dirty_words bitmap helps us avoid useless disk
+ * accesses.
*
* We could update directly the on-disk bitmap, when dm-clone calls either
* dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), buts this
@@ -92,12 +103,13 @@ struct superblock_disk {
* e.g., in a hooked overwrite bio's completion routine, and further reduce the
* I/O completion latency.
*
- * We maintain two dirty bitmaps. During a metadata commit we atomically swap
- * the currently used dmap with the unused one. This allows the metadata update
- * functions to run concurrently with an ongoing commit.
+ * We maintain two dirty bitmap sets. During a metadata commit we atomically
+ * swap the currently used dmap with the unused one. This allows the metadata
+ * update functions to run concurrently with an ongoing commit.
*/
struct dirty_map {
unsigned long *dirty_words;
+ unsigned long *dirty_regions;
unsigned int changed;
};
@@ -115,6 +127,9 @@ struct dm_clone_metadata {
struct dirty_map dmap[2];
struct dirty_map *current_dmap;
+ /* Protected by lock */
+ struct dirty_map *committing_dmap;
+
/*
* In core copy of the on-disk bitmap to save constantly doing look ups
* on disk.
@@ -461,34 +476,53 @@ static size_t bitmap_size(unsigned long nr_bits)
return BITS_TO_LONGS(nr_bits) * sizeof(long);
}
-static int dirty_map_init(struct dm_clone_metadata *cmd)
+static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words,
+ unsigned long nr_regions)
{
- cmd->dmap[0].changed = 0;
- cmd->dmap[0].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
+ dmap->changed = 0;
- if (!cmd->dmap[0].dirty_words) {
- DMERR("Failed to allocate dirty bitmap");
+ dmap->dirty_words = kvzalloc(bitmap_size(nr_words), GFP_KERNEL);
+ if (!dmap->dirty_words)
+ return -ENOMEM;
+
+ dmap->dirty_regions = kvzalloc(bitmap_size(nr_regions), GFP_KERNEL);
+ if (!dmap->dirty_regions) {
+ kvfree(dmap->dirty_words);
return -ENOMEM;
}
- cmd->dmap[1].changed = 0;
- cmd->dmap[1].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
+ return 0;
+}
+
+static void __dirty_map_exit(struct dirty_map *dmap)
+{
+ kvfree(dmap->dirty_words);
+ kvfree(dmap->dirty_regions);
+}
+
+static int dirty_map_init(struct dm_clone_metadata *cmd)
+{
+ if (__dirty_map_init(&cmd->dmap[0], cmd->nr_words, cmd->nr_regions)) {
+ DMERR("Failed to allocate dirty bitmap");
+ return -ENOMEM;
+ }
- if (!cmd->dmap[1].dirty_words) {
+ if (__dirty_map_init(&cmd->dmap[1], cmd->nr_words, cmd->nr_regions)) {
DMERR("Failed to allocate dirty bitmap");
- kvfree(cmd->dmap[0].dirty_words);
+ __dirty_map_exit(&cmd->dmap[0]);
return -ENOMEM;
}
cmd->current_dmap = &cmd->dmap[0];
+ cmd->committing_dmap = NULL;
return 0;
}
static void dirty_map_exit(struct dm_clone_metadata *cmd)
{
- kvfree(cmd->dmap[0].dirty_words);
- kvfree(cmd->dmap[1].dirty_words);
+ __dirty_map_exit(&cmd->dmap[0]);
+ __dirty_map_exit(&cmd->dmap[1]);
}
static int __load_bitset_in_core(struct dm_clone_metadata *cmd)
@@ -633,21 +667,23 @@ unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd
return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
}
-static int __update_metadata_word(struct dm_clone_metadata *cmd, unsigned long word)
+static int __update_metadata_word(struct dm_clone_metadata *cmd,
+ unsigned long *dirty_regions,
+ unsigned long word)
{
int r;
unsigned long index = word * BITS_PER_LONG;
unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG);
while (index < max_index) {
- if (test_bit(index, cmd->region_map)) {
+ if (test_bit(index, dirty_regions)) {
r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root,
index, &cmd->bitset_root);
-
if (r) {
DMERR("dm_bitset_set_bit failed");
return r;
}
+ __clear_bit(index, dirty_regions);
}
index++;
}
@@ -721,7 +757,7 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
if (word == cmd->nr_words)
break;
- r = __update_metadata_word(cmd, word);
+ r = __update_metadata_word(cmd, dmap->dirty_regions, word);
if (r)
return r;
@@ -743,15 +779,17 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
return 0;
}
-int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
+int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd)
{
- int r = -EPERM;
+ int r = 0;
struct dirty_map *dmap, *next_dmap;
down_write(&cmd->lock);
- if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) {
+ r = -EPERM;
goto out;
+ }
/* Get current dirty bitmap */
dmap = cmd->current_dmap;
@@ -763,7 +801,7 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
* The last commit failed, so we don't have a clean dirty-bitmap to
* use.
*/
- if (WARN_ON(next_dmap->changed)) {
+ if (WARN_ON(next_dmap->changed || cmd->committing_dmap)) {
r = -EINVAL;
goto out;
}
@@ -773,11 +811,33 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
cmd->current_dmap = next_dmap;
spin_unlock_irq(&cmd->bitmap_lock);
- /*
- * No one is accessing the old dirty bitmap anymore, so we can flush
- * it.
- */
- r = __flush_dmap(cmd, dmap);
+ /* Set old dirty bitmap as currently committing */
+ cmd->committing_dmap = dmap;
+out:
+ up_write(&cmd->lock);
+
+ return r;
+}
+
+int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
+{
+ int r = -EPERM;
+
+ down_write(&cmd->lock);
+
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+ goto out;
+
+ if (WARN_ON(!cmd->committing_dmap)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ r = __flush_dmap(cmd, cmd->committing_dmap);
+ if (!r) {
+ /* Clear committing dmap */
+ cmd->committing_dmap = NULL;
+ }
out:
up_write(&cmd->lock);
@@ -802,6 +862,7 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re
dmap = cmd->current_dmap;
__set_bit(word, dmap->dirty_words);
+ __set_bit(region_nr, dmap->dirty_regions);
__set_bit(region_nr, cmd->region_map);
dmap->changed = 1;
@@ -830,6 +891,7 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
if (!test_bit(region_nr, cmd->region_map)) {
word = region_nr / BITS_PER_LONG;
__set_bit(word, dmap->dirty_words);
+ __set_bit(region_nr, dmap->dirty_regions);
__set_bit(region_nr, cmd->region_map);
dmap->changed = 1;
}
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h
index 3fe50a781c11..14af1ebd853f 100644
--- a/drivers/md/dm-clone-metadata.h
+++ b/drivers/md/dm-clone-metadata.h
@@ -75,7 +75,23 @@ void dm_clone_metadata_close(struct dm_clone_metadata *cmd);
/*
* Commit dm-clone metadata to disk.
+ *
+ * We use a two phase commit:
+ *
+ * 1. dm_clone_metadata_pre_commit(): Prepare the current transaction for
+ * committing. After this is called, all subsequent metadata updates, done
+ * through either dm_clone_set_region_hydrated() or
+ * dm_clone_cond_set_range(), will be part of the **next** transaction.
+ *
+ * 2. dm_clone_metadata_commit(): Actually commit the current transaction to
+ * disk and start a new transaction.
+ *
+ * This allows dm-clone to flush the destination device after step (1) to
+ * ensure that all freshly hydrated regions, for which we are updating the
+ * metadata, are properly written to non-volatile storage and won't be lost in
+ * case of a crash.
*/
+int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd);
int dm_clone_metadata_commit(struct dm_clone_metadata *cmd);
/*
@@ -112,6 +128,7 @@ int dm_clone_metadata_abort(struct dm_clone_metadata *cmd);
* Switches metadata to a read only mode. Once read-only mode has been entered
* the following functions will return -EPERM:
*
+ * dm_clone_metadata_pre_commit()
* dm_clone_metadata_commit()
* dm_clone_set_region_hydrated()
* dm_clone_cond_set_range()
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index b3d89072d21c..d1e1b5b56b1b 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -86,6 +86,12 @@ struct clone {
struct dm_clone_metadata *cmd;
+ /*
+ * bio used to flush the destination device, before committing the
+ * metadata.
+ */
+ struct bio flush_bio;
+
/* Region hydration hash table */
struct hash_table_bucket *ht;
@@ -1108,10 +1114,13 @@ static bool need_commit_due_to_time(struct clone *clone)
/*
* A non-zero return indicates read-only or fail mode.
*/
-static int commit_metadata(struct clone *clone)
+static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
{
int r = 0;
+ if (dest_dev_flushed)
+ *dest_dev_flushed = false;
+
mutex_lock(&clone->commit_lock);
if (!dm_clone_changed_this_transaction(clone->cmd))
@@ -1122,8 +1131,26 @@ static int commit_metadata(struct clone *clone)
goto out;
}
- r = dm_clone_metadata_commit(clone->cmd);
+ r = dm_clone_metadata_pre_commit(clone->cmd);
+ if (unlikely(r)) {
+ __metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
+ goto out;
+ }
+ bio_reset(&clone->flush_bio);
+ bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
+ clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+ r = submit_bio_wait(&clone->flush_bio);
+ if (unlikely(r)) {
+ __metadata_operation_failed(clone, "flush destination device", r);
+ goto out;
+ }
+
+ if (dest_dev_flushed)
+ *dest_dev_flushed = true;
+
+ r = dm_clone_metadata_commit(clone->cmd);
if (unlikely(r)) {
__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
goto out;
@@ -1194,6 +1221,7 @@ static void process_deferred_bios(struct clone *clone)
static void process_deferred_flush_bios(struct clone *clone)
{
struct bio *bio;
+ bool dest_dev_flushed;
struct bio_list bios = BIO_EMPTY_LIST;
struct bio_list bio_completions = BIO_EMPTY_LIST;
@@ -1213,7 +1241,7 @@ static void process_deferred_flush_bios(struct clone *clone)
!(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
return;
- if (commit_metadata(clone)) {
+ if (commit_metadata(clone, &dest_dev_flushed)) {
bio_list_merge(&bios, &bio_completions);
while ((bio = bio_list_pop(&bios)))
@@ -1227,8 +1255,17 @@ static void process_deferred_flush_bios(struct clone *clone)
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
- while ((bio = bio_list_pop(&bios)))
- generic_make_request(bio);
+ while ((bio = bio_list_pop(&bios))) {
+ if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
+ /* We just flushed the destination device as part of
+ * the metadata commit, so there is no reason to send
+ * another flush.
+ */
+ bio_endio(bio);
+ } else {
+ generic_make_request(bio);
+ }
+ }
}
static void do_worker(struct work_struct *work)
@@ -1400,7 +1437,7 @@ static void clone_status(struct dm_target *ti, status_type_t type,
/* Commit to ensure statistics aren't out-of-date */
if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
- (void) commit_metadata(clone);
+ (void) commit_metadata(clone, NULL);
r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
@@ -1834,6 +1871,7 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bio_list_init(&clone->deferred_flush_completions);
clone->hydration_offset = 0;
atomic_set(&clone->hydrations_in_flight, 0);
+ bio_init(&clone->flush_bio, NULL, 0);
clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
if (!clone->wq) {
@@ -1907,6 +1945,7 @@ static void clone_dtr(struct dm_target *ti)
struct clone *clone = ti->private;
mutex_destroy(&clone->commit_lock);
+ bio_uninit(&clone->flush_bio);
for (i = 0; i < clone->nr_ctr_args; i++)
kfree(clone->ctr_args[i]);
@@ -1961,7 +2000,7 @@ static void clone_postsuspend(struct dm_target *ti)
wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
flush_workqueue(clone->wq);
- (void) commit_metadata(clone);
+ (void) commit_metadata(clone, NULL);
}
static void clone_resume(struct dm_target *ti)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index dbcc1e41cd57..e0c32793c248 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -599,45 +599,10 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
return pgpath;
}
-static struct pgpath *__map_bio_fast(struct multipath *m, struct bio *bio)
-{
- struct pgpath *pgpath;
- unsigned long flags;
-
- /* Do we need to select a new pgpath? */
- /*
- * FIXME: currently only switching path if no path (due to failure, etc)
- * - which negates the point of using a path selector
- */
- pgpath = READ_ONCE(m->current_pgpath);
- if (!pgpath)
- pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
-
- if (!pgpath) {
- if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- /* Queue for the daemon to resubmit */
- spin_lock_irqsave(&m->lock, flags);
- bio_list_add(&m->queued_bios, bio);
- spin_unlock_irqrestore(&m->lock, flags);
- queue_work(kmultipathd, &m->process_queued_bios);
-
- return ERR_PTR(-EAGAIN);
- }
- return NULL;
- }
-
- return pgpath;
-}
-
static int __multipath_map_bio(struct multipath *m, struct bio *bio,
struct dm_mpath_io *mpio)
{
- struct pgpath *pgpath;
-
- if (!m->hw_handler_name)
- pgpath = __map_bio_fast(m, bio);
- else
- pgpath = __map_bio(m, bio);
+ struct pgpath *pgpath = __map_bio(m, bio);
if (IS_ERR(pgpath))
return DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 4c68a7b93d5e..b88d6d701f5b 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -189,6 +189,15 @@ struct dm_pool_metadata {
sector_t data_block_size;
/*
+ * Pre-commit callback.
+ *
+ * This allows the thin provisioning target to run a callback before
+ * the metadata are committed.
+ */
+ dm_pool_pre_commit_fn pre_commit_fn;
+ void *pre_commit_context;
+
+ /*
* We reserve a section of the metadata for commit overhead.
* All reported space does *not* include this.
*/
@@ -826,6 +835,14 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
if (unlikely(!pmd->in_service))
return 0;
+ if (pmd->pre_commit_fn) {
+ r = pmd->pre_commit_fn(pmd->pre_commit_context);
+ if (r < 0) {
+ DMERR("pre-commit callback failed");
+ return r;
+ }
+ }
+
r = __write_changed_details(pmd);
if (r < 0)
return r;
@@ -892,6 +909,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
pmd->in_service = false;
pmd->bdev = bdev;
pmd->data_block_size = data_block_size;
+ pmd->pre_commit_fn = NULL;
+ pmd->pre_commit_context = NULL;
r = __create_persistent_data_objects(pmd, format_device);
if (r) {
@@ -2044,6 +2063,16 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
return r;
}
+void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
+ dm_pool_pre_commit_fn fn,
+ void *context)
+{
+ pmd_write_lock_in_core(pmd);
+ pmd->pre_commit_fn = fn;
+ pmd->pre_commit_context = context;
+ pmd_write_unlock(pmd);
+}
+
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index f6be0d733c20..7ef56bd2a7e3 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -230,6 +230,13 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
*/
void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
+/* Pre-commit callback */
+typedef int (*dm_pool_pre_commit_fn)(void *context);
+
+void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
+ dm_pool_pre_commit_fn fn,
+ void *context);
+
/*----------------------------------------------------------------*/
#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 5a2c494cb552..57626c27a54b 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -328,6 +328,7 @@ struct pool_c {
dm_block_t low_water_blocks;
struct pool_features requested_pf; /* Features requested during table load */
struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
+ struct bio flush_bio;
};
/*
@@ -2383,8 +2384,16 @@ static void process_deferred_bios(struct pool *pool)
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
- while ((bio = bio_list_pop(&bios)))
- generic_make_request(bio);
+ while ((bio = bio_list_pop(&bios))) {
+ /*
+ * The data device was flushed as part of metadata commit,
+ * so complete redundant flushes immediately.
+ */
+ if (bio->bi_opf & REQ_PREFLUSH)
+ bio_endio(bio);
+ else
+ generic_make_request(bio);
+ }
}
static void do_worker(struct work_struct *ws)
@@ -3115,6 +3124,7 @@ static void pool_dtr(struct dm_target *ti)
__pool_dec(pt->pool);
dm_put_device(ti, pt->metadata_dev);
dm_put_device(ti, pt->data_dev);
+ bio_uninit(&pt->flush_bio);
kfree(pt);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3180,6 +3190,29 @@ static void metadata_low_callback(void *context)
dm_table_event(pool->ti->table);
}
+/*
+ * We need to flush the data device **before** committing the metadata.
+ *
+ * This ensures that the data blocks of any newly inserted mappings are
+ * properly written to non-volatile storage and won't be lost in case of a
+ * crash.
+ *
+ * Failure to do so can result in data corruption in the case of internal or
+ * external snapshots and in the case of newly provisioned blocks, when block
+ * zeroing is enabled.
+ */
+static int metadata_pre_commit_callback(void *context)
+{
+ struct pool_c *pt = context;
+ struct bio *flush_bio = &pt->flush_bio;
+
+ bio_reset(flush_bio);
+ bio_set_dev(flush_bio, pt->data_dev->bdev);
+ flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+ return submit_bio_wait(flush_bio);
+}
+
static sector_t get_dev_size(struct block_device *bdev)
{
return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
@@ -3348,6 +3381,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
pt->adjusted_pf = pt->requested_pf = pf;
+ bio_init(&pt->flush_bio, NULL, 0);
ti->num_flush_bios = 1;
/*
@@ -3374,6 +3408,10 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto out_flags_changed;
+ dm_pool_register_pre_commit_callback(pt->pool->pmd,
+ metadata_pre_commit_callback,
+ pt);
+
pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks);
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 21ea537bd55e..eff04fa23dfa 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -203,7 +203,13 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
struct btree_node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
- unsigned threshold = 2 * merge_threshold(left) + 1;
+ /*
+ * Ensure the number of entries in each child will be greater
+ * than or equal to (max_entries / 3 + 1), so no matter which
+ * child is used for removal, the number will still be not
+ * less than (max_entries / 3).
+ */
+ unsigned int threshold = 2 * (merge_threshold(left) + 1);
if (nr_left + nr_right < threshold) {
/*