Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Makefile | 1
-rw-r--r--  fs/btrfs/acl.c | 16
-rw-r--r--  fs/btrfs/backref.c | 10
-rw-r--r--  fs/btrfs/backref.h | 3
-rw-r--r--  fs/btrfs/block-group.c | 481
-rw-r--r--  fs/btrfs/block-group.h | 6
-rw-r--r--  fs/btrfs/btrfs_inode.h | 27
-rw-r--r--  fs/btrfs/check-integrity.c | 60
-rw-r--r--  fs/btrfs/compression.c | 171
-rw-r--r--  fs/btrfs/compression.h | 5
-rw-r--r--  fs/btrfs/ctree.c | 129
-rw-r--r--  fs/btrfs/ctree.h | 96
-rw-r--r--  fs/btrfs/delayed-inode.c | 227
-rw-r--r--  fs/btrfs/delayed-ref.c | 4
-rw-r--r--  fs/btrfs/dir-item.c | 76
-rw-r--r--  fs/btrfs/disk-io.c | 15
-rw-r--r--  fs/btrfs/extent-tree.c | 15
-rw-r--r--  fs/btrfs/extent_io.c | 318
-rw-r--r--  fs/btrfs/extent_io.h | 2
-rw-r--r--  fs/btrfs/file-item.c | 5
-rw-r--r--  fs/btrfs/file.c | 23
-rw-r--r--  fs/btrfs/free-space-cache.c | 26
-rw-r--r--  fs/btrfs/inode.c | 449
-rw-r--r--  fs/btrfs/ioctl.c | 188
-rw-r--r--  fs/btrfs/lzo.c | 236
-rw-r--r--  fs/btrfs/ordered-data.c | 7
-rw-r--r--  fs/btrfs/ordered-data.h | 5
-rw-r--r--  fs/btrfs/qgroup.c | 34
-rw-r--r--  fs/btrfs/qgroup.h | 2
-rw-r--r--  fs/btrfs/raid56.c | 18
-rw-r--r--  fs/btrfs/ref-verify.c | 10
-rw-r--r--  fs/btrfs/relocation.c | 306
-rw-r--r--  fs/btrfs/send.c | 35
-rw-r--r--  fs/btrfs/space-info.c | 98
-rw-r--r--  fs/btrfs/struct-funcs.c | 8
-rw-r--r--  fs/btrfs/subpage.c | 24
-rw-r--r--  fs/btrfs/subpage.h | 3
-rw-r--r--  fs/btrfs/super.c | 56
-rw-r--r--  fs/btrfs/sysfs.c | 108
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 30
-rw-r--r--  fs/btrfs/transaction.c | 15
-rw-r--r--  fs/btrfs/transaction.h | 9
-rw-r--r--  fs/btrfs/tree-checker.c | 38
-rw-r--r--  fs/btrfs/tree-log.c | 139
-rw-r--r--  fs/btrfs/verity.c | 811
-rw-r--r--  fs/btrfs/volumes.c | 510
-rw-r--r--  fs/btrfs/volumes.h | 34
-rw-r--r--  fs/btrfs/zlib.c | 54
-rw-r--r--  fs/btrfs/zoned.c | 34
-rw-r--r--  fs/btrfs/zoned.h | 1
-rw-r--r--  fs/btrfs/zstd.c | 39
51 files changed, 3362 insertions(+), 1655 deletions(-)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index cec88a66bd6c..3dcf9bcc2326 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -36,6 +36,7 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
+btrfs-$(CONFIG_FS_VERITY) += verity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index d95eb5c8cb37..0a0d0eccee4e 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -16,13 +16,16 @@
#include "btrfs_inode.h"
#include "xattr.h"
-struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
{
int size;
const char *name;
char *value = NULL;
struct posix_acl *acl;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -53,7 +56,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
}
static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct posix_acl *acl, int type)
+ struct user_namespace *mnt_userns,
+ struct inode *inode, struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
@@ -114,12 +118,12 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
umode_t old_mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
- ret = posix_acl_update_mode(&init_user_ns, inode,
+ ret = posix_acl_update_mode(mnt_userns, inode,
&inode->i_mode, &acl);
if (ret)
return ret;
}
- ret = __btrfs_set_acl(NULL, inode, acl, type);
+ ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
if (ret)
inode->i_mode = old_mode;
return ret;
@@ -140,14 +144,14 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
return ret;
if (default_acl) {
- ret = __btrfs_set_acl(trans, inode, default_acl,
+ ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
}
if (acl) {
if (!ret)
- ret = __btrfs_set_acl(trans, inode, acl,
+ ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
ACL_TYPE_ACCESS);
posix_acl_release(acl);
}
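The new bool rcu argument follows the VFS change that lets ->get_acl() be called during RCU path walk: in that mode the callback must not sleep, and a filesystem that has to read an xattr (as btrfs does) declines so the lookup falls back to ref-walk. A minimal sketch of that contract (the function name is illustrative, not btrfs code):

    struct posix_acl *example_get_acl(struct inode *inode, int type, bool rcu)
    {
            if (rcu)
                    return ERR_PTR(-ECHILD);        /* retry in ref-walk mode */

            /* Slow path: read the ACL xattr; this may sleep. */
            return NULL;                            /* no ACL present (illustrative) */
    }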
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7a8a2fc19533..f735b8798ba1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1211,7 +1211,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
again:
head = NULL;
- ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
BUG_ON(ret == 0);
@@ -1488,15 +1488,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots,
- bool ignore_offset)
+ bool skip_commit_root_sem)
{
int ret;
- if (!trans)
+ if (!trans && !skip_commit_root_sem)
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
- time_seq, roots, ignore_offset);
- if (!trans)
+ time_seq, roots, false);
+ if (!trans && !skip_commit_root_sem)
up_read(&fs_info->commit_root_sem);
return ret;
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 17abde7f794c..ba454032dbe2 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -47,7 +47,8 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots, bool ignore_offset);
+ u64 time_seq, struct ulist **roots,
+ bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
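The renamed last argument changes the locking contract rather than the lookup: a caller that passes a NULL transaction handle but already holds fs_info->commit_root_sem asks btrfs_find_all_roots() not to take the semaphore again. A hedged sketch of such a call site, assuming fs_info, bytenr and the surrounding locking come from the caller's context:

    struct ulist *roots = NULL;
    int ret;

    /* commit_root_sem is already held by this code path, no trans handle. */
    ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &roots,
                               true /* skip_commit_root_sem */);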
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 38b127b9edfc..a3b830b8410a 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1498,9 +1498,18 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
return;
- mutex_lock(&fs_info->reclaim_bgs_lock);
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip reclaim if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
+ btrfs_exclop_finish(fs_info);
+ return;
+ }
+
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->reclaim_bgs)) {
+ u64 zone_unusable;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1534,16 +1543,25 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
+ /*
+ * Cache the zone_unusable value before turning the block group
+ * read-only. As soon as the block group is read-only, its
+ * zone_unusable value gets moved to the block group's read-only
+ * bytes and isn't available for calculations anymore.
+ */
+ zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
goto next;
- btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
- bg->start, div_u64(bg->used * 100, bg->length));
+ btrfs_info(fs_info,
+ "reclaiming chunk %llu with %llu%% used %llu%% unusable",
+ bg->start, div_u64(bg->used * 100, bg->length),
+ div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
- if (ret)
+ if (ret && ret != -EAGAIN)
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
@@ -2087,11 +2105,22 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
bg->used = em->len;
bg->flags = map->type;
ret = btrfs_add_block_group_cache(fs_info, bg);
+ /*
+ * We may have some valid block group cache added already, in
+ * that case we skip to the next one.
+ */
+ if (ret == -EEXIST) {
+ ret = 0;
+ btrfs_put_block_group(bg);
+ continue;
+ }
+
if (ret) {
btrfs_remove_free_space_cache(bg);
btrfs_put_block_group(bg);
break;
}
+
btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
0, 0, &space_info);
bg->space_info = space_info;
@@ -2194,9 +2223,24 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
ret = check_chunk_block_group_mappings(info);
error:
btrfs_free_path(path);
+ /*
+ * We've hit some error while reading the extent tree, and the
+ * rescue=ibadroots mount option is set.
+ * Try to fill the tree using dummy block groups so that the user can
+ * continue to mount and grab their data.
+ */
+ if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
+ ret = fill_dummy_bgs(info);
return ret;
}
+/*
+ * This function, insert_block_group_item(), belongs to the phase 2 of chunk
+ * allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
static int insert_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
@@ -2219,15 +2263,108 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}
+static int insert_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 chunk_offset,
+ u64 start, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_path *path;
+ struct btrfs_dev_extent *extent;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = device->devid;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
+ btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_objectid(leaf, extent,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+ btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * This function belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+static int insert_dev_extents(struct btrfs_trans_handle *trans,
+ u64 chunk_offset, u64 chunk_size)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_device *device;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 dev_offset;
+ u64 stripe_size;
+ int i;
+ int ret = 0;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ stripe_size = em->orig_block_len;
+
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * resulting in persisting a device extent item with such ID.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
+
+ ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
+ stripe_size);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ free_extent_map(em);
+ return ret;
+}
+
+/*
+ * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
+ * chunk allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *block_group;
int ret = 0;
- if (!trans->can_flush_pending_bgs)
- return;
-
while (!list_empty(&trans->new_bgs)) {
int index;
@@ -2242,8 +2379,15 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
- ret = btrfs_finish_chunk_alloc(trans, block_group->start,
- block_group->length);
+ if (!block_group->chunk_item_inserted) {
+ mutex_lock(&fs_info->chunk_mutex);
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
+ ret = insert_dev_extents(trans, block_group->start,
+ block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, block_group);
@@ -2265,8 +2409,9 @@ next:
btrfs_trans_release_chunk_metadata(trans);
}
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
- u64 type, u64 chunk_offset, u64 size)
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ u64 bytes_used, u64 type,
+ u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
@@ -2276,7 +2421,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
if (!cache)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
cache->length = size;
set_free_space_tree_thresholds(cache);
@@ -2290,7 +2435,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
ret = btrfs_load_block_group_zone_info(cache, true);
if (ret) {
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
ret = exclude_super_stripes(cache);
@@ -2298,7 +2443,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
/* We may have excluded something, so call this just in case */
btrfs_free_excluded_extents(cache);
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
add_new_free_space(cache, chunk_offset, chunk_offset + size);
@@ -2325,7 +2470,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
- return ret;
+ return ERR_PTR(ret);
}
/*
@@ -2344,7 +2489,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
btrfs_update_delayed_refs_rsv(trans);
set_avail_alloc_bits(fs_info, type);
- return 0;
+ return cache;
}
/*
@@ -3222,11 +3367,203 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}
+static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+{
+ struct btrfs_block_group *bg;
+ int ret;
+
+ /*
+ * Check if we have enough space in the system space info because we
+ * will need to update device items in the chunk btree and insert a new
+ * chunk item in the chunk btree as well. This will allocate a new
+ * system block group if needed.
+ */
+ check_system_chunk(trans, flags);
+
+ bg = btrfs_alloc_chunk(trans, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ goto out;
+ }
+
+ /*
+ * If this is a system chunk allocation then stop right here and do not
+ * add the chunk item to the chunk btree. This is to prevent a deadlock
+ * because this system chunk allocation can be triggered while COWing
+ * some extent buffer of the chunk btree and while holding a lock on a
+ * parent extent buffer, in which case attempting to insert the chunk
+ * item (or update the device item) would result in a deadlock on that
+ * parent extent buffer. In this case defer the chunk btree updates to
+ * the second phase of chunk allocation and keep our reservation until
+ * the second phase completes.
+ *
+ * This is a rare case and can only be triggered by the very few cases
+ * we have where we need to touch the chunk btree outside chunk allocation
+ * and chunk removal. These cases are basically adding a device, removing
+ * a device or resizing a device.
+ */
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return 0;
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ /*
+ * Normally we are not expected to fail with -ENOSPC here, since we have
+ * previously reserved space in the system space_info and allocated one
+ * new system chunk if necessary. However there are two exceptions:
+ *
+ * 1) We may have enough free space in the system space_info but all the
+ * existing system block groups have a profile which can not be used
+ * for extent allocation.
+ *
+ * This happens when mounting in degraded mode. For example we have a
+ * RAID1 filesystem with 2 devices, lose one device and mount the fs
+ * using the other device in degraded mode. If we then allocate a chunk,
+ * we may have enough free space in the existing system space_info, but
+ * none of the block groups can be used for extent allocation since they
+ * have a RAID1 profile, and because we are in degraded mode with a
+ * single device, we are forced to allocate a new system chunk with a
+ * SINGLE profile. Making check_system_chunk() iterate over all system
+ * block groups and check if they have a usable profile and enough space
+ * can be slow on very large filesystems, so we tolerate the -ENOSPC and
+ * try again after forcing allocation of a new system chunk. Like this
+ * we avoid paying the cost of that search in normal circumstances, when
+ * we were not mounted in degraded mode;
+ *
+ * 2) We had enough free space in the system space_info, and one suitable
+ * block group to allocate from when we called check_system_chunk()
+ * above. However right after we called it, the only system block group
+ * with enough free space got turned into RO mode by a running scrub,
+ * and in this case we have to allocate a new one and retry. We only
+ * need do this allocate and retry once, since we have a transaction
+ * handle and scrub uses the commit root to search for block groups.
+ */
+ if (ret == -ENOSPC) {
+ const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
+ struct btrfs_block_group *sys_bg;
+
+ sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ if (IS_ERR(sys_bg)) {
+ ret = PTR_ERR(sys_bg);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ } else if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+out:
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
+}
+
/*
- * If force is CHUNK_ALLOC_FORCE:
+ * Chunk allocation is done in 2 phases:
+ *
+ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
+ * the chunk, the chunk mapping, create its block group and add the items
+ * that belong in the chunk btree to it - more specifically, we need to
+ * update device items in the chunk btree and add a new chunk item to it.
+ *
+ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
+ * group item to the extent btree and the device extent items to the devices
+ * btree.
+ *
+ * This is done to prevent deadlocks. For example when COWing a node from the
+ * extent btree we are holding a write lock on the node's parent and if we
+ * trigger chunk allocation and attempted to insert the new block group item
+ * in the extent btree right way, we could deadlock because the path for the
+ * insertion can include that parent node. At first glance it seems impossible
+ * to trigger chunk allocation after starting a transaction since tasks should
+ * reserve enough transaction units (metadata space), however while that is true
+ * most of the time, chunk allocation may still be triggered for several reasons:
+ *
+ * 1) When reserving metadata, we check if there is enough free space in the
+ * metadata space_info and therefore don't trigger allocation of a new chunk.
+ * However later when the task actually tries to COW an extent buffer from
+ * the extent btree or from the device btree for example, it is forced to
+ * allocate a new block group (chunk) because the only one that had enough
+ * free space was just turned to RO mode by a running scrub for example (or
+ * device replace, block group reclaim thread, etc), so we can not use it
+ * for allocating an extent and end up being forced to allocate a new one;
+ *
+ * 2) Because we only check that the metadata space_info has enough free bytes,
+ * we end up not allocating a new metadata chunk in that case. However if
+ * the filesystem was mounted in degraded mode, none of the existing block
+ * groups might be suitable for extent allocation due to their incompatible
+ * profile (for e.g. mounting a 2 devices filesystem, where all block groups
+ * use a RAID1 profile, in degraded mode using a single device). In this case
+ * when the task attempts to COW some extent buffer of the extent btree for
+ * example, it will trigger allocation of a new metadata block group with a
+ * suitable profile (SINGLE profile in the example of the degraded mount of
+ * the RAID1 filesystem);
+ *
+ * 3) The task has reserved enough transaction units / metadata space, but when
+ * it attempts to COW an extent buffer from the extent or device btree for
+ * example, it does not find any free extent in any metadata block group,
+ * and is therefore forced to try to allocate a new metadata block group.
+ * This is because some other task allocated all available extents in the
+ * meanwhile - this typically happens with tasks that don't reserve space
+ * properly, either intentionally or as a bug. One example where this is
+ * done intentionally is fsync, as it does not reserve any transaction units
+ * and ends up allocating a variable number of metadata extents for log
+ * tree extent buffers.
+ *
+ * We also need this two-phase setup when adding a device to a filesystem with
+ * a seed device - we must create new metadata and system chunks without adding
+ * any of the block group items to the chunk, extent and device btrees. If we
+ * did not do it this way, we would get ENOSPC when attempting to update those
+ * btrees, since all the chunks from the seed device are read-only.
+ *
+ * Phase 1 does the updates and insertions to the chunk btree because if we had
+ * it done in phase 2 and have a thundering herd of tasks allocating chunks in
+ * parallel, we risk having too many system chunks allocated by many tasks if
+ * many tasks reach phase 1 without the previous ones completing phase 2. In the
+ * extreme case this leads to exhaustion of the system chunk array in the
+ * superblock. This is easier to trigger if using a btree node/leaf size of 64K
+ * and with RAID filesystems (so we have more device items in the chunk btree).
+ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
+ * the system chunk array due to concurrent allocations") provides more details.
+ *
+ * For allocation of system chunks, we defer the updates and insertions into the
+ * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
+ * if the chunk allocation is triggered while COWing an extent buffer of the
+ * chunk btree, we are holding a lock on the parent of that extent buffer and
+ * doing the chunk btree updates and insertions can require locking that parent.
+ * This is for the very few and rare cases where we update the chunk btree that
+ * are not chunk allocation or chunk removal: adding a device, removing a device
+ * or resizing a device.
+ *
+ * The reservation of system space, done through check_system_chunk(), as well
+ * as all the updates and insertions into the chunk btree must be done while
+ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
+ * an extent buffer from the chunks btree we never trigger allocation of a new
+ * system chunk, which would result in a deadlock (trying to lock twice an
+ * extent buffer of the chunk btree, first time before triggering the chunk
+ * allocation and the second time during chunk allocation while attempting to
+ * update the chunks btree). The system chunk array is also updated while holding
+ * that mutex. The same logic applies to removing chunks - we must reserve system
+ * space, update the chunk btree and the system chunk array in the superblock
+ * while holding fs_info->chunk_mutex.
+ *
+ * This function, btrfs_chunk_alloc(), belongs to phase 1.
+ *
+ * If @force is CHUNK_ALLOC_FORCE:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
- * If force is NOT CHUNK_ALLOC_FORCE:
+ * If @force is NOT CHUNK_ALLOC_FORCE:
* - return 0 if it doesn't need to allocate a new chunk,
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
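To make the two phases concrete, here is a hedged sketch of the call ordering as seen by a caller; the wrapper function is illustrative, only btrfs_chunk_alloc() and btrfs_create_pending_block_groups() are entry points touched by this patch:

    static int example_force_data_chunk(struct btrfs_trans_handle *trans)
    {
            int ret;

            /*
             * Phase 1: reserve system space, create the chunk mapping and the
             * block group, and (for non-SYSTEM chunks) insert the chunk item
             * plus the device item updates into the chunk btree.
             */
            ret = btrfs_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA,
                                    CHUNK_ALLOC_FORCE);
            if (ret < 0)
                    return ret;

            /*
             * Phase 2 runs later, from btrfs_create_pending_block_groups():
             * the block group item goes into the extent btree and the device
             * extent items into the device btree.
             */
            return 0;
    }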
@@ -3243,6 +3580,13 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
/* Don't re-enter if we're already allocating a chunk */
if (trans->allocating_chunk)
return -ENOSPC;
+ /*
+ * If we are removing a chunk, don't re-enter or we would deadlock.
+ * System space reservation and system chunk allocation is done by the
+ * chunk remove operation (btrfs_remove_chunk()).
+ */
+ if (trans->removing_chunk)
+ return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
ASSERT(space_info);
@@ -3306,13 +3650,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- /*
- * Check if we have enough space in SYSTEM chunk because we may need
- * to update devices.
- */
- check_system_chunk(trans, flags);
-
- ret = btrfs_alloc_chunk(trans, flags);
+ ret = do_chunk_alloc(trans, flags);
trans->allocating_chunk = false;
spin_lock(&space_info->lock);
@@ -3331,22 +3669,6 @@ out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
- /*
- * When we allocate a new chunk we reserve space in the chunk block
- * reserve to make sure we can COW nodes/leafs in the chunk tree or
- * add new nodes/leafs to it if we end up needing to do it when
- * inserting the chunk item and updating device items as part of the
- * second phase of chunk allocation, performed by
- * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
- * large number of new block groups to create in our transaction
- * handle's new_bgs list to avoid exhausting the chunk block reserve
- * in extreme cases - like having a single transaction create many new
- * block groups when starting to write out the free space caches of all
- * the block groups that were made dirty during the lifetime of the
- * transaction.
- */
- if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
- btrfs_create_pending_block_groups(trans);
return ret;
}
@@ -3367,7 +3689,6 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
*/
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
- struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
@@ -3382,7 +3703,6 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
lockdep_assert_held(&fs_info->chunk_mutex);
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
-again:
spin_lock(&info->lock);
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
@@ -3401,76 +3721,39 @@ again:
if (left < thresh) {
u64 flags = btrfs_system_alloc_profile(fs_info);
- u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
-
- /*
- * If there's not available space for the chunk tree (system
- * space) and there are other tasks that reserved space for
- * creating a new system block group, wait for them to complete
- * the creation of their system block group and release excess
- * reserved space. We do this because:
- *
- * *) We can end up allocating more system chunks than necessary
- * when there are multiple tasks that are concurrently
- * allocating block groups, which can lead to exhaustion of
- * the system array in the superblock;
- *
- * *) If we allocate extra and unnecessary system block groups,
- * despite being empty for a long time, and possibly forever,
- * they end not being added to the list of unused block groups
- * because that typically happens only when deallocating the
- * last extent from a block group - which never happens since
- * we never allocate from them in the first place. The few
- * exceptions are when mounting a filesystem or running scrub,
- * which add unused block groups to the list of unused block
- * groups, to be deleted by the cleaner kthread.
- * And even when they are added to the list of unused block
- * groups, it can take a long time until they get deleted,
- * since the cleaner kthread might be sleeping or busy with
- * other work (deleting subvolumes, running delayed iputs,
- * defrag scheduling, etc);
- *
- * This is rare in practice, but can happen when too many tasks
- * are allocating blocks groups in parallel (via fallocate())
- * and before the one that reserved space for a new system block
- * group finishes the block group creation and releases the space
- * reserved in excess (at btrfs_create_pending_block_groups()),
- * other tasks end up here and see free system space temporarily
- * not enough for updating the chunk tree.
- *
- * We unlock the chunk mutex before waiting for such tasks and
- * lock it again after the wait, otherwise we would deadlock.
- * It is safe to do so because allocating a system chunk is the
- * first thing done while allocating a new block group.
- */
- if (reserved > trans->chunk_bytes_reserved) {
- const u64 min_needed = reserved - thresh;
-
- mutex_unlock(&fs_info->chunk_mutex);
- wait_event(cur_trans->chunk_reserve_wait,
- atomic64_read(&cur_trans->chunk_bytes_reserved) <=
- min_needed);
- mutex_lock(&fs_info->chunk_mutex);
- goto again;
- }
+ struct btrfs_block_group *bg;
/*
* Ignore failure to create system chunk. We might end up not
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
+ *
+ * Also, if our caller is allocating a system chunk, do not
+ * attempt to insert the chunk item in the chunk btree, as we
+ * could deadlock on an extent buffer since our caller may be
+ * COWing an extent buffer from the chunk btree.
*/
- ret = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_alloc_chunk(trans, flags);
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ /*
+ * If we fail to add the chunk item here, we end up
+ * trying again at phase 2 of chunk allocation, at
+ * btrfs_create_pending_block_groups(). So ignore
+ * any error here.
+ */
+ btrfs_chunk_alloc_add_chunk_item(trans, bg);
+ }
}
if (!ret) {
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
thresh, BTRFS_RESERVE_NO_FLUSH);
- if (!ret) {
- atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
+ if (!ret)
trans->chunk_bytes_reserved += thresh;
- }
}
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 7b927425dc71..c72a71efcb18 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -97,6 +97,7 @@ struct btrfs_block_group {
unsigned int removed:1;
unsigned int to_copy:1;
unsigned int relocating_repair:1;
+ unsigned int chunk_item_inserted:1;
int disk_cache_state;
@@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
- u64 type, u64 chunk_offset, u64 size);
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+ u64 bytes_used, u64 type,
+ u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
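Since btrfs_make_block_group() now hands back the new block group (or an ERR_PTR) instead of an int, callers follow the usual kernel error-pointer convention. An illustrative caller, not taken from this patch, assuming trans, type, chunk_offset and size come from the surrounding code:

    struct btrfs_block_group *bg;

    bg = btrfs_make_block_group(trans, 0, type, chunk_offset, size);
    if (IS_ERR(bg))
            return PTR_ERR(bg);     /* e.g. -ENOMEM or a zone-info failure */

    /* The on-disk items for @bg are inserted later, in phase 2. */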
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c652e19ad74e..76ee1452c57b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,13 @@ enum {
* the file range, inode's io_tree).
*/
BTRFS_INODE_NO_DELALLOC_FLUSH,
+ /*
+ * Set when we are working on enabling verity for a file. Computing and
+ * writing the whole Merkle tree can take a while so we want to prevent
+ * races where two separate tasks attempt to simultaneously start verity
+ * on the same file.
+ */
+ BTRFS_INODE_VERITY_IN_PROGRESS,
};
/* in memory btrfs inode */
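The new runtime bit serves as a mutual-exclusion flag for the fs-verity enable path implemented by verity.c later in this diff. A sketch of how such a guard can look, assuming the bit lives in btrfs_inode::runtime_flags like the other flags in this enum (function names are illustrative):

    static int example_begin_enable_verity(struct btrfs_inode *inode)
    {
            /* Only one task may build the Merkle tree at a time. */
            if (test_and_set_bit(BTRFS_INODE_VERITY_IN_PROGRESS,
                                 &inode->runtime_flags))
                    return -EBUSY;
            return 0;
    }

    static void example_end_enable_verity(struct btrfs_inode *inode)
    {
            clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
    }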
@@ -189,8 +196,10 @@ struct btrfs_inode {
*/
u64 csum_bytes;
- /* flags field from the on disk inode */
+ /* Backwards incompatible flags, lower half of inode_item::flags */
u32 flags;
+ /* Read-only compatibility flags, upper half of inode_item::flags */
+ u32 ro_flags;
/*
* Counters to keep track of the number of extent item's we may use due
@@ -348,6 +357,22 @@ struct btrfs_dio_private {
u8 csums[];
};
+/*
+ * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
+ * separate u32s. These two functions convert between the two representations.
+ */
+static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
+{
+ return (flags | ((u64)ro_flags << 32));
+}
+
+static inline void btrfs_inode_split_flags(u64 inode_item_flags,
+ u32 *flags, u32 *ro_flags)
+{
+ *flags = (u32)inode_item_flags;
+ *ro_flags = (u32)(inode_item_flags >> 32);
+}
+
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
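A short round-trip of the two helpers above, assuming BTRFS_INODE_RO_VERITY is the read-only compat bit defined later in this series (the chosen flag values are illustrative):

    u32 flags = BTRFS_INODE_NODATACOW | BTRFS_INODE_NOATIME; /* lower 32 bits */
    u32 ro_flags = BTRFS_INODE_RO_VERITY;                    /* upper 32 bits */
    u32 f, ro;

    u64 on_disk = btrfs_inode_combine_flags(flags, ro_flags);
    btrfs_inode_split_flags(on_disk, &f, &ro);
    /* f == flags and ro == ro_flags again; on_disk is the value stored in
     * btrfs_inode_item::flags. */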
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 169508609324..86816088927f 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -243,47 +243,6 @@ struct btrfsic_state {
u32 datablock_size;
};
-static void btrfsic_block_init(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_alloc(void);
-static void btrfsic_block_free(struct btrfsic_block *b);
-static void btrfsic_block_link_init(struct btrfsic_block_link *n);
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
-static void btrfsic_block_link_free(struct btrfsic_block_link *n);
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
- struct btrfsic_block_hashtable *h);
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
- struct block_device *bdev,
- u64 dev_bytenr,
- struct btrfsic_block_hashtable *h);
-static void btrfsic_block_link_hashtable_init(
- struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_add(
- struct btrfsic_block_link *l,
- struct btrfsic_block_link_hashtable *h);
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
- struct block_device *bdev_ref_to,
- u64 dev_bytenr_ref_to,
- struct block_device *bdev_ref_from,
- u64 dev_bytenr_ref_from,
- struct btrfsic_block_link_hashtable *h);
-static void btrfsic_dev_state_hashtable_init(
- struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_add(
- struct btrfsic_dev_state *ds,
- struct btrfsic_dev_state_hashtable *h);
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
- struct btrfsic_dev_state_hashtable *h);
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
-static int btrfsic_process_superblock(struct btrfsic_state *state,
- struct btrfs_fs_devices *fs_devices);
static int btrfsic_process_metablock(struct btrfsic_state *state,
struct btrfsic_block *block,
struct btrfsic_block_data_ctx *block_ctx,
@@ -313,14 +272,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
-static void btrfsic_dump_database(struct btrfsic_state *state);
-static int btrfsic_test_for_metadata(struct btrfsic_state *state,
- char **datav, unsigned int num_pages);
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char **mapped_datav,
- unsigned int num_pages,
- struct bio *bio, int *bio_is_patched,
- int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
@@ -1558,10 +1509,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
/* Pages must be unmapped in reverse order */
while (num_pages > 0) {
num_pages--;
- if (block_ctx->datav[num_pages]) {
- kunmap_local(block_ctx->datav[num_pages]);
+ if (block_ctx->datav[num_pages])
block_ctx->datav[num_pages] = NULL;
- }
if (block_ctx->pagev[num_pages]) {
__free_page(block_ctx->pagev[num_pages]);
block_ctx->pagev[num_pages] = NULL;
@@ -1638,7 +1587,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
i = j;
}
for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
+ block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
return block_ctx->len;
}
@@ -2703,7 +2652,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
bio_for_each_segment(bvec, bio, iter) {
BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = kmap_local_page(bvec.bv_page);
+ mapped_datav[i] = page_address(bvec.bv_page);
i++;
if (dev_state->state->print_mask &
@@ -2716,9 +2665,6 @@ static void __btrfsic_submit_bio(struct bio *bio)
mapped_datav, segs,
bio, &bio_is_patched,
bio->bi_opf);
- /* Unmap in reverse order */
- for (--i; i >= 0; i--)
- kunmap_local(mapped_datav[i]);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9a023ae0f98b..7869ad12bc6e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -172,10 +172,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
/* Hash through the page sector by sector */
for (pg_offset = 0; pg_offset < bytes_left;
pg_offset += sectorsize) {
- kaddr = kmap_atomic(page);
+ kaddr = page_address(page);
crypto_shash_digest(shash, kaddr + pg_offset,
sectorsize, csum);
- kunmap_atomic(kaddr);
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
@@ -352,7 +351,7 @@ static void end_compressed_bio_write(struct bio *bio)
btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
- bio->bi_status == BLK_STS_OK);
+ !cb->errors);
end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
@@ -565,6 +564,16 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (isize == 0)
return 0;
+ /*
+ * For current subpage support, we only support 64K page size,
+ * which means maximum compressed extent size (128K) is just 2x page
+ * size.
+ * This makes readahead less effective, so disable readahead for
+ * subpage for now, until full compressed write is supported.
+ */
+ if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
+ return 0;
+
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (last_offset < compressed_end) {
@@ -673,6 +682,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct page *page;
struct bio *comp_bio;
u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+ u64 file_offset;
u64 em_len;
u64 em_start;
struct extent_map *em;
@@ -682,15 +692,17 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_tree = &BTRFS_I(inode)->extent_tree;
+ file_offset = bio_first_bvec_all(bio)->bv_offset +
+ page_offset(bio_first_page_all(bio));
+
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree,
- page_offset(bio_first_page_all(bio)),
- fs_info->sectorsize);
+ em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
if (!em)
return BLK_STS_IOERR;
+ ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
@@ -721,8 +733,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
goto fail1;
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
- __GFP_HIGHMEM);
+ cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
if (!cb->compressed_pages[pg_index]) {
faili = pg_index - 1;
ret = BLK_STS_RESOURCE;
@@ -1261,96 +1272,82 @@ void __cold btrfs_exit_compress(void)
}
/*
- * Copy uncompressed data from working buffer to pages.
+ * Copy decompressed data from working buffer to pages.
+ *
+ * @buf: The decompressed data buffer
+ * @buf_len: The decompressed data length
+ * @decompressed: Number of bytes that are already decompressed inside the
+ * compressed extent
+ * @cb: The compressed extent descriptor
+ * @orig_bio: The original bio that the caller wants to read for
+ *
+ * An easier to understand graph is like below:
+ *
+ *                 |<- orig_bio ->|     |<- orig_bio ->|
+ *     |<-------      full decompressed extent      ----->|
+ *     |<-----------    @cb range    ----------->|
+ *     |                       |<-- @buf_len -->|
+ *     |<--- @decompressed --->|
+ *
+ * Note that @cb can be a subpage of the full decompressed extent, but
+ * @cb->start always has the same value as the file offset of the full
+ * decompressed extent.
*
- * buf_start is the byte offset we're of the start of our workspace buffer.
+ * When reading a compressed extent, we have to read the full compressed
+ * extent, while @orig_bio may only want part of the range.
+ * Thus this function ensures that only the data covered by @orig_bio is
+ * copied to its pages.
*
- * total_out is the last byte of the buffer
+ * Return 0 if we have copied all needed contents for @orig_bio.
+ * Return >0 if we need to continue decompressing.
*/
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
- unsigned long total_out, u64 disk_start,
- struct bio *bio)
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+ struct compressed_bio *cb, u32 decompressed)
{
- unsigned long buf_offset;
- unsigned long current_buf_start;
- unsigned long start_byte;
- unsigned long prev_start_byte;
- unsigned long working_bytes = total_out - buf_start;
- unsigned long bytes;
- struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);
-
- /*
- * start byte is the first byte of the page we're currently
- * copying into relative to the start of the compressed data.
- */
- start_byte = page_offset(bvec.bv_page) - disk_start;
-
- /* we haven't yet hit data corresponding to this page */
- if (total_out <= start_byte)
- return 1;
-
- /*
- * the start of the data we care about is offset into
- * the middle of our working buffer
- */
- if (total_out > start_byte && buf_start < start_byte) {
- buf_offset = start_byte - buf_start;
- working_bytes -= buf_offset;
- } else {
- buf_offset = 0;
- }
- current_buf_start = buf_start;
-
- /* copy bytes from the working buffer into the pages */
- while (working_bytes > 0) {
- bytes = min_t(unsigned long, bvec.bv_len,
- PAGE_SIZE - (buf_offset % PAGE_SIZE));
- bytes = min(bytes, working_bytes);
-
- memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + buf_offset,
- bytes);
- flush_dcache_page(bvec.bv_page);
+ struct bio *orig_bio = cb->orig_bio;
+ /* Offset inside the full decompressed extent */
+ u32 cur_offset;
+
+ cur_offset = decompressed;
+ /* The main loop to do the copy */
+ while (cur_offset < decompressed + buf_len) {
+ struct bio_vec bvec;
+ size_t copy_len;
+ u32 copy_start;
+ /* Offset inside the full decompressed extent */
+ u32 bvec_offset;
+
+ bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
+ /*
+ * cb->start may underflow, but subtracting that value can still
+ * give us correct offset inside the full decompressed extent.
+ */
+ bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
- buf_offset += bytes;
- working_bytes -= bytes;
- current_buf_start += bytes;
+ /* Haven't reached the bvec range, exit */
+ if (decompressed + buf_len <= bvec_offset)
+ return 1;
- /* check if we need to pick another page */
- bio_advance(bio, bytes);
- if (!bio->bi_iter.bi_size)
- return 0;
- bvec = bio_iter_iovec(bio, bio->bi_iter);
- prev_start_byte = start_byte;
- start_byte = page_offset(bvec.bv_page) - disk_start;
+ copy_start = max(cur_offset, bvec_offset);
+ copy_len = min(bvec_offset + bvec.bv_len,
+ decompressed + buf_len) - copy_start;
+ ASSERT(copy_len);
/*
- * We need to make sure we're only adjusting
- * our offset into compression working buffer when
- * we're switching pages. Otherwise we can incorrectly
- * keep copying when we were actually done.
+ * Extra range check to ensure we didn't go beyond
+ * @buf + @buf_len.
*/
- if (start_byte != prev_start_byte) {
- /*
- * make sure our new page is covered by this
- * working buffer
- */
- if (total_out <= start_byte)
- return 1;
+ ASSERT(copy_start - decompressed < buf_len);
+ memcpy_to_page(bvec.bv_page, bvec.bv_offset,
+ buf + copy_start - decompressed, copy_len);
+ flush_dcache_page(bvec.bv_page);
+ cur_offset += copy_len;
- /*
- * the next page in the biovec might not be adjacent
- * to the last page, but it might still be found
- * inside this working buffer. bump our offset pointer
- */
- if (total_out > start_byte &&
- current_buf_start < start_byte) {
- buf_offset = start_byte - buf_start;
- working_bytes = total_out - start_byte;
- current_buf_start = buf_start + buf_offset;
- }
- }
+ bio_advance(orig_bio, copy_len);
+ /* Finished the bio */
+ if (!orig_bio->bi_iter.bi_size)
+ return 0;
}
-
return 1;
}
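Plugging illustrative numbers into the copy loop above may help: say the extent starts at cb->start = 16 MiB, the first 64 KiB has already been handled, @buf holds the next 64 KiB, and orig_bio's current bvec covers 4 KiB at file offset 16 MiB + 68 KiB (all values made up for the example):

    u32 decompressed = 64 * 1024;   /* already copied out of earlier buffers */
    u32 buf_len      = 64 * 1024;   /* bytes available in @buf */
    u32 bvec_offset  = 68 * 1024;   /* bvec start inside the decompressed extent */
    u32 bvec_len     = 4096;
    u32 cur_offset   = decompressed;

    u32 copy_start = max(cur_offset, bvec_offset);              /* 68 KiB */
    u32 copy_len   = min(bvec_offset + bvec_len,
                         decompressed + buf_len) - copy_start;  /* 4 KiB  */
    /* Source: buf + copy_start - decompressed = buf + 4 KiB;
     * destination: the bvec's page at bv_offset. */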
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index c359f20920d0..399be0b435bf 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -86,9 +86,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *total_out);
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
-int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
- unsigned long total_out, u64 disk_start,
- struct bio *bio);
+int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
+ struct compressed_bio *cb, u32 decompressed);
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int len, u64 disk_start,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bc3ca2cbd7d..84627cbd5b5b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return 0;
}
-static struct extent_buffer *alloc_tree_block_no_bg_flush(
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 parent_start,
- const struct btrfs_disk_key *disk_key,
- int level,
- u64 hint,
- u64 empty_size,
- enum btrfs_lock_nesting nest)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *ret;
-
- /*
- * If we are COWing a node/leaf from the extent, chunk, device or free
- * space trees, make sure that we do not finish block group creation of
- * pending block groups. We do this to avoid a deadlock.
- * COWing can result in allocation of a new chunk, and flushing pending
- * block groups (btrfs_create_pending_block_groups()) can be triggered
- * when finishing allocation of a new chunk. Creation of a pending block
- * group modifies the extent, chunk, device and free space trees,
- * therefore we could deadlock with ourselves since we are holding a
- * lock on an extent buffer that btrfs_create_pending_block_groups() may
- * try to COW later.
- * For similar reasons, we also need to delay flushing pending block
- * groups when splitting a leaf or node, from one of those trees, since
- * we are holding a write lock on it and its parent or when inserting a
- * new root node for one of those trees.
- */
- if (root == fs_info->extent_root ||
- root == fs_info->chunk_root ||
- root == fs_info->dev_root ||
- root == fs_info->free_space_root)
- trans->can_flush_pending_bgs = false;
-
- ret = btrfs_alloc_tree_block(trans, root, parent_start,
- root->root_key.objectid, disk_key, level,
- hint, empty_size, nest);
- trans->can_flush_pending_bgs = true;
-
- return ret;
-}
-
/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
parent_start = parent->start;
- cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size, nest);
+ cow = btrfs_alloc_tree_block(trans, root, parent_start,
+ root->root_key.objectid, &disk_key, level,
+ search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -768,21 +726,21 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
/*
* search for key in the extent_buffer. The items start at offset p,
- * and they are item_size apart. There are 'max' items in p.
+ * and they are item_size apart.
*
* the slot in the array is returned via slot, and it points to
* the place where you would insert key if it is not found in
* the array.
*
- * slot may point to max if the key is bigger than all of the keys
+ * Slot may point to the total number of items if the key is bigger than
+ * all of the keys.
*/
static noinline int generic_bin_search(struct extent_buffer *eb,
unsigned long p, int item_size,
- const struct btrfs_key *key,
- int max, int *slot)
+ const struct btrfs_key *key, int *slot)
{
int low = 0;
- int high = max;
+ int high = btrfs_header_nritems(eb);
int ret;
const int key_size = sizeof(struct btrfs_disk_key);
@@ -841,15 +799,11 @@ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
if (btrfs_header_level(eb) == 0)
return generic_bin_search(eb,
offsetof(struct btrfs_leaf, items),
- sizeof(struct btrfs_item),
- key, btrfs_header_nritems(eb),
- slot);
+ sizeof(struct btrfs_item), key, slot);
else
return generic_bin_search(eb,
offsetof(struct btrfs_node, ptrs),
- sizeof(struct btrfs_key_ptr),
- key, btrfs_header_nritems(eb),
- slot);
+ sizeof(struct btrfs_key_ptr), key, slot);
}
static void root_add_used(struct btrfs_root *root, u32 size)
@@ -1279,7 +1233,6 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
u64 target;
u64 nread = 0;
u64 nread_max;
- struct extent_buffer *eb;
u32 nr;
u32 blocksize;
u32 nscan = 0;
@@ -1308,10 +1261,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
search = btrfs_node_blockptr(node, slot);
blocksize = fs_info->nodesize;
- eb = find_extent_buffer(fs_info, search);
- if (eb) {
- free_extent_buffer(eb);
- return;
+ if (path->reada != READA_FORWARD_ALWAYS) {
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer(fs_info, search);
+ if (eb) {
+ free_extent_buffer(eb);
+ return;
+ }
}
target = search;
@@ -2145,6 +2102,27 @@ again:
}
/*
+ * Execute search and call btrfs_previous_item to traverse backwards if the item
+ * was not found.
+ *
+ * Return 0 if found, 1 if not found and < 0 on error.
+ */
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ int ret;
+
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret > 0)
+ ret = btrfs_previous_item(root, path, key->objectid, key->type);
+
+ if (ret == 0)
+ btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
+
+ return ret;
+}
+
+/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node points to 'key'.
* This is used after shifting pointers to the left, so it stops
@@ -2458,9 +2436,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(lower, &lower_key, 0);
- c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &lower_key, level, root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -2589,8 +2567,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
- split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0, BTRFS_NESTING_SPLIT);
+ split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, level, c->start, 0,
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3381,10 +3360,10 @@ again:
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
* use BTRFS_NESTING_NEW_ROOT.
*/
- right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0, num_doubles ?
- BTRFS_NESTING_NEW_ROOT :
- BTRFS_NESTING_SPLIT);
+ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+ &disk_key, 0, l->start, 0,
+ num_doubles ? BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4399,16 +4378,6 @@ next:
return 1;
}
-/*
- * search the tree again to find a leaf with greater keys
- * returns 0 if it found something or 1 if there are no greater leaves.
- * returns < 0 on io errors.
- */
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
-{
- return btrfs_next_old_leaf(root, path, 0);
-}
-
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq)
{
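For reference, the new btrfs_search_backwards() helper is typically used to find the last item with a given objectid/type at or before a key. The caller below is illustrative rather than one of the call sites converted elsewhere in this series, and assumes root, path and devid come from the surrounding code:

    struct btrfs_key key;
    int ret;

    key.objectid = devid;
    key.type = BTRFS_DEV_EXTENT_KEY;
    key.offset = (u64)-1;           /* start past the last possible offset */

    ret = btrfs_search_backwards(root, &key, path);
    if (ret < 0)
            goto out;               /* I/O or other error */
    if (ret > 0)
            goto out;               /* no item with this objectid/type */
    /* On success @key holds the key actually found and
     * path->nodes[0]/path->slots[0] point at the item. */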
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e5e53e592d4f..dff2c8a3e059 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -281,7 +281,8 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_COMPAT_RO_SUPP \
(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
- BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)
+ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+ BTRFS_FEATURE_COMPAT_RO_VERITY)
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
@@ -1012,8 +1013,6 @@ struct btrfs_fs_info {
u64 zoned;
};
- /* Max size to emit ZONE_APPEND write command */
- u64 max_zone_append_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
u64 treelog_bg;
@@ -1484,20 +1483,20 @@ do { \
/*
* Inode flags
*/
-#define BTRFS_INODE_NODATASUM (1 << 0)
-#define BTRFS_INODE_NODATACOW (1 << 1)
-#define BTRFS_INODE_READONLY (1 << 2)
-#define BTRFS_INODE_NOCOMPRESS (1 << 3)
-#define BTRFS_INODE_PREALLOC (1 << 4)
-#define BTRFS_INODE_SYNC (1 << 5)
-#define BTRFS_INODE_IMMUTABLE (1 << 6)
-#define BTRFS_INODE_APPEND (1 << 7)
-#define BTRFS_INODE_NODUMP (1 << 8)
-#define BTRFS_INODE_NOATIME (1 << 9)
-#define BTRFS_INODE_DIRSYNC (1 << 10)
-#define BTRFS_INODE_COMPRESS (1 << 11)
-
-#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
+#define BTRFS_INODE_NODATASUM (1U << 0)
+#define BTRFS_INODE_NODATACOW (1U << 1)
+#define BTRFS_INODE_READONLY (1U << 2)
+#define BTRFS_INODE_NOCOMPRESS (1U << 3)
+#define BTRFS_INODE_PREALLOC (1U << 4)
+#define BTRFS_INODE_SYNC (1U << 5)
+#define BTRFS_INODE_IMMUTABLE (1U << 6)
+#define BTRFS_INODE_APPEND (1U << 7)
+#define BTRFS_INODE_NODUMP (1U << 8)
+#define BTRFS_INODE_NOATIME (1U << 9)
+#define BTRFS_INODE_DIRSYNC (1U << 10)
+#define BTRFS_INODE_COMPRESS (1U << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31)
#define BTRFS_INODE_FLAG_MASK \
(BTRFS_INODE_NODATASUM | \
@@ -1514,6 +1513,10 @@ do { \
BTRFS_INODE_COMPRESS | \
BTRFS_INODE_ROOT_ITEM_INIT)
+#define BTRFS_INODE_RO_VERITY (1U << 0)
+
+#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY)
+
struct btrfs_map_token {
struct extent_buffer *eb;
char *kaddr;
@@ -2781,10 +2784,11 @@ enum btrfs_flush_state {
FLUSH_DELAYED_REFS = 4,
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
- ALLOC_CHUNK = 7,
- ALLOC_CHUNK_FORCE = 8,
- RUN_DELAYED_IPUTS = 9,
- COMMIT_TRANS = 10,
+ FLUSH_DELALLOC_FULL = 7,
+ ALLOC_CHUNK = 8,
+ ALLOC_CHUNK_FORCE = 9,
+ RUN_DELAYED_IPUTS = 10,
+ COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
@@ -2901,10 +2905,13 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
}
-int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq);
+
+int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
static inline int btrfs_next_old_item(struct btrfs_root *root,
struct btrfs_path *p, u64 time_seq)
{
@@ -2913,6 +2920,18 @@ static inline int btrfs_next_old_item(struct btrfs_root *root,
return btrfs_next_old_leaf(root, p, time_seq);
return 0;
}
+
+/*
+ * Search the tree again to find a leaf with greater keys.
+ *
+ * Returns 0 if it found something or 1 if there are no greater leaves.
+ * Returns < 0 on error.
+ */
+static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+ return btrfs_next_old_leaf(root, path, 0);
+}
+
static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
{
return btrfs_next_old_item(root, p, 0);
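[Note] With btrfs_next_leaf() now a static inline wrapper around btrfs_next_old_leaf(), callers keep the usual leaf-iteration pattern. A typical loop, sketched from the common in-tree usage rather than taken from this patch (declarations and cleanup trimmed):

	while (1) {
		struct extent_buffer *leaf = path->nodes[0];

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;		/* no more leaves */
			if (ret < 0)
				goto out;	/* I/O or other error */
			continue;		/* re-read path->nodes[0] */
		}
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		/* ... process the item at path->slots[0] ... */
		path->slots[0]++;
	}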
@@ -3145,7 +3164,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root);
+ struct btrfs_root *parent_root,
+ struct user_namespace *mnt_userns);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
@@ -3194,10 +3214,10 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
+int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
- u64 end, int uptodate);
+ u64 end, bool uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
extern const struct iomap_dio_ops btrfs_dio_ops;
@@ -3686,7 +3706,7 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
int btrfs_init_acl(struct btrfs_trans_handle *trans,
@@ -3779,6 +3799,30 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
return signal_pending(current);
}
+/* verity.c */
+#ifdef CONFIG_FS_VERITY
+
+extern const struct fsverity_operations btrfs_verityops;
+int btrfs_drop_verity_items(struct btrfs_inode *inode);
+
+BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
+ encryption, 8);
+BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
+ size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
+ struct btrfs_verity_descriptor_item, encryption, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
+ struct btrfs_verity_descriptor_item, size, 64);
+
+#else
+
+static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ return 0;
+}
+
+#endif
+
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);
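[Note] The verity block above follows the usual compile-out pattern: the item accessors and btrfs_drop_verity_items() exist only under CONFIG_FS_VERITY, while a static inline stub returning 0 keeps callers free of #ifdefs. A standalone illustration of that pattern, with a hypothetical feature name (plain C, compiles on its own):

	#include <stdio.h>

	/* #define CONFIG_MYFEATURE 1 */

	#ifdef CONFIG_MYFEATURE
	int myfeature_drop_items(int ino)
	{
		printf("dropping feature items for inode %d\n", ino);
		return 0;
	}
	#else
	/* Compiled-out stub: callers may call it unconditionally. */
	static inline int myfeature_drop_items(int ino)
	{
		return 0;
	}
	#endif

	int main(void)
	{
		return myfeature_drop_items(257);	/* caller looks the same either way */
	}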
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 257c1e18abd4..1e08eb2b27f0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,7 +6,6 @@
#include <linux/slab.h>
#include <linux/iversion.h>
-#include <linux/sched/mm.h>
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
@@ -672,176 +671,119 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
}
/*
- * This helper will insert some continuous items into the same leaf according
- * to the free space of the leaf.
+ * Insert a single delayed item or a batch of delayed items that have consecutive
+ * keys if they exist.
*/
-static int btrfs_batch_insert_items(struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_delayed_item *item)
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *first_item)
{
- struct btrfs_delayed_item *curr, *next;
- int free_space;
- int total_size = 0;
- struct extent_buffer *leaf;
- char *data_ptr;
- struct btrfs_key *keys;
- u32 *data_size;
- struct list_head head;
- int slot;
+ LIST_HEAD(batch);
+ struct btrfs_delayed_item *curr;
+ struct btrfs_delayed_item *next;
+ const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ int total_size;
int nitems;
- int i;
- int ret = 0;
-
- BUG_ON(!path->nodes[0]);
+ char *ins_data = NULL;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ int ret;
- leaf = path->nodes[0];
- free_space = btrfs_leaf_free_space(leaf);
- INIT_LIST_HEAD(&head);
+ list_add_tail(&first_item->tree_list, &batch);
+ nitems = 1;
+ total_size = first_item->data_len + sizeof(struct btrfs_item);
+ curr = first_item;
- next = item;
- nitems = 0;
+ while (true) {
+ int next_size;
- /*
- * count the number of the continuous items that we can insert in batch
- */
- while (total_size + next->data_len + sizeof(struct btrfs_item) <=
- free_space) {
- total_size += next->data_len + sizeof(struct btrfs_item);
- list_add_tail(&next->tree_list, &head);
- nitems++;
-
- curr = next;
next = __btrfs_next_delayed_item(curr);
- if (!next)
+ if (!next || !btrfs_is_continuous_delayed_item(curr, next))
break;
- if (!btrfs_is_continuous_delayed_item(curr, next))
+ next_size = next->data_len + sizeof(struct btrfs_item);
+ if (total_size + next_size > max_size)
break;
- }
- if (!nitems) {
- ret = 0;
- goto out;
+ list_add_tail(&next->tree_list, &batch);
+ nitems++;
+ total_size += next_size;
+ curr = next;
}
- keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
- if (!keys) {
- ret = -ENOMEM;
- goto out;
- }
+ if (nitems == 1) {
+ ins_keys = &first_item->key;
+ ins_sizes = &first_item->data_len;
+ } else {
+ int i = 0;
- data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
- if (!data_size) {
- ret = -ENOMEM;
- goto error;
+ ins_data = kmalloc(nitems * sizeof(u32) +
+ nitems * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
+ list_for_each_entry(curr, &batch, tree_list) {
+ ins_keys[i] = curr->key;
+ ins_sizes[i] = curr->data_len;
+ i++;
+ }
}
- /* get keys of all the delayed items */
- i = 0;
- list_for_each_entry(next, &head, tree_list) {
- keys[i] = next->key;
- data_size[i] = next->data_len;
- i++;
- }
+ ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
+ nitems);
+ if (ret)
+ goto out;
- /* insert the keys of the items */
- setup_items_for_insert(root, path, keys, data_size, nitems);
+ list_for_each_entry(curr, &batch, tree_list) {
+ char *data_ptr;
- /* insert the dir index items */
- slot = path->slots[0];
- list_for_each_entry_safe(curr, next, &head, tree_list) {
- data_ptr = btrfs_item_ptr(leaf, slot, char);
- write_extent_buffer(leaf, &curr->data,
- (unsigned long)data_ptr,
- curr->data_len);
- slot++;
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ path->slots[0]++;
+ }
- btrfs_delayed_item_release_metadata(root, curr);
+ /*
+ * Now release our path before releasing the delayed items and their
+ * metadata reservations, so that we don't block other tasks for more
+ * time than needed.
+ */
+ btrfs_release_path(path);
+ list_for_each_entry_safe(curr, next, &batch, tree_list) {
list_del(&curr->tree_list);
+ btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
}
-
-error:
- kfree(data_size);
- kfree(keys);
out:
+ kfree(ins_data);
return ret;
}
-/*
- * This helper can just do simple insertion that needn't extend item for new
- * data, such as directory name index insertion, inode insertion.
- */
-static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_delayed_item *delayed_item)
-{
- struct extent_buffer *leaf;
- unsigned int nofs_flag;
- char *ptr;
- int ret;
-
- nofs_flag = memalloc_nofs_save();
- ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
- delayed_item->data_len);
- memalloc_nofs_restore(nofs_flag);
- if (ret < 0 && ret != -EEXIST)
- return ret;
-
- leaf = path->nodes[0];
-
- ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-
- write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
- delayed_item->data_len);
- btrfs_mark_buffer_dirty(leaf);
-
- btrfs_delayed_item_release_metadata(root, delayed_item);
- return 0;
-}
-
-/*
- * we insert an item first, then if there are some continuous items, we try
- * to insert those items into the same leaf.
- */
static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
- struct btrfs_delayed_item *curr, *prev;
int ret = 0;
-do_again:
- mutex_lock(&node->mutex);
- curr = __btrfs_first_delayed_insertion_item(node);
- if (!curr)
- goto insert_end;
-
- ret = btrfs_insert_delayed_item(trans, root, path, curr);
- if (ret < 0) {
- btrfs_release_path(path);
- goto insert_end;
- }
+ while (ret == 0) {
+ struct btrfs_delayed_item *curr;
- prev = curr;
- curr = __btrfs_next_delayed_item(prev);
- if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
- /* insert the continuous items into the same leaf */
- path->slots[0]++;
- btrfs_batch_insert_items(root, path, curr);
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_insertion_item(node);
+ if (!curr) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+ ret = btrfs_insert_delayed_item(trans, root, path, curr);
+ mutex_unlock(&node->mutex);
}
- btrfs_release_delayed_item(prev);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
- goto do_again;
-
-insert_end:
- mutex_unlock(&node->mutex);
return ret;
}
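[Note] The rewritten insertion path builds one batch per btrfs_insert_empty_items() call: starting from the first pending item, it keeps pulling consecutive-key items while the accumulated data plus the per-item header overhead still fits in a leaf. A standalone sketch of that sizing rule (toy values, not kernel code; the constants stand in for BTRFS_LEAF_DATA_SIZE() and sizeof(struct btrfs_item)):

	#include <stdio.h>

	#define LEAF_DATA_SIZE	16283	/* stand-in for BTRFS_LEAF_DATA_SIZE() */
	#define ITEM_HDR_SIZE	25	/* stand-in for sizeof(struct btrfs_item) */

	int main(void)
	{
		int data_len[] = { 300, 4000, 8000, 4000, 120 };  /* consecutive keys */
		int nitems = 1;				/* the first item always goes in */
		int total = data_len[0] + ITEM_HDR_SIZE;

		for (int i = 1; i < 5; i++) {
			int next = data_len[i] + ITEM_HDR_SIZE;

			if (total + next > LEAF_DATA_SIZE)
				break;			/* would not fit in one leaf */
			total += next;
			nitems++;
		}
		printf("batched %d items, %d bytes of leaf space\n", nitems, total);
		return 0;
	}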
@@ -914,7 +856,6 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_delayed_node *node)
{
struct btrfs_delayed_item *curr, *prev;
- unsigned int nofs_flag;
int ret = 0;
do_again:
@@ -923,9 +864,7 @@ do_again:
if (!curr)
goto delete_fail;
- nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
- memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto delete_fail;
else if (ret > 0) {
@@ -994,7 +933,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- unsigned int nofs_flag;
int mod;
int ret;
@@ -1007,9 +945,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
else
mod = 1;
- nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
- memalloc_nofs_restore(nofs_flag);
if (ret > 0)
ret = -ENOENT;
if (ret < 0)
@@ -1066,9 +1002,7 @@ search:
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
- nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto err_out;
ASSERT(ret);
@@ -1711,6 +1645,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
struct inode *inode)
{
+ u64 flags;
+
btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
@@ -1723,7 +1659,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
inode_peek_iversion(inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
- btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_stack_inode_flags(inode_item, flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
@@ -1781,7 +1719,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
btrfs_stack_inode_sequence(inode_item));
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
- BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+ btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
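[Note] btrfs_inode_combine_flags() and btrfs_inode_split_flags(), used above in fill_stack_inode_item() and btrfs_fill_inode(), are defined in btrfs_inode.h and are not part of this diff. A minimal standalone sketch of what they are assumed to do here, namely packing the regular flags and the new read-only flags into the single on-disk field with ro_flags in the upper 32 bits:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t combine_flags(uint32_t flags, uint32_t ro_flags)
	{
		return (uint64_t)ro_flags << 32 | flags;
	}

	static void split_flags(uint64_t on_disk, uint32_t *flags, uint32_t *ro_flags)
	{
		*flags = (uint32_t)on_disk;
		*ro_flags = (uint32_t)(on_disk >> 32);
	}

	int main(void)
	{
		uint32_t flags, ro_flags;
		uint64_t on_disk = combine_flags(1U << 0 /* NODATASUM */,
						 1U << 0 /* RO_VERITY */);

		split_flags(on_disk, &flags, &ro_flags);
		printf("flags=%#x ro_flags=%#x\n", flags, ro_flags);
		return 0;
	}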
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 06bc842ecdb3..ca848b183474 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -974,7 +974,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
if (qrecord_inserted)
- btrfs_qgroup_trace_extent_post(fs_info, record);
+ btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
@@ -1069,7 +1069,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(fs_info, record);
+ return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 98b63ebed539..f1274d5c3805 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -170,6 +170,25 @@ out_free:
return 0;
}
+static struct btrfs_dir_item *btrfs_lookup_match_dir(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, const char *name,
+ int name_len, int mod)
+{
+ const int ins_len = (mod < 0 ? -1 : 0);
+ const int cow = (mod != 0);
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+
+ return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+}
+
/*
* lookup a directory item based on name. 'dir' is the objectid
* we're searching in, and 'mod' tells us if you plan on deleting the
@@ -181,23 +200,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
const char *name, int name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
-
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -211,7 +225,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int slot;
struct btrfs_path *path;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -220,20 +233,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
- /* return back any errors */
- if (ret < 0)
- goto out;
+ di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ /* Nothing found, we're safe */
+ if (ret == -ENOENT) {
+ ret = 0;
+ goto out;
+ }
- /* nothing found, we're safe */
- if (ret > 0) {
- ret = 0;
- goto out;
+ if (ret < 0)
+ goto out;
}
/* we found an item, look for our name in the item */
- di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
if (di) {
/* our exact name was found */
ret = -EEXIST;
@@ -274,21 +287,13 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
u64 objectid, const char *name, int name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = objectid;
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- return ERR_PTR(-ENOENT);
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
}
struct btrfs_dir_item *
@@ -345,21 +350,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
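[Note] The new btrfs_lookup_match_dir() helper reports "key not found" as ERR_PTR(-ENOENT); wrappers such as btrfs_lookup_dir_item() and btrfs_lookup_xattr() translate that back to NULL, while real errors stay as error pointers. A standalone sketch of that convention using toy ERR_PTR helpers rather than the kernel macros:

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	#define ERR_PTR(err)	((void *)(intptr_t)(err))
	#define PTR_ERR(ptr)	((int)(intptr_t)(ptr))
	#define IS_ERR(ptr)	((uintptr_t)(ptr) >= (uintptr_t)-4095)

	static void *lookup_match_dir(int search_ret)
	{
		if (search_ret < 0)
			return ERR_PTR(search_ret);	/* search failed */
		if (search_ret > 0)
			return ERR_PTR(-ENOENT);	/* key not in the tree */
		return (void *)0x1000;			/* pretend we matched an item */
	}

	int main(void)
	{
		void *di = lookup_match_dir(1);		/* simulate "not found" */

		if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
			di = NULL;			/* wrapper: absence is not an error */
		printf("di = %p\n", di);
		return 0;
	}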
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b117dd3b8172..2f9515dccce0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -209,7 +209,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
- const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
+ const int num_pages = num_extent_pages(buf);
const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
@@ -3392,11 +3392,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- /* For 4K sector size support, it's only read-only */
- if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
- if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
+ if (sectorsize != PAGE_SIZE) {
+ btrfs_warn(fs_info,
+ "read-write for sector size %u with page size %lu is experimental",
+ sectorsize, PAGE_SIZE);
+ }
+ if (sectorsize != PAGE_SIZE) {
+ if (btrfs_super_incompat_flags(fs_info->super_copy) &
+ BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
- "subpage sectorsize %u only supported read-only for page size %lu",
+ "RAID56 is not yet supported for sector size %u with page size %lu",
sectorsize, PAGE_SIZE);
err = -EINVAL;
goto fail_alloc;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d296483d148f..fc3da7585fb7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -153,7 +153,7 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out_free;
@@ -5950,9 +5950,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_block_group *cache = NULL;
struct btrfs_device *device;
- struct list_head *devices;
u64 group_trimmed;
u64 range_end = U64_MAX;
u64 start;
@@ -6016,9 +6016,12 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
btrfs_warn(fs_info,
"failed to trim %llu block group(s), last error %d",
bg_failed, bg_ret);
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ continue;
+
ret = btrfs_trim_free_extents(device, &group_trimmed);
if (ret) {
dev_failed++;
@@ -6028,7 +6031,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
trimmed += group_trimmed;
}
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
if (dev_failed)
btrfs_warn(fs_info,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9e81d25dea70..aaddd7225348 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -13,6 +13,7 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
+#include <linux/fsverity.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
@@ -172,6 +173,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num,
bio->bi_private = NULL;
+ /* Caller should ensure the bio has at least some range added */
+ ASSERT(bio->bi_iter.bi_size);
if (is_data_inode(tree->private_data))
ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
bio_flags);
@@ -2245,18 +2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
return bitset;
}
-/*
- * helper function to set a given page up to date if all the
- * extents in the tree for that page are up to date
- */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
-{
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
- SetPageUptodate(page);
-}
-
int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec)
@@ -2688,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
start + len <= page_offset(page) + PAGE_SIZE);
if (uptodate) {
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ if (fsverity_active(page->mapping->host) &&
+ !PageError(page) &&
+ !PageUptodate(page) &&
+ start < i_size_read(page->mapping->host) &&
+ !fsverity_verify_page(page)) {
+ btrfs_page_set_error(fs_info, page, start, len);
+ } else {
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ }
} else {
btrfs_page_clear_uptodate(fs_info, page, start, len);
btrfs_page_set_error(fs_info, page, start, len);
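[Note] The hunk above only hands a freshly read page to fs-verity when verification can actually apply: verity is enabled on the inode, the page has neither been verified nor failed already, and the range starts below i_size. The predicate restated in isolation (plain C, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	struct read_state {
		bool verity_active;	/* fsverity_active(inode) */
		bool page_error;	/* PageError(page) */
		bool page_uptodate;	/* PageUptodate(page) */
		unsigned long long start;
		unsigned long long i_size;
	};

	static bool needs_verity_check(const struct read_state *s)
	{
		return s->verity_active && !s->page_error && !s->page_uptodate &&
		       s->start < s->i_size;
	}

	int main(void)
	{
		struct read_state s = { true, false, false, 0, 4096 };

		printf("verify: %d\n", needs_verity_check(&s));
		return 0;
	}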
@@ -2779,7 +2778,7 @@ next:
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
struct btrfs_inode *inode;
- int uptodate = (err == 0);
+ const bool uptodate = (err == 0);
int ret = 0;
ASSERT(page && page->mapping);
@@ -2787,8 +2786,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
if (!uptodate) {
- ClearPageUptodate(page);
- SetPageError(page);
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 len;
+
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
+
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
ret = err < 0 ? err : -EIO;
mapping_set_error(page->mapping, ret);
}
@@ -3097,7 +3102,7 @@ readpage_ok:
/* Update page status and unlock */
end_page_read(page, uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
- start, end, uptodate);
+ start, end, PageUptodate(page));
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3153,11 +3158,13 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
return bio;
}
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
struct btrfs_io_bio *btrfs_bio;
+ ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
/* this will never fail when it's backed by a bioset */
bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
@@ -3181,20 +3188,22 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
* @size: portion of page that we want to write
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
- * @return: true if page was added, false otherwise
*
* Attempt to add a page to bio considering stripe alignment etc.
*
- * Return true if successfully page added. Otherwise, return false.
+ * Return >= 0 for the number of bytes added to the bio.
+ * Can return 0 if the current bio is already at stripe/zone boundary.
+ * Return <0 for error.
*/
-static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
- struct page *page,
- u64 disk_bytenr, unsigned int size,
- unsigned int pg_offset,
- unsigned long bio_flags)
+static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
+ struct page *page,
+ u64 disk_bytenr, unsigned int size,
+ unsigned int pg_offset,
+ unsigned long bio_flags)
{
struct bio *bio = bio_ctrl->bio;
u32 bio_size = bio->bi_iter.bi_size;
+ u32 real_size;
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
bool contig;
int ret;
@@ -3203,29 +3212,36 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
/* The limit should be calculated when bio_ctrl->bio is allocated */
ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
if (bio_ctrl->bio_flags != bio_flags)
- return false;
+ return 0;
if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
if (!contig)
- return false;
+ return 0;
- if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
- bio_size + size > bio_ctrl->len_to_stripe_boundary)
- return false;
+ real_size = min(bio_ctrl->len_to_oe_boundary,
+ bio_ctrl->len_to_stripe_boundary) - bio_size;
+ real_size = min(real_size, size);
+
+ /*
+	 * If real_size is 0, never call bio_add_*_page(): even when size is 0,
+	 * the bio will still execute its endio function on the page!
+ */
+ if (real_size == 0)
+ return 0;
if (bio_op(bio) == REQ_OP_ZONE_APPEND)
- ret = bio_add_zone_append_page(bio, page, size, pg_offset);
+ ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
else
- ret = bio_add_page(bio, page, size, pg_offset);
+ ret = bio_add_page(bio, page, real_size, pg_offset);
- return ret == size;
+ return ret;
}
static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_io_geometry geom;
@@ -3266,9 +3282,8 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
return 0;
}
- ASSERT(fs_info->max_zone_append_size > 0);
/* Ordered extent not yet created, so we're good */
- ordered = btrfs_lookup_ordered_extent(inode, logical);
+ ordered = btrfs_lookup_ordered_extent(inode, file_offset);
if (!ordered) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
return 0;
@@ -3280,6 +3295,62 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
return 0;
}
+static int alloc_new_bio(struct btrfs_inode *inode,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ struct writeback_control *wbc,
+ unsigned int opf,
+ bio_end_io_t end_io_func,
+ u64 disk_bytenr, u32 offset, u64 file_offset,
+ unsigned long bio_flags)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct bio *bio;
+ int ret;
+
+ /*
+ * For compressed page range, its disk_bytenr is always @disk_bytenr
+ * passed in, no matter if we have added any range into previous bio.
+ */
+ if (bio_flags & EXTENT_BIO_COMPRESSED)
+ bio = btrfs_bio_alloc(disk_bytenr);
+ else
+ bio = btrfs_bio_alloc(disk_bytenr + offset);
+ bio_ctrl->bio = bio;
+ bio_ctrl->bio_flags = bio_flags;
+ bio->bi_end_io = end_io_func;
+ bio->bi_private = &inode->io_tree;
+ bio->bi_write_hint = inode->vfs_inode.i_write_hint;
+ bio->bi_opf = opf;
+ ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
+ if (ret < 0)
+ goto error;
+ if (wbc) {
+ struct block_device *bdev;
+
+ bdev = fs_info->fs_devices->latest_bdev;
+ bio_set_dev(bio, bdev);
+ wbc_init_bio(wbc, bio);
+ }
+ if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct btrfs_device *device;
+
+ device = btrfs_zoned_get_device(fs_info, disk_bytenr,
+ fs_info->sectorsize);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ goto error;
+ }
+
+ btrfs_io_bio(bio)->device = device;
+ }
+ return 0;
+error:
+ bio_ctrl->bio = NULL;
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+ return ret;
+}
+
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
@@ -3305,61 +3376,67 @@ static int submit_extent_page(unsigned int opf,
bool force_bio_submit)
{
int ret = 0;
- struct bio *bio;
- size_t io_size = min_t(size_t, size, PAGE_SIZE);
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct extent_io_tree *tree = &inode->io_tree;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ unsigned int cur = pg_offset;
ASSERT(bio_ctrl);
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
- if (bio_ctrl->bio) {
- bio = bio_ctrl->bio;
- if (force_bio_submit ||
- !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
- pg_offset, bio_flags)) {
- ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
+ if (force_bio_submit && bio_ctrl->bio) {
+ ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
+ bio_ctrl->bio = NULL;
+ if (ret < 0)
+ return ret;
+ }
+
+ while (cur < pg_offset + size) {
+ u32 offset = cur - pg_offset;
+ int added;
+
+ /* Allocate new bio if needed */
+ if (!bio_ctrl->bio) {
+ ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
+ end_io_func, disk_bytenr, offset,
+ page_offset(page) + cur,
+ bio_flags);
+ if (ret < 0)
+ return ret;
+ }
+ /*
+ * We must go through btrfs_bio_add_page() to ensure each
+ * page range won't cross various boundaries.
+ */
+ if (bio_flags & EXTENT_BIO_COMPRESSED)
+ added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
+ size - offset, pg_offset + offset,
+ bio_flags);
+ else
+ added = btrfs_bio_add_page(bio_ctrl, page,
+ disk_bytenr + offset, size - offset,
+ pg_offset + offset, bio_flags);
+
+ /* Metadata page range should never be split */
+ if (!is_data_inode(&inode->vfs_inode))
+ ASSERT(added == 0 || added == size - offset);
+
+ /* At least we added some page, update the account */
+ if (wbc && added)
+ wbc_account_cgroup_owner(wbc, page, added);
+
+ /* We have reached boundary, submit right now */
+ if (added < size - offset) {
+ /* The bio should contain some page(s) */
+ ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+ ret = submit_one_bio(bio_ctrl->bio, mirror_num,
+ bio_ctrl->bio_flags);
bio_ctrl->bio = NULL;
if (ret < 0)
return ret;
- } else {
- if (wbc)
- wbc_account_cgroup_owner(wbc, page, io_size);
- return 0;
}
+ cur += added;
}
-
- bio = btrfs_bio_alloc(disk_bytenr);
- bio_add_page(bio, page, io_size, pg_offset);
- bio->bi_end_io = end_io_func;
- bio->bi_private = tree;
- bio->bi_write_hint = page->mapping->host->i_write_hint;
- bio->bi_opf = opf;
- if (wbc) {
- struct block_device *bdev;
-
- bdev = fs_info->fs_devices->latest_bdev;
- bio_set_dev(bio, bdev);
- wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, page, io_size);
- }
- if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct btrfs_device *device;
-
- device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
- if (IS_ERR(device))
- return PTR_ERR(device);
-
- btrfs_io_bio(bio)->device = device;
- }
-
- bio_ctrl->bio = bio;
- bio_ctrl->bio_flags = bio_flags;
- ret = calc_bio_boundaries(bio_ctrl, inode);
-
- return ret;
+ return 0;
}
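[Note] With btrfs_bio_add_page() now returning the number of bytes actually added, submit_extent_page() becomes a consume-and-submit loop: add what fits, and when a stripe or ordered-extent boundary truncates the add, submit the bio and start a new one for the remainder. The control flow in isolation (toy boundary, not kernel code):

	#include <stdio.h>

	#define BOUNDARY 64	/* stand-in for the stripe/ordered-extent boundary */

	static unsigned int bio_filled;

	static unsigned int add_to_bio(unsigned int size)
	{
		unsigned int room = BOUNDARY - bio_filled;
		unsigned int added = size < room ? size : room;

		bio_filled += added;
		return added;		/* may be 0 when already at the boundary */
	}

	int main(void)
	{
		unsigned int size = 150, cur = 0;

		while (cur < size) {
			unsigned int added = add_to_bio(size - cur);

			if (added < size - cur) {	/* hit a boundary: submit */
				printf("submit bio with %u bytes\n", bio_filled);
				bio_filled = 0;
			}
			cur += added;
		}
		if (bio_filled)
			printf("final bio holds %u bytes\n", bio_filled);
		return 0;
	}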
static int attach_extent_buffer_page(struct extent_buffer *eb,
@@ -3488,7 +3565,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
size_t pg_offset = 0;
size_t iosize;
size_t blocksize = inode->i_sb->s_blocksize;
- unsigned long this_bio_flag = 0;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ret = set_page_extent_mapped(page);
@@ -3519,6 +3595,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
begin_page_read(fs_info, page);
while (cur <= end) {
+ unsigned long this_bio_flag = 0;
bool force_bio_submit = false;
u64 disk_bytenr;
@@ -3627,7 +3704,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
/* the get_extent function already copied into the page */
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
- check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, true, cur, iosize);
cur = cur + iosize;
@@ -3722,14 +3798,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
delalloc_end, &page_started, nr_written, wbc);
if (ret) {
- SetPageError(page);
- /*
- * btrfs_run_delalloc_range should return < 0 for error
- * but just in case, we use > 0 here meaning the IO is
- * started, so we don't want to return > 0 unless
- * things are going well.
- */
- return ret < 0 ? ret : -EIO;
+ btrfs_page_set_error(inode->root->fs_info, page,
+ page_offset(page), PAGE_SIZE);
+ return ret;
}
/*
* delalloc_end is already one less than the total length, so
@@ -3829,9 +3900,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- u64 cur = start;
+ u64 cur = page_offset(page);
+ u64 end = cur + PAGE_SIZE - 1;
u64 extent_offset;
u64 block_start;
struct extent_map *em;
@@ -3841,7 +3911,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
- ret = btrfs_writepage_cow_fixup(page, start, end);
+ ret = btrfs_writepage_cow_fixup(page);
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
@@ -3865,7 +3935,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (cur >= i_size) {
btrfs_writepage_endio_finish_ordered(inode, page, cur,
- end, 1);
+ end, true);
+ /*
+ * This range is beyond i_size, thus we don't need to
+ * bother writing back.
+ * But we still need to clear the dirty subpage bit, or
+ * the next time the page gets dirtied, we will try to
+ * writeback the sectors with subpage dirty bits,
+ * causing writeback without ordered extent.
+ */
+ btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
break;
}
@@ -3915,7 +3994,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
nr++;
else
btrfs_writepage_endio_finish_ordered(inode,
- page, cur, cur + iosize - 1, 1);
+ page, cur, cur + iosize - 1, true);
+ btrfs_page_clear_dirty(fs_info, page, cur, iosize);
cur += iosize;
continue;
}
@@ -3951,6 +4031,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
cur += iosize;
nr++;
}
+ /*
+ * If we finish without problem, we should not only clear page dirty,
+ * but also empty subpage dirty bits
+ */
+ if (!ret)
+ btrfs_page_assert_not_dirty(fs_info, page);
*nr_ret = nr;
return ret;
}
@@ -3981,7 +4067,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
WARN_ON(!PageLocked(page));
- ClearPageError(page);
+ btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
+ page_offset(page), PAGE_SIZE);
pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
@@ -4022,10 +4109,39 @@ done:
set_page_writeback(page);
end_page_writeback(page);
}
- if (PageError(page)) {
- ret = ret < 0 ? ret : -EIO;
+ /*
+ * Here we used to have a check for PageError() and then set @ret and
+ * call end_extent_writepage().
+ *
+ * But in fact setting @ret here will cause different error paths
+ * between subpage and regular sectorsize.
+ *
+ * For regular page size, we never submit current page, but only add
+ * current page to current bio.
+ * The bio submission can only happen in next page.
+ * Thus if we hit the PageError() branch, @ret is already set to
+ * non-zero value and will not get updated for regular sectorsize.
+ *
+ * But for subpage case, it's possible we submit part of current page,
+ * thus can get PageError() set by submitted bio of the same page,
+ * while our @ret is still 0.
+ *
+ * So here we unify the behavior and don't set @ret.
+ * Error can still be properly passed to higher layer as page will
+ * be set error, here we just don't handle the IO failure.
+ *
+ * NOTE: This is just a hotfix for subpage.
+ * The root fix will be properly ending ordered extent when we hit
+ * an error during writeback.
+ *
+ * But that needs a bigger refactoring, as we not only need to grab the
+ * submitted OE, but also need to know exactly at which bytenr we hit
+ * the error.
+ * Currently the full page based __extent_writepage_io() is not
+ * capable of that.
+ */
+ if (PageError(page))
end_extent_writepage(page, ret, start, page_end);
- }
unlock_page(page);
ASSERT(ret <= 0);
return ret;
@@ -4984,7 +5100,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
- page, start, start + PAGE_SIZE - 1, 1);
+ page, start, start + PAGE_SIZE - 1, true);
unlock_page(page);
}
put_page(page);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 62027f551b44..53abdc280451 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -280,7 +280,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct bio *btrfs_bio_alloc(u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index df6631eefc65..2673c6ba7a4e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -233,7 +233,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 objectid,
u64 offset, int mod)
{
- int ret;
struct btrfs_key file_key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
@@ -241,8 +240,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
file_key.objectid = objectid;
file_key.offset = offset;
file_key.type = BTRFS_EXTENT_DATA_KEY;
- ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
- return ret;
+
+ return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
}
/*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ee34497500e1..7ff577005d0f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -16,6 +16,7 @@
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
+#include <linux/fsverity.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -1340,7 +1341,18 @@ static int prepare_uptodate_page(struct inode *inode,
unlock_page(page);
return -EIO;
}
- if (page->mapping != inode->i_mapping) {
+
+ /*
+ * Since btrfs_readpage() will unlock the page before it
+ * returns, there is a window where btrfs_releasepage() can be
+ * called to release the page. Here we check both inode
+ * mapping and PagePrivate() to make sure the page was not
+ * released.
+ *
+ * The private flag check is essential for subpage as we need
+ * to store extra bitmap using page->private.
+ */
+ if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
unlock_page(page);
return -EAGAIN;
}
@@ -3604,7 +3616,13 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
+ int ret;
+
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+
+ ret = fsverity_file_open(inode, filp);
+ if (ret)
+ return ret;
return generic_file_open(inode, filp);
}
@@ -3633,6 +3651,9 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ if (fsverity_active(inode))
+ return 0;
+
if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
return 0;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2131ae5b9ed7..da0eee7c9e5f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -344,19 +344,13 @@ fail:
static void readahead_cache(struct inode *inode)
{
- struct file_ra_state *ra;
+ struct file_ra_state ra;
unsigned long last_index;
- ra = kzalloc(sizeof(*ra), GFP_NOFS);
- if (!ra)
- return;
-
- file_ra_state_init(ra, inode->i_mapping);
+ file_ra_state_init(&ra, inode->i_mapping);
last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
-
- kfree(ra);
+ page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
@@ -2544,6 +2538,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
+ const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
spin_lock(&ctl->tree_lock);
if (!used)
@@ -2573,9 +2568,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
- } else if (block_group->zone_unusable >=
- div_factor_fine(block_group->length,
- fs_info->bg_reclaim_threshold)) {
+ } else if (bg_reclaim_threshold &&
+ block_group->zone_unusable >=
+ div_factor_fine(block_group->length, bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
@@ -2652,8 +2647,11 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group,
* btrfs_pin_extent_for_log_replay() when replaying the log.
* Advance the pointer not to overwrite the tree-log nodes.
*/
- if (block_group->alloc_offset < offset + bytes)
- block_group->alloc_offset = offset + bytes;
+ if (block_group->start + block_group->alloc_offset <
+ offset + bytes) {
+ block_group->alloc_offset =
+ offset + bytes - block_group->start;
+ }
return 0;
}
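[Note] The bg_reclaim_threshold hunk above snapshots the sysfs-tunable value once with READ_ONCE() so the zero check and the comparison see the same number, and a threshold of 0 now disables auto-reclaim entirely. The same idea in standalone form, with a relaxed atomic load as the closest userspace analogue and the percentage math written out (div_factor_fine() is assumed to compute length * threshold / 100):

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic int bg_reclaim_threshold = 75;	/* percent; 0 disables reclaim */

	static int should_reclaim(long long unusable, long long length)
	{
		int threshold = atomic_load_explicit(&bg_reclaim_threshold,
						     memory_order_relaxed);

		if (!threshold)
			return 0;			/* feature disabled */
		return unusable >= length * threshold / 100;
	}

	int main(void)
	{
		printf("%d\n", should_reclaim(80, 100));
		return 0;
	}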
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e6eb20987351..487533c35ddb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
+#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
@@ -286,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = page_address(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
i++;
ptr += cur_size;
@@ -490,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow,
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
+ /* Subpage doesn't support compression yet */
+ if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+ return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
@@ -629,7 +632,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -682,7 +685,11 @@ again:
}
}
cont:
- if (start == 0) {
+ /*
+ * Check cow_file_range() for why we don't even try to create inline
+ * extent for subpage case.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
@@ -973,7 +980,7 @@ retry:
p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, 0);
+ end, false);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1080,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (start == 0) {
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of page, that means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL);
@@ -1290,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
@@ -1303,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
@@ -1946,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
+ ASSERT(ret <= 0);
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
@@ -2271,13 +2289,127 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
+/*
+ * Split an extent_map at [start, start + len]
+ *
+ * This function is intended to be used only for extract_ordered_extent().
+ */
+static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
+ u64 pre, u64 post)
+{
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ struct extent_map *split_pre = NULL;
+ struct extent_map *split_mid = NULL;
+ struct extent_map *split_post = NULL;
+ int ret = 0;
+ unsigned long flags;
+
+ /* Sanity check */
+ if (pre == 0 && post == 0)
+ return 0;
+
+ split_pre = alloc_extent_map();
+ if (pre)
+ split_mid = alloc_extent_map();
+ if (post)
+ split_post = alloc_extent_map();
+ if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ASSERT(pre + post < len);
+
+ lock_extent(&inode->io_tree, start, start + len - 1);
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ASSERT(em->len == len);
+ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!list_empty(&em->list));
+
+ flags = em->flags;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	/* First, replace the em with a new extent_map starting from em->start */
+ split_pre->start = em->start;
+ split_pre->len = (pre ? pre : em->len - post);
+ split_pre->orig_start = split_pre->start;
+ split_pre->block_start = em->block_start;
+ split_pre->block_len = split_pre->len;
+ split_pre->orig_block_len = split_pre->block_len;
+ split_pre->ram_bytes = split_pre->len;
+ split_pre->flags = flags;
+ split_pre->compress_type = em->compress_type;
+ split_pre->generation = em->generation;
+
+ replace_extent_mapping(em_tree, em, split_pre, 1);
+
+ /*
+ * Now we only have an extent_map at:
+ * [em->start, em->start + pre] if pre != 0
+ * [em->start, em->start + em->len - post] if pre == 0
+ */
+
+ if (pre) {
+ /* Insert the middle extent_map */
+ split_mid->start = em->start + pre;
+ split_mid->len = em->len - pre - post;
+ split_mid->orig_start = split_mid->start;
+ split_mid->block_start = em->block_start + pre;
+ split_mid->block_len = split_mid->len;
+ split_mid->orig_block_len = split_mid->block_len;
+ split_mid->ram_bytes = split_mid->len;
+ split_mid->flags = flags;
+ split_mid->compress_type = em->compress_type;
+ split_mid->generation = em->generation;
+ add_extent_mapping(em_tree, split_mid, 1);
+ }
+
+ if (post) {
+ split_post->start = em->start + em->len - post;
+ split_post->len = post;
+ split_post->orig_start = split_post->start;
+ split_post->block_start = em->block_start + em->len - post;
+ split_post->block_len = split_post->len;
+ split_post->orig_block_len = split_post->block_len;
+ split_post->ram_bytes = split_post->len;
+ split_post->flags = flags;
+ split_post->compress_type = em->compress_type;
+ split_post->generation = em->generation;
+ add_extent_mapping(em_tree, split_post, 1);
+ }
+
+ /* Once for us */
+ free_extent_map(em);
+ /* Once for the tree */
+ free_extent_map(em);
+
+out_unlock:
+ write_unlock(&em_tree->lock);
+ unlock_extent(&inode->io_tree, start, start + len - 1);
+out:
+ free_extent_map(split_pre);
+ free_extent_map(split_mid);
+ free_extent_map(split_post);
+
+ return ret;
+}
+
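[Note] split_zoned_em() replaces one pinned mapping of length len with up to three: the first pre bytes, the middle, and the trailing post bytes (the leading and trailing pieces only when pre/post are non-zero). The resulting ranges, illustrated with toy values:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long start = 1ULL << 20;	/* toy file offsets */
		unsigned long long len = 256 * 1024;
		unsigned long long pre = 64 * 1024;
		unsigned long long post = 32 * 1024;

		if (pre)
			printf("pre:  [%llu, %llu)\n", start, start + pre);
		printf("mid:  [%llu, %llu)\n", start + pre, start + len - post);
		if (post)
			printf("post: [%llu, %llu)\n", start + len - post, start + len);
		return 0;
	}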
static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
struct bio *bio, loff_t file_offset)
{
struct btrfs_ordered_extent *ordered;
- struct extent_map *em = NULL, *em_new = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 file_len;
u64 len = bio->bi_iter.bi_size;
u64 end = start + len;
u64 ordered_end;
@@ -2317,41 +2449,16 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
goto out;
}
+ file_len = ordered->num_bytes;
pre = start - ordered->disk_bytenr;
post = ordered_end - end;
ret = btrfs_split_ordered_extent(ordered, pre, post);
if (ret)
goto out;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
- if (!em) {
- read_unlock(&em_tree->lock);
- ret = -EIO;
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
- /*
- * We cannot reuse em_new here but have to create a new one, as
- * unpin_extent_cache() expects the start of the extent map to be the
- * logical offset of the file, which does not hold true anymore after
- * splitting.
- */
- em_new = create_io_em(inode, em->start + pre, len,
- em->start + pre, em->block_start + pre, len,
- len, len, BTRFS_COMPRESS_NONE,
- BTRFS_ORDERED_REGULAR);
- if (IS_ERR(em_new)) {
- ret = PTR_ERR(em_new);
- goto out;
- }
- free_extent_map(em_new);
+ ret = split_zoned_em(inode, file_offset, file_len, pre, post);
out:
- free_extent_map(em);
btrfs_put_ordered_extent(ordered);
return errno_to_blk_status(ret);
@@ -2681,7 +2788,7 @@ out_page:
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2903,7 +3010,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (ordered_extent->disk)
+ if (ordered_extent->bdev)
btrfs_rewrite_logical_zoned(ordered_extent);
btrfs_free_io_failure_record(inode, start, end);
@@ -3082,7 +3189,7 @@ static void finish_ordered_fn(struct btrfs_work *work)
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
- u64 end, int uptodate)
+ u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
@@ -3168,25 +3275,44 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
return 0;
}
+ /*
+ * For subpage case, above PageChecked is not safe as it's not subpage
+ * compatible.
+ * But for now only cow fixup and compressed read utilize PageChecked
+ * flag, while in this context we can easily use io_bio->csum to
+ * determine if we really need to do csum verification.
+ *
+ * So for now, just exit if io_bio->csum is NULL, as it means it's
+ * compressed read, and its compressed data csum has already been
+ * verified.
+ */
+ if (io_bio->csum == NULL)
+ return 0;
+
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
if (!root->fs_info->csum_root)
return 0;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
- clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
- return 0;
- }
-
ASSERT(page_offset(page) <= start &&
end <= page_offset(page) + PAGE_SIZE - 1);
for (pg_off = offset_in_page(start);
pg_off < offset_in_page(end);
pg_off += sectorsize, bio_offset += sectorsize) {
+ u64 file_offset = pg_off + page_offset(page);
int ret;
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ test_range_bit(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, 1, NULL)) {
+ /* Skip the range without csum for data reloc inode */
+ clear_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ continue;
+ }
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
@@ -3431,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/*
* If we have an inode with links, there are a couple of
- * possibilities. Old kernels (before v3.12) used to create an
+ * possibilities:
+ *
+ * 1. We were halfway through creating fsverity metadata for the
+ * file. In that case, the orphan item represents incomplete
+ * fsverity metadata which must be cleaned up with
+ * btrfs_drop_verity_items and deleting the orphan item.
+	 *
+ * 2. Old kernels (before v3.12) used to create an
* orphan item for truncate indicating that there were possibly
* extent items past i_size that needed to be deleted. In v3.12,
* truncate was changed to update i_size in sync with the extent
@@ -3449,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* but either way, we can delete the orphan item.
*/
if (ret == -ENOENT || inode->i_nlink) {
- if (!ret)
+ if (!ret) {
+ ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
+ if (ret)
+ goto out;
+ }
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3639,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
rdev = btrfs_inode_rdev(leaf, inode_item);
BTRFS_I(inode)->index_cnt = (u64)-1;
- BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
cache_index:
/*
@@ -3770,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct inode *inode)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
@@ -3805,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
- btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
@@ -4999,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
int ret;
/*
- * Still need to make sure the inode looks like it's been updated so
- * that any holes get logged if we fsync.
+ * If NO_HOLES is enabled, we don't need to do anything.
+ * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+ * or btrfs_update_inode() will be called, which guarantees that the
+ * next fsync will know this inode was changed and needs to be logged.
*/
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- inode->last_trans = fs_info->generation;
- inode->last_sub_trans = root->log_transid;
- inode->last_log_commit = root->last_log_commit;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
- }
/*
* 1 - for the one we're dropping
@@ -5253,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
@@ -5264,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
}
if (attr->ia_valid) {
- setattr_copy(&init_user_ns, inode, attr);
+ setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(&init_user_ns, inode,
- inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
}
return err;
@@ -5433,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode)
trace_btrfs_inode_evict(inode);
if (!root) {
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
return;
}
@@ -5515,6 +5654,7 @@ no_delete:
* to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
}
@@ -6281,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
const char *name, int name_len,
u64 ref_objectid, u64 objectid,
@@ -6390,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (ret != 0)
goto fail_unlock;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = current_time(inode);
@@ -6575,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
@@ -6639,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
@@ -6784,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_fail;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
S_IFDIR | mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -8105,9 +8247,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return dip;
}
-static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
+ struct inode *inode = iter->inode;
const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
@@ -8117,13 +8260,13 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
u64 start_sector;
int async_submit = 0;
u64 submit_len;
- int clone_offset = 0;
- int clone_len;
+ u64 clone_offset = 0;
+ u64 clone_len;
u64 logical;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
- struct btrfs_dio_data *dio_data = iomap->private;
+ struct btrfs_dio_data *dio_data = iter->iomap.private;
struct extent_map *em = NULL;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
@@ -8166,9 +8309,9 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
status = errno_to_blk_status(ret);
goto out_err_em;
}
- ASSERT(geom.len <= INT_MAX);
- clone_len = min_t(int, submit_len, geom.len);
+ clone_len = min(submit_len, geom.len);
+ ASSERT(clone_len <= UINT_MAX);
/*
* This will never fail as it's passing GPF_NOFS and
@@ -8312,11 +8455,47 @@ static void btrfs_readahead(struct readahead_control *rac)
extent_readahead(rac);
}
+/*
+ * For releasepage() and invalidatepage() we have a race window where
+ * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause a
+ * use-after-free on the subpage spinlock. So this function waits until the
+ * subpage spinlock has been released.
+ */
+static void wait_subpage_spinlock(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * This may look insane as we just acquire the spinlock and release it,
+ * without doing anything. But we just want to make sure no one is
+ * still holding the subpage spinlock.
+ * And since the page is neither dirty nor under writeback, and we have
+ * the page locked, the only possible holder of the spinlock is the endio
+ * function clearing page writeback.
+ *
+ * Here we just acquire the spinlock to wait for any existing holder
+ * to exit, after which we're safe to release/invalidate the page.
+ */
+ spin_lock_irq(&subpage->lock);
+ spin_unlock_irq(&subpage->lock);
+}
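
The race that wait_subpage_spinlock() closes comes from the subpage endio path, which clears the writeback bits and calls end_page_writeback() while still holding subpage->lock. A rough sketch of that shape, assuming a hypothetical bitmap helper (the real endio helper is not part of this hunk):

	/* Assumed shape of the endio side that wait_subpage_spinlock() waits for */
	static void subpage_clear_writeback_sketch(struct page *page, u64 start, u32 len)
	{
		struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
		unsigned long flags;
		bool last;

		spin_lock_irqsave(&subpage->lock, flags);
		/* Hypothetical helper: clear the writeback bits for [start, start + len) */
		last = subpage_clear_and_test_writeback(subpage, start, len);
		if (last)
			end_page_writeback(page);
		/*
		 * Window: the page is no longer under writeback, so releasepage() or
		 * invalidatepage() could free page->private while the lock is still
		 * held, hence the spin-and-wait above.
		 */
		spin_unlock_irqrestore(&subpage->lock, flags);
	}
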
+
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
- if (ret == 1)
+
+ if (ret == 1) {
+ wait_subpage_spinlock(page);
clear_page_extent_mapped(page);
+ }
return ret;
}
@@ -8380,6 +8559,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
* do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
+ wait_subpage_spinlock(page);
/*
* For subpage case, we have call sites like
@@ -8468,7 +8648,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
spin_unlock_irq(&inode->ordered_tree.lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
- cur, range_end + 1 - cur, 1)) {
+ cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered);
/*
* The ordered extent has finished, now we're again
@@ -8849,7 +9029,8 @@ out:
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root)
+ struct btrfs_root *parent_root,
+ struct user_namespace *mnt_userns)
{
struct inode *inode;
int err;
@@ -8860,7 +9041,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
if (err < 0)
return err;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
+ inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
+ ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
@@ -8904,6 +9086,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->ro_flags = 0;
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
@@ -9085,6 +9268,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
+ u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
@@ -9097,13 +9281,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (bi_flags & BTRFS_INODE_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
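
With BTRFS_INODE_RO_VERITY now reported as STATX_ATTR_VERITY, userspace can detect verity-enabled files through statx(2). A minimal example, assuming a glibc and kernel headers new enough to provide the statx() wrapper and the STATX_ATTR_VERITY constant:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/stat.h>

	/* Returns 1 if the file has fs-verity enabled, 0 if not, -1 on error. */
	static int file_has_verity(const char *path)
	{
		struct statx stx;

		if (statx(AT_FDCWD, path, 0, STATX_BASIC_STATS, &stx) != 0)
			return -1;
		return !!(stx.stx_attributes & STATX_ATTR_VERITY);
	}
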
@@ -9137,8 +9323,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
bool dest_log_pinned = false;
bool need_abort = false;
- /* we only allow rename subvolume link between subvolumes */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ /*
+ * For non-subvolumes allow exchange only within one subvolume, in the
+ * same inode namespace. Two subvolumes (represented as directories) can
+ * be exchanged as they're logical links and have a fixed inode number.
+ */
+ if (root != dest &&
+ (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
/* close the race window with snapshot create/destroy ioctl */
@@ -9185,8 +9377,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9203,8 +9393,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
@@ -9235,6 +9423,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode), 1);
}
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ *
+ * We pin the logs even if at this precise moment none of the inodes was
+ * logged before. This is because right after we checked for that, some
+ * other task fsyncing some other inode not involved with this rename
+ * operation could log that one of our inodes exists.
+ *
+ * We don't need to pin the logs before the above calls to
+ * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ }
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ }
+
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9316,8 +9527,7 @@ out_fail:
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
+ btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
btrfs_set_log_full_commit(trans);
if (root_log_pinned) {
@@ -9341,6 +9551,7 @@ out_notrans:
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry)
{
@@ -9353,7 +9564,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- inode = btrfs_new_inode(trans, root, dir,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
dentry->d_name.name,
dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)),
@@ -9390,9 +9601,10 @@ out:
return ret;
}
-static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int btrfs_rename(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
@@ -9487,8 +9699,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9512,6 +9722,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
+ /*
+ * Now pin the log. We do it to ensure that no other task can
+ * sync the log while we are in progress with the rename, as
+ * that could result in an inconsistency in case any of the
+ * inodes that are part of this rename operation were logged
+ * before.
+ *
+ * We pin the log even if at this precise moment none of the
+ * inodes was logged before. This is because right after we
+ * checked for that, some other task fsyncing some other inode
+ * not involved with this rename operation could log that one of
+ * our inodes exists.
+ *
+ * We don't need to pin the logs before the above call to
+ * btrfs_insert_inode_ref(), since that does not need to change
+ * a log.
+ */
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
@@ -9565,8 +9794,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, old_dir,
- old_dentry);
+ ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
+ old_dir, old_dentry);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9616,7 +9845,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
new_dentry);
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
}
struct btrfs_delalloc_work {
@@ -9713,11 +9943,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = sync_inode(inode, wbc);
- if (!ret &&
- test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = sync_inode(inode, wbc);
+ ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
@@ -9852,9 +10078,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
- objectid, S_IFLNK|S_IRWXUGO, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
+ S_IFLNK | S_IRWXUGO, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
@@ -10178,7 +10405,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns,
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
- return generic_permission(&init_user_ns, inode, mask);
+ return generic_permission(mnt_userns, inode, mask);
}
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
@@ -10203,7 +10430,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (ret)
goto out;
- inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0ba98e08a029..41524f9aeac3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -27,6 +27,7 @@
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/fileattr.h>
+#include <linux/fsverity.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -103,9 +104,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl.
*/
-static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
{
unsigned int iflags = 0;
+ u32 flags = binode->flags;
+ u32 ro_flags = binode->ro_flags;
if (flags & BTRFS_INODE_SYNC)
iflags |= FS_SYNC_FL;
@@ -121,6 +124,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
iflags |= FS_DIRSYNC_FL;
if (flags & BTRFS_INODE_NODATACOW)
iflags |= FS_NOCOW_FL;
+ if (ro_flags & BTRFS_INODE_RO_VERITY)
+ iflags |= FS_VERITY_FL;
if (flags & BTRFS_INODE_NOCOMPRESS)
iflags |= FS_NOCOMP_FL;
@@ -148,10 +153,12 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (binode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
+ if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+ new_fl |= S_VERITY;
set_mask_bits(&inode->i_flags,
- S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
- new_fl);
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
+ S_VERITY, new_fl);
}
/*
@@ -200,7 +207,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
- fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags));
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
return 0;
}
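
Mapping BTRFS_INODE_RO_VERITY to FS_VERITY_FL means the flag is also visible through the classic FS_IOC_GETFLAGS interface used by lsattr(1). For example, using only definitions from <linux/fs.h>:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	/* Returns 1 if FS_VERITY_FL is set on the open file, 0 if not, -1 on error. */
	static int fd_has_verity_flag(int fd)
	{
		int flags;

		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0)
			return -1;
		return !!(flags & FS_VERITY_FL);
	}
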
@@ -224,7 +231,7 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
return -EOPNOTSUPP;
fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
- old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(binode);
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
return ret;
@@ -492,8 +499,8 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
return 1;
}
-static noinline int create_subvol(struct inode *dir,
- struct dentry *dentry,
+static noinline int create_subvol(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
const char *name, int namelen,
struct btrfs_qgroup_inherit *inherit)
{
@@ -638,7 +645,7 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
- ret = btrfs_create_subvol_root(trans, new_root, root);
+ ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
btrfs_put_root(new_root);
if (ret) {
/* We potentially lose an unused inode item here */
@@ -830,7 +837,8 @@ free_pending:
* nfs_async_unlink().
*/
-static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
+static int btrfs_may_delete(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *victim, int isdir)
{
int error;
@@ -840,12 +848,12 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
BUG_ON(d_inode(victim->d_parent) != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+ error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (check_sticky(&init_user_ns, dir, d_inode(victim)) ||
+ if (check_sticky(mnt_userns, dir, d_inode(victim)) ||
IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
IS_SWAPFILE(d_inode(victim)))
return -EPERM;
@@ -864,13 +872,16 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
}
/* copy of may_create in fs/namei.c() */
-static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+static inline int btrfs_may_create(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *child)
{
if (d_really_is_positive(child))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
+ if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
+ return -EOVERFLOW;
+ return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
}
/*
@@ -879,6 +890,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
* inside this filesystem so it's quite a bit simpler.
*/
static noinline int btrfs_mksubvol(const struct path *parent,
+ struct user_namespace *mnt_userns,
const char *name, int namelen,
struct btrfs_root *snap_src,
bool readonly,
@@ -893,12 +905,12 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (error == -EINTR)
return error;
- dentry = lookup_one_len(name, parent->dentry, namelen);
+ dentry = lookup_one(mnt_userns, name, parent->dentry, namelen);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_unlock;
- error = btrfs_may_create(dir, dentry);
+ error = btrfs_may_create(mnt_userns, dir, dentry);
if (error)
goto out_dput;
@@ -920,7 +932,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (snap_src)
error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(dir, dentry, name, namelen, inherit);
+ error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -934,6 +946,7 @@ out_unlock:
}
static noinline int btrfs_mksnapshot(const struct path *parent,
+ struct user_namespace *mnt_userns,
const char *name, int namelen,
struct btrfs_root *root,
bool readonly,
@@ -963,7 +976,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
- ret = btrfs_mksubvol(parent, name, namelen,
+ ret = btrfs_mksubvol(parent, mnt_userns, name, namelen,
root, readonly, inherit);
out:
if (snapshot_force_cow)
@@ -1792,6 +1805,7 @@ out_drop:
}
static noinline int __btrfs_ioctl_snap_create(struct file *file,
+ struct user_namespace *mnt_userns,
const char *name, unsigned long fd, int subvol,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
@@ -1819,8 +1833,8 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
}
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, name, namelen,
- NULL, readonly, inherit);
+ ret = btrfs_mksubvol(&file->f_path, mnt_userns, name,
+ namelen, NULL, readonly, inherit);
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
@@ -1834,16 +1848,17 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
ret = -EXDEV;
- } else if (!inode_owner_or_capable(&init_user_ns, src_inode)) {
+ } else if (!inode_owner_or_capable(mnt_userns, src_inode)) {
/*
* Subvolume creation is not restricted, but snapshots
* are limited to own subvolumes only
*/
ret = -EPERM;
} else {
- ret = btrfs_mksnapshot(&file->f_path, name, namelen,
- BTRFS_I(src_inode)->root,
- readonly, inherit);
+ ret = btrfs_mksnapshot(&file->f_path, mnt_userns,
+ name, namelen,
+ BTRFS_I(src_inode)->root,
+ readonly, inherit);
}
fdput(src);
}
@@ -1867,8 +1882,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
- subvol, false, NULL);
+ ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+ vol_args->name, vol_args->fd, subvol,
+ false, NULL);
kfree(vol_args);
return ret;
@@ -1926,8 +1942,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
}
}
- ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
- subvol, readonly, inherit);
+ ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
+ vol_args->name, vol_args->fd, subvol,
+ readonly, inherit);
if (ret)
goto free_inherit;
free_inherit:
@@ -1971,7 +1988,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
u64 flags;
int ret = 0;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(file_mnt_user_ns(file), inode))
return -EPERM;
ret = mnt_want_write_file(file);
@@ -2382,23 +2399,16 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
else if (ret > 0) {
- ret = btrfs_previous_item(root, path, dirid,
- BTRFS_INODE_REF_KEY);
- if (ret < 0)
- goto out;
- else if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
+ ret = -ENOENT;
+ goto out;
}
l = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(l, &key, slot);
iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(l, iref);
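
btrfs_search_backwards() is a new ctree helper introduced elsewhere in this series. Judging from the open-coded pattern it replaces here, it presumably wraps the search plus previous-item walk and hands back the key of the item it lands on, roughly as sketched below; this is inferred from the removed code, not the actual helper:

	/* Sketch inferred from the code being replaced, not the actual helper. */
	static int btrfs_search_backwards_sketch(struct btrfs_root *root,
						 struct btrfs_key *key,
						 struct btrfs_path *path)
	{
		int ret;

		ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
		if (ret > 0)
			ret = btrfs_previous_item(root, path, key->objectid, key->type);
		if (ret == 0)
			btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

		return ret;	/* <0 error, 0 found, >0 nothing before the key */
	}
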
@@ -2429,7 +2439,8 @@ out:
return ret;
}
-static int btrfs_search_path_in_tree_user(struct inode *inode,
+static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
+ struct inode *inode,
struct btrfs_ioctl_ino_lookup_user_args *args)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -2473,23 +2484,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
+ ret = btrfs_search_backwards(root, &key, path);
+ if (ret < 0)
+ goto out_put;
+ else if (ret > 0) {
+ ret = -ENOENT;
goto out_put;
- } else if (ret > 0) {
- ret = btrfs_previous_item(root, path, dirid,
- BTRFS_INODE_REF_KEY);
- if (ret < 0) {
- goto out_put;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto out_put;
- }
}
leaf = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(leaf, &key, slot);
iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(leaf, iref);
@@ -2527,7 +2531,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
ret = PTR_ERR(temp_inode);
goto out_put;
}
- ret = inode_permission(&init_user_ns, temp_inode,
+ ret = inode_permission(mnt_userns, temp_inode,
MAY_READ | MAY_EXEC);
iput(temp_inode);
if (ret) {
@@ -2669,7 +2673,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
return -EACCES;
}
- ret = btrfs_search_path_in_tree_user(inode, args);
+ ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args);
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
@@ -2905,6 +2909,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args = NULL;
struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+ struct user_namespace *mnt_userns = file_mnt_user_ns(file);
char *subvol_name, *subvol_name_ptr = NULL;
int subvol_namelen;
int err = 0;
@@ -2932,6 +2937,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out;
} else {
+ struct inode *old_dir;
+
if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out;
@@ -2968,6 +2975,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = PTR_ERR(parent);
goto out_drop_write;
}
+ old_dir = dir;
dir = d_inode(parent);
/*
@@ -2978,6 +2986,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
destroy_parent = true;
+ /*
+ * On idmapped mounts, deletion via subvolid is
+ * restricted to subvolumes that are immediate
+ * ancestors of the inode referenced by the file
+ * descriptor in the ioctl. Otherwise the idmapping
+ * could potentially be abused to delete subvolumes
+ * anywhere in the filesystem the user wouldn't be able
+ * to delete without an idmapped mount.
+ */
+ if (old_dir != dir && mnt_userns != &init_user_ns) {
+ err = -EOPNOTSUPP;
+ goto free_parent;
+ }
+
subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
fs_info, vol_args2->subvolid);
if (IS_ERR(subvol_name_ptr)) {
@@ -3016,7 +3038,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (err == -EINTR)
goto free_subvol_name;
- dentry = lookup_one_len(subvol_name, parent, subvol_namelen);
+ dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_unlock_dir;
@@ -3058,14 +3080,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (root == dest)
goto out_dput;
- err = inode_permission(&init_user_ns, inode,
- MAY_WRITE | MAY_EXEC);
+ err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
}
/* check if subvolume may be deleted by a user */
- err = btrfs_may_delete(dir, dentry, 1);
+ err = btrfs_may_delete(mnt_userns, dir, dentry, 1);
if (err)
goto out_dput;
@@ -3103,7 +3124,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ioctl_defrag_range_args *range;
+ struct btrfs_ioctl_defrag_range_args range = {0};
int ret;
ret = mnt_want_write_file(file);
@@ -3115,6 +3136,12 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
+ /* Subpage defrag will be supported in later commits */
+ if (root->fs_info->sectorsize < PAGE_SIZE) {
+ ret = -ENOTTY;
+ goto out;
+ }
+
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
@@ -3135,33 +3162,24 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
- range = kzalloc(sizeof(*range), GFP_KERNEL);
- if (!range) {
- ret = -ENOMEM;
- goto out;
- }
-
if (argp) {
- if (copy_from_user(range, argp,
- sizeof(*range))) {
+ if (copy_from_user(&range, argp, sizeof(range))) {
ret = -EFAULT;
- kfree(range);
goto out;
}
/* compression requires us to start the IO */
- if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
- range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
- range->extent_thresh = (u32)-1;
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
+ range.extent_thresh = (u32)-1;
}
} else {
/* the rest are all set to zero by kzalloc */
- range->len = (u64)-1;
+ range.len = (u64)-1;
}
ret = btrfs_defrag_file(file_inode(file), file,
- range, BTRFS_OLDEST_GENERATION, 0);
+ &range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
- kfree(range);
break;
default:
ret = -EINVAL;
@@ -4404,25 +4422,20 @@ drop_write:
static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_quota_rescan_args *qsa;
+ struct btrfs_ioctl_quota_rescan_args qsa = {0};
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
- if (!qsa)
- return -ENOMEM;
-
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- qsa->flags = 1;
- qsa->progress = fs_info->qgroup_rescan_progress.objectid;
+ qsa.flags = 1;
+ qsa.progress = fs_info->qgroup_rescan_progress.objectid;
}
- if (copy_to_user(arg, qsa, sizeof(*qsa)))
+ if (copy_to_user(arg, &qsa, sizeof(qsa)))
ret = -EFAULT;
- kfree(qsa);
return ret;
}
@@ -4436,6 +4449,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
}
static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct user_namespace *mnt_userns,
struct btrfs_ioctl_received_subvol_args *sa)
{
struct inode *inode = file_inode(file);
@@ -4447,7 +4461,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
int ret = 0;
int received_uuid_changed;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
ret = mnt_want_write_file(file);
@@ -4552,7 +4566,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
args64->rtime.nsec = args32->rtime.nsec;
args64->flags = args32->flags;
- ret = _btrfs_ioctl_set_received_subvol(file, args64);
+ ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64);
if (ret)
goto out;
@@ -4586,7 +4600,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
if (IS_ERR(sa))
return PTR_ERR(sa);
- ret = _btrfs_ioctl_set_received_subvol(file, sa);
+ ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa);
if (ret)
goto out;
@@ -5013,6 +5027,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_subvol_rootref(file, argp);
case BTRFS_IOC_INO_LOOKUP_USER:
return btrfs_ioctl_ino_lookup_user(file, argp);
+ case FS_IOC_ENABLE_VERITY:
+ return fsverity_ioctl_enable(file, (const void __user *)argp);
+ case FS_IOC_MEASURE_VERITY:
+ return fsverity_ioctl_measure(file, argp);
}
return -ENOTTY;
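
With FS_IOC_ENABLE_VERITY and FS_IOC_MEASURE_VERITY wired into btrfs_ioctl(), a file can be sealed and measured from userspace. A minimal sketch using only the UAPI definitions from <linux/fsverity.h>; note that the descriptor must be opened O_RDONLY and no writable descriptors may remain open when enabling:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/fsverity.h>

	/*
	 * Enable fs-verity with SHA-256 over 4K blocks on an O_RDONLY fd, then read
	 * back the file digest. Returns 0 on success, -1 with errno set on failure.
	 */
	static int enable_and_measure(int fd, unsigned char *digest, size_t digest_max)
	{
		struct fsverity_enable_arg arg = {
			.version = 1,
			.hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
			.block_size = 4096,
		};
		char buf[sizeof(struct fsverity_digest) + 64]
			__attribute__((aligned(8)));	/* 64 >= SHA-512 digest size */
		struct fsverity_digest *d = (struct fsverity_digest *)buf;

		if (ioctl(fd, FS_IOC_ENABLE_VERITY, &arg) != 0)
			return -1;

		d->digest_size = 64;
		if (ioctl(fd, FS_IOC_MEASURE_VERITY, d) != 0)
			return -1;

		memcpy(digest, d->digest, d->digest_size < digest_max ?
		       d->digest_size : digest_max);
		return 0;
	}
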
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cd042c7567a4..c25dfd1a8a54 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -14,6 +14,7 @@
#include <linux/lzo.h>
#include <linux/refcount.h>
#include "compression.h"
+#include "ctree.h"
#define LZO_LEN 4
@@ -140,18 +141,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = 0;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = page_address(in_page);
/*
* store the size of all chunks of compressed data in
* the first 4 bytes
*/
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
out_offset = LZO_LEN;
tot_out = LZO_LEN;
pages[0] = out_page;
@@ -209,19 +210,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
if (out_len == 0 && tot_in >= len)
break;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages++] = out_page;
pg_bytes_left = PAGE_SIZE;
@@ -243,12 +243,11 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
break;
bytes_left = len - tot_in;
- kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = page_address(in_page);
in_len = min(bytes_left, PAGE_SIZE);
}
@@ -258,164 +257,130 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* store the size of all chunks of compressed data */
- sizes_ptr = kmap_local_page(pages[0]);
+ sizes_ptr = page_address(pages[0]);
write_compress_length(sizes_ptr, tot_out);
- kunmap_local(sizes_ptr);
ret = 0;
*total_out = tot_out;
*total_in = tot_in;
out:
*out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
return ret;
}
+/*
+ * Copy the compressed segment payload into @dest.
+ *
+ * The payload contains no padding, we only need to handle page switching.
+ */
+static void copy_compressed_segment(struct compressed_bio *cb,
+ char *dest, u32 len, u32 *cur_in)
+{
+ u32 orig_in = *cur_in;
+
+ while (*cur_in < orig_in + len) {
+ struct page *cur_page;
+ u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
+ orig_in + len - *cur_in);
+
+ ASSERT(copy_len);
+ cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
+
+ memcpy(dest + *cur_in - orig_in,
+ page_address(cur_page) + offset_in_page(*cur_in),
+ copy_len);
+
+ *cur_in += copy_len;
+ }
+}
+
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- int ret = 0, ret2;
- char *data_in;
- unsigned long page_in_index = 0;
- size_t srclen = cb->compressed_len;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
- unsigned long buf_start;
- unsigned long buf_offset = 0;
- unsigned long bytes;
- unsigned long working_bytes;
- size_t in_len;
- size_t out_len;
- const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
- unsigned long in_offset;
- unsigned long in_page_bytes_left;
- unsigned long tot_in;
- unsigned long tot_out;
- unsigned long tot_len;
- char *buf;
- bool may_late_unmap, need_unmap;
- struct page **pages_in = cb->compressed_pages;
- u64 disk_start = cb->start;
- struct bio *orig_bio = cb->orig_bio;
+ const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ int ret;
+ /* Compressed data length, can be unaligned */
+ u32 len_in;
+ /* Offset inside the compressed data */
+ u32 cur_in = 0;
+ /* Bytes decompressed so far */
+ u32 cur_out = 0;
+
+ len_in = read_compress_length(page_address(cb->compressed_pages[0]));
+ cur_in += LZO_LEN;
- data_in = kmap(pages_in[0]);
- tot_len = read_compress_length(data_in);
/*
- * Compressed data header check.
+ * LZO header length check
*
- * The real compressed size can't exceed the maximum extent length, and
- * all pages should be used (whole unused page with just the segment
- * header is not possible). If this happens it means the compressed
- * extent is corrupted.
+ * The total length must not exceed the maximum extent length, and all
+ * sectors should be used.
+ * If either condition is violated, the compressed extent is corrupted.
*/
- if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
- tot_len < srclen - PAGE_SIZE) {
- ret = -EUCLEAN;
- goto done;
+ if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
+ round_up(len_in, sectorsize) < cb->compressed_len) {
+ btrfs_err(fs_info,
+ "invalid lzo header, lzo len %u compressed len %u",
+ len_in, cb->compressed_len);
+ return -EUCLEAN;
}
- tot_in = LZO_LEN;
- in_offset = LZO_LEN;
- in_page_bytes_left = PAGE_SIZE - LZO_LEN;
-
- tot_out = 0;
-
- while (tot_in < tot_len) {
- in_len = read_compress_length(data_in + in_offset);
- in_page_bytes_left -= LZO_LEN;
- in_offset += LZO_LEN;
- tot_in += LZO_LEN;
+ /* Go through each lzo segment */
+ while (cur_in < len_in) {
+ struct page *cur_page;
+ /* Length of the compressed segment */
+ u32 seg_len;
+ u32 sector_bytes_left;
+ size_t out_len = lzo1x_worst_compress(sectorsize);
/*
- * Segment header check.
- *
- * The segment length must not exceed the maximum LZO
- * compression size, nor the total compressed size.
+ * We should always have enough space for one segment header
+ * inside the current sector.
*/
- if (in_len > max_segment_len || tot_in + in_len > tot_len) {
- ret = -EUCLEAN;
- goto done;
- }
-
- tot_in += in_len;
- working_bytes = in_len;
- may_late_unmap = need_unmap = false;
-
- /* fast path: avoid using the working buffer */
- if (in_page_bytes_left >= in_len) {
- buf = data_in + in_offset;
- bytes = in_len;
- may_late_unmap = true;
- goto cont;
- }
-
- /* copy bytes from the pages into the working buffer */
- buf = workspace->cbuf;
- buf_offset = 0;
- while (working_bytes) {
- bytes = min(working_bytes, in_page_bytes_left);
-
- memcpy(buf + buf_offset, data_in + in_offset, bytes);
- buf_offset += bytes;
-cont:
- working_bytes -= bytes;
- in_page_bytes_left -= bytes;
- in_offset += bytes;
-
- /* check if we need to pick another page */
- if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
- || in_page_bytes_left == 0) {
- tot_in += in_page_bytes_left;
-
- if (working_bytes == 0 && tot_in >= tot_len)
- break;
-
- if (page_in_index + 1 >= total_pages_in) {
- ret = -EIO;
- goto done;
- }
-
- if (may_late_unmap)
- need_unmap = true;
- else
- kunmap(pages_in[page_in_index]);
-
- data_in = kmap(pages_in[++page_in_index]);
-
- in_page_bytes_left = PAGE_SIZE;
- in_offset = 0;
- }
- }
-
- out_len = max_segment_len;
- ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
- &out_len);
- if (need_unmap)
- kunmap(pages_in[page_in_index - 1]);
+ ASSERT(cur_in / sectorsize ==
+ (cur_in + LZO_LEN - 1) / sectorsize);
+ cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
+ ASSERT(cur_page);
+ seg_len = read_compress_length(page_address(cur_page) +
+ offset_in_page(cur_in));
+ cur_in += LZO_LEN;
+
+ /* Copy the compressed segment payload into workspace */
+ copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
+
+ /* Decompress the data */
+ ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
+ workspace->buf, &out_len);
if (ret != LZO_E_OK) {
- pr_warn("BTRFS: decompress failed\n");
+ btrfs_err(fs_info, "failed to decompress");
ret = -EIO;
- break;
+ goto out;
}
- buf_start = tot_out;
- tot_out += out_len;
+ /* Copy the data into inode pages */
+ ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out);
+ cur_out += out_len;
- ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
- tot_out, disk_start, orig_bio);
- if (ret2 == 0)
- break;
+ /* All data read, exit */
+ if (ret == 0)
+ goto out;
+ ret = 0;
+
+ /* Check if the sector has enough space for a segment header */
+ sector_bytes_left = sectorsize - (cur_in % sectorsize);
+ if (sector_bytes_left >= LZO_LEN)
+ continue;
+
+ /* Skip the padding zeros */
+ cur_in += sector_bytes_left;
}
-done:
- kunmap(pages_in[page_in_index]);
+out:
if (!ret)
- zero_fill_bio(orig_bio);
+ zero_fill_bio(cb->orig_bio);
return ret;
}
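
The rewritten lzo_decompress_bio() walks an on-disk layout of 4-byte little-endian length fields: the first LZO_LEN bytes hold the total compressed length, each segment is prefixed by its own length, and zero padding is inserted so a segment header never crosses a sector boundary. A standalone sketch of that walk over a flat buffer (assumes a little-endian host; the flat buffer stands in for the compressed_pages array):

	#include <stdint.h>
	#include <string.h>

	#define LZO_LEN 4	/* each length field is 4 bytes, little-endian */

	/* Illustrative only: count the segments in one compressed extent. */
	static uint32_t count_lzo_segments(const uint8_t *buf, uint32_t sectorsize)
	{
		uint32_t len_in, cur = LZO_LEN, nr_segs = 0;

		memcpy(&len_in, buf, LZO_LEN);	/* total compressed length */
		while (cur < len_in) {
			uint32_t seg_len, sector_bytes_left;

			memcpy(&seg_len, buf + cur, LZO_LEN);
			cur += LZO_LEN + seg_len;
			nr_segs++;

			/* Skip the padding zeros if no room for another header */
			sector_bytes_left = sectorsize - (cur % sectorsize);
			if (sector_bytes_left < LZO_LEN)
				cur += sector_bytes_left;
		}
		return nr_segs;
	}
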
@@ -466,7 +431,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes = min_t(unsigned long, destlen, out_len - start_byte);
- kaddr = kmap_local_page(dest_page);
+ kaddr = page_address(dest_page);
memcpy(kaddr, workspace->buf + start_byte, bytes);
/*
@@ -476,7 +441,6 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
*/
if (bytes < destlen)
memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6eb41b7c0c84..edb65abf0393 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -190,8 +190,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = ret;
entry->physical = (u64)-1;
- entry->disk = NULL;
- entry->partno = (u8)-1;
ASSERT(type == BTRFS_ORDERED_REGULAR ||
type == BTRFS_ORDERED_NOCOW ||
@@ -448,7 +446,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
* Will be also used to store the finished ordered extent.
* @file_offset: File offset for the finished IO
* @io_size: Length of the finish IO range
- * @uptodate: If the IO finishes without problem
*
* Return true if the ordered extent is finished in the range, and update
* @cached.
@@ -459,7 +456,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
*/
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate)
+ u64 file_offset, u64 io_size)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
@@ -488,8 +485,6 @@ have_entry:
entry->bytes_left, io_size);
entry->bytes_left -= io_size;
- if (!uptodate)
- set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
if (entry->bytes_left == 0) {
/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 566472004edd..4194e960ff61 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -145,8 +145,7 @@ struct btrfs_ordered_extent {
* command in a workqueue context
*/
u64 physical;
- struct gendisk *disk;
- u8 partno;
+ struct block_device *bdev;
};
/*
@@ -178,7 +177,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate);
+ u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 07ec06d4e972..db680f5be745 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1704,17 +1704,39 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct ulist *old_root;
u64 bytenr = qrecord->bytenr;
int ret;
- ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
+ /*
+ * We are always called in a context where we are already holding a
+ * transaction handle. Often we are called when adding a data delayed
+ * reference from btrfs_truncate_inode_items() (truncating or unlinking),
+ * in which case we will be holding a write lock on extent buffer from a
+ * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
+ * acquire fs_info->commit_root_sem, because that is a higher level lock
+ * that must be acquired before locking any extent buffers.
+ *
+ * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
+ * but we can't pass it a non-NULL transaction handle, because otherwise
+ * it would not use commit roots and would lock extent buffers, causing
+ * a deadlock if it ends up trying to read lock the same extent buffer
+ * that was previously write locked at btrfs_truncate_inode_items().
+ *
+ * So pass a NULL transaction handle to btrfs_find_all_roots() and
+ * explicitly tell it to not acquire the commit_root_sem - if we are
+ * holding a transaction handle we don't need its protection.
+ */
+ ASSERT(trans != NULL);
+
+ ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
+ true);
if (ret < 0) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_warn(fs_info,
+ trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
return 0;
@@ -1758,7 +1780,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
kfree(record);
return 0;
}
- return btrfs_qgroup_trace_extent_post(fs_info, record);
+ return btrfs_qgroup_trace_extent_post(trans, record);
}
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
@@ -2645,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
* current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
+ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) {
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 7283e4f549af..880e9df0dac1 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -298,7 +298,7 @@ int btrfs_qgroup_trace_extent_nolock(
* using current root, then we can move all expensive backref walk out of
* transaction committing, but not now as qgroup accounting will be wrong again.
*/
-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord);
/*
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 244d499ebc72..d8d268ca8aa7 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1035,7 +1035,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
for (i = 0; i < rbio->nr_pages; i++) {
if (rbio->stripe_pages[i])
continue;
- page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
rbio->stripe_pages[i] = page;
@@ -1054,7 +1054,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
for (; i < rbio->nr_pages; i++) {
if (rbio->stripe_pages[i])
continue;
- page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
rbio->stripe_pages[i] = page;
@@ -1636,10 +1636,10 @@ struct btrfs_plug_cb {
static int plug_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
- struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
- plug_list);
- struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
- plug_list);
+ const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+ plug_list);
+ const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+ plug_list);
u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
@@ -2300,7 +2300,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
if (rbio->stripe_pages[index])
continue;
- page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
rbio->stripe_pages[index] = page;
@@ -2350,14 +2350,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (!need_check)
goto writeback;
- p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ p_page = alloc_page(GFP_NOFS);
if (!p_page)
goto cleanup;
SetPageUptodate(p_page);
if (has_qstripe) {
/* RAID6, allocate and map temp space for the Q stripe */
- q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ q_page = alloc_page(GFP_NOFS);
if (!q_page) {
__free_page(p_page);
goto cleanup;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 8e026de74c44..d2062d5f71dd 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -264,8 +264,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
struct block_entry *be = NULL, *exist;
struct root_entry *re = NULL;
- re = kzalloc(sizeof(struct root_entry), GFP_KERNEL);
- be = kzalloc(sizeof(struct block_entry), GFP_KERNEL);
+ re = kzalloc(sizeof(struct root_entry), GFP_NOFS);
+ be = kzalloc(sizeof(struct block_entry), GFP_NOFS);
if (!be || !re) {
kfree(re);
kfree(be);
@@ -313,7 +313,7 @@ static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
struct root_entry *re;
struct ref_entry *ref = NULL, *exist;
- ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
@@ -358,7 +358,7 @@ static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
struct block_entry *be;
struct ref_entry *ref;
- ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
be = add_block_entry(fs_info, bytenr, num_bytes, 0);
@@ -393,7 +393,7 @@ static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
- ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fc831597cb22..914d403b4415 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -24,6 +24,7 @@
#include "block-group.h"
#include "backref.h"
#include "misc.h"
+#include "subpage.h"
/*
* Relocation overview
@@ -2781,10 +2782,70 @@ static noinline_for_stack int prealloc_file_extent_cluster(
u64 num_bytes;
int nr;
int ret = 0;
+ u64 i_size = i_size_read(&inode->vfs_inode);
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset = prealloc_start;
+ /*
+ * For the subpage case, the previous i_size may not be aligned to
+ * PAGE_SIZE. This means the range [i_size, PAGE_END + 1) was filled
+ * with zeros by btrfs_do_readpage() for the previously relocated file cluster.
+ *
+ * If the current cluster starts in that range, btrfs_do_readpage() will
+ * skip the read, and relocate_one_page() will later write back the
+ * padding zeros as new data, causing data corruption.
+ *
+ * Here we have to manually invalidate the range [i_size, PAGE_END + 1).
+ */
+ if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
+ struct page *page;
+
+ ASSERT(sectorsize < PAGE_SIZE);
+ ASSERT(IS_ALIGNED(i_size, sectorsize));
+
+ /*
+ * Subpage can't handle a page with the DIRTY bit set but without
+ * the UPTODATE bit, as that can lead to the following deadlock:
+ *
+ * btrfs_readpage()
+ * | Page already *locked*
+ * |- btrfs_lock_and_flush_ordered_range()
+ * |- btrfs_start_ordered_extent()
+ * |- extent_write_cache_pages()
+ * |- lock_page()
+ * We try to lock the page we already hold.
+ *
+ * Here we just writeback the whole data reloc inode, so that
+ * we will be ensured to have no dirty range in the page, and
+ * are safe to clear the uptodate bits.
+ *
+ * This shouldn't cause too much overhead, as we need to write
+ * the data back anyway.
+ */
+ ret = filemap_write_and_wait(mapping);
+ if (ret < 0)
+ return ret;
+
+ clear_extent_bits(&inode->io_tree, i_size,
+ round_up(i_size, PAGE_SIZE) - 1,
+ EXTENT_UPTODATE);
+ page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
+ /*
+ * If the page has been freed we don't need to do anything, as we
+ * will re-read the whole page anyway.
+ */
+ if (page) {
+ btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+ round_up(i_size, PAGE_SIZE) - i_size);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+
BUG_ON(cluster->start != cluster->boundary[0]);
ret = btrfs_alloc_data_chunk_ondemand(inode,
prealloc_end + 1 - prealloc_start);
@@ -2886,19 +2947,149 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
-static int relocate_file_extent_cluster(struct inode *inode,
- struct file_extent_cluster *cluster)
+static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+ int cluster_nr)
+{
+ /* Last extent, use cluster end directly */
+ if (cluster_nr >= cluster->nr - 1)
+ return cluster->end;
+
+ /* Use next boundary start */
+ return cluster->boundary[cluster_nr + 1] - 1;
+}
+
+static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
+ struct file_extent_cluster *cluster,
+ int *cluster_nr, unsigned long page_index)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ struct page *page;
u64 page_start;
u64 page_end;
+ u64 cur;
+ int ret;
+
+ ASSERT(page_index <= last_index);
+ page = find_lock_page(inode->i_mapping, page_index);
+ if (!page) {
+ page_cache_sync_readahead(inode->i_mapping, ra, NULL,
+ page_index, last_index + 1 - page_index);
+ page = find_or_create_page(inode->i_mapping, page_index, mask);
+ if (!page)
+ return -ENOMEM;
+ }
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto release_page;
+
+ if (PageReadahead(page))
+ page_cache_async_readahead(inode->i_mapping, ra, NULL, page,
+ page_index, last_index + 1 - page_index);
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ ret = -EIO;
+ goto release_page;
+ }
+ }
+
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_SIZE - 1;
+
+ /*
+ * Start from the cluster start, as in the subpage case the cluster
+ * can start inside the page.
+ */
+ cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
+ while (cur <= page_end) {
+ u64 extent_start = cluster->boundary[*cluster_nr] - offset;
+ u64 extent_end = get_cluster_boundary_end(cluster,
+ *cluster_nr) - offset;
+ u64 clamped_start = max(page_start, extent_start);
+ u64 clamped_end = min(page_end, extent_end);
+ u32 clamped_len = clamped_end + 1 - clamped_start;
+
+ /* Reserve metadata for this range */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
+ clamped_len);
+ if (ret)
+ goto release_page;
+
+ /* Mark the range delalloc and dirty for later writeback */
+ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
+ clamped_end, 0, NULL);
+ if (ret) {
+ clear_extent_bits(&BTRFS_I(inode)->io_tree,
+ clamped_start, clamped_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ clamped_len, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ clamped_len);
+ goto release_page;
+ }
+ btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+
+ /*
+ * Set the boundary if it's inside the page.
+ * Data relocation requires the destination extents to have the
+ * same size as the source extents.
+ * The EXTENT_BOUNDARY bit prevents the current extent from being
+ * merged with the previous extent.
+ */
+ if (in_range(cluster->boundary[*cluster_nr] - offset,
+ page_start, PAGE_SIZE)) {
+ u64 boundary_start = cluster->boundary[*cluster_nr] -
+ offset;
+ u64 boundary_end = boundary_start +
+ fs_info->sectorsize - 1;
+
+ set_extent_bits(&BTRFS_I(inode)->io_tree,
+ boundary_start, boundary_end,
+ EXTENT_BOUNDARY);
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
+ cur += clamped_len;
+
+ /* Crossed extent end, go to next extent */
+ if (cur >= extent_end) {
+ (*cluster_nr)++;
+ /* Just finished the last extent of the cluster, exit. */
+ if (*cluster_nr >= cluster->nr)
+ break;
+ }
+ }
+ unlock_page(page);
+ put_page(page);
+
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ btrfs_throttle(fs_info);
+ if (btrfs_should_cancel_balance(fs_info))
+ ret = -ECANCELED;
+ return ret;
+
+release_page:
+ unlock_page(page);
+ put_page(page);
+ return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
- struct page *page;
struct file_ra_state *ra;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
- int nr = 0;
+ int cluster_nr = 0;
int ret = 0;
if (!cluster->nr)
@@ -2919,109 +3110,14 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (ret)
goto out;
- index = (cluster->start - offset) >> PAGE_SHIFT;
last_index = (cluster->end - offset) >> PAGE_SHIFT;
- while (index <= last_index) {
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- PAGE_SIZE);
- if (ret)
- goto out;
-
- page = find_lock_page(inode->i_mapping, index);
- if (!page) {
- page_cache_sync_readahead(inode->i_mapping,
- ra, NULL, index,
- last_index + 1 - index);
- page = find_or_create_page(inode->i_mapping, index,
- mask);
- if (!page) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
- ret = -ENOMEM;
- goto out;
- }
- }
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- unlock_page(page);
- put_page(page);
- goto out;
- }
-
- if (PageReadahead(page)) {
- page_cache_async_readahead(inode->i_mapping,
- ra, NULL, page, index,
- last_index + 1 - index);
- }
-
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
- ret = -EIO;
- goto out;
- }
- }
-
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
-
- lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
-
- if (nr < cluster->nr &&
- page_start + offset == cluster->boundary[nr]) {
- set_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end,
- EXTENT_BOUNDARY);
- nr++;
- }
-
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
- page_end, 0, NULL);
- if (ret) {
- unlock_page(page);
- put_page(page);
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
-
- clear_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
- goto out;
-
- }
- set_page_dirty(page);
-
- unlock_extent(&BTRFS_I(inode)->io_tree,
- page_start, page_end);
- unlock_page(page);
- put_page(page);
-
- index++;
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- balance_dirty_pages_ratelimited(inode->i_mapping);
- btrfs_throttle(fs_info);
- if (btrfs_should_cancel_balance(fs_info)) {
- ret = -ECANCELED;
- goto out;
- }
- }
- WARN_ON(nr != cluster->nr);
+ for (index = (cluster->start - offset) >> PAGE_SHIFT;
+ index <= last_index && !ret; index++)
+ ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
if (btrfs_is_zoned(fs_info) && !ret)
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (ret == 0)
+ WARN_ON(cluster_nr != cluster->nr);
out:
kfree(ra);
return ret;
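
The subpage-aware rework above no longer marks a whole page as delalloc at once; relocate_one_page() clamps each cluster extent to the page being processed and only reserves and dirties the clamped range. As a rough standalone illustration (not part of the patch; the offsets, the EX_PAGE_SIZE value and the max64()/min64() helpers are made up), the clamping arithmetic looks like this:

#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SIZE 4096ULL	/* illustrative page size */

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	/* One cluster extent, file-relative; values chosen for illustration. */
	const uint64_t extent_start = 6144;	/* starts in the middle of page 1 */
	const uint64_t extent_end = 20479;	/* inclusive end */

	for (uint64_t index = 1; index <= 5; index++) {
		uint64_t page_start = index * EX_PAGE_SIZE;
		uint64_t page_end = page_start + EX_PAGE_SIZE - 1;
		uint64_t clamped_start = max64(page_start, extent_start);
		uint64_t clamped_end = min64(page_end, extent_end);

		if (clamped_start > clamped_end)
			continue;	/* extent does not overlap this page */
		printf("page %llu: delalloc range [%llu, %llu], len %llu\n",
		       (unsigned long long)index,
		       (unsigned long long)clamped_start,
		       (unsigned long long)clamped_end,
		       (unsigned long long)(clamped_end + 1 - clamped_start));
	}
	return 0;
}

Only the first and last overlapping pages get a partial range; in the real code the first sector of each extent additionally gets EXTENT_BOUNDARY set so the relocated extent keeps its original size.
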
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6ac37ae6c811..72f9b865e847 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1198,7 +1198,7 @@ struct backref_ctx {
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
{
u64 root = (u64)(uintptr_t)key;
- struct clone_root *cr = (struct clone_root *)elt;
+ const struct clone_root *cr = elt;
if (root < cr->root->root_key.objectid)
return -1;
@@ -1209,8 +1209,8 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
static int __clone_root_cmp_sort(const void *e1, const void *e2)
{
- struct clone_root *cr1 = (struct clone_root *)e1;
- struct clone_root *cr2 = (struct clone_root *)e2;
+ const struct clone_root *cr1 = e1;
+ const struct clone_root *cr2 = e2;
if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
return -1;
@@ -1307,7 +1307,7 @@ static int find_extent_clone(struct send_ctx *sctx,
u64 flags = 0;
struct btrfs_file_extent_item *fi;
struct extent_buffer *eb = path->nodes[0];
- struct backref_ctx *backref_ctx = NULL;
+ struct backref_ctx backref_ctx = {0};
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
@@ -1322,12 +1322,6 @@ static int find_extent_clone(struct send_ctx *sctx,
/* We only use this path under the commit sem */
tmp_path->need_commit_sem = 0;
- backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
- if (!backref_ctx) {
- ret = -ENOMEM;
- goto out;
- }
-
if (data_offset >= ino_size) {
/*
* There may be extents that lie behind the file's size.
@@ -1392,12 +1386,12 @@ static int find_extent_clone(struct send_ctx *sctx,
cur_clone_root->found_refs = 0;
}
- backref_ctx->sctx = sctx;
- backref_ctx->found = 0;
- backref_ctx->cur_objectid = ino;
- backref_ctx->cur_offset = data_offset;
- backref_ctx->found_itself = 0;
- backref_ctx->extent_len = num_bytes;
+ backref_ctx.sctx = sctx;
+ backref_ctx.found = 0;
+ backref_ctx.cur_objectid = ino;
+ backref_ctx.cur_offset = data_offset;
+ backref_ctx.found_itself = 0;
+ backref_ctx.extent_len = num_bytes;
/*
* The last extent of a file may be too large due to page alignment.
@@ -1405,7 +1399,7 @@ static int find_extent_clone(struct send_ctx *sctx,
* __iterate_backrefs work.
*/
if (data_offset + num_bytes >= ino_size)
- backref_ctx->extent_len = ino_size - data_offset;
+ backref_ctx.extent_len = ino_size - data_offset;
/*
* Now collect all backrefs.
@@ -1416,12 +1410,12 @@ static int find_extent_clone(struct send_ctx *sctx,
extent_item_pos = 0;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, 1, __iterate_backrefs,
- backref_ctx, false);
+ &backref_ctx, false);
if (ret < 0)
goto out;
- if (!backref_ctx->found_itself) {
+ if (!backref_ctx.found_itself) {
/* found a bug in backref code? */
ret = -EIO;
btrfs_err(fs_info,
@@ -1434,7 +1428,7 @@ static int find_extent_clone(struct send_ctx *sctx,
"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
data_offset, ino, num_bytes, logical);
- if (!backref_ctx->found)
+ if (!backref_ctx.found)
btrfs_debug(fs_info, "no clones found");
cur_clone_root = NULL;
@@ -1458,7 +1452,6 @@ static int find_extent_clone(struct send_ctx *sctx,
out:
btrfs_free_path(tmp_path);
- kfree(backref_ctx);
return ret;
}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index f79bf85f2439..5ada02e0e629 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -493,6 +493,11 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
long time_left;
int loops;
+ delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ if (delalloc_bytes == 0 && ordered_bytes == 0)
+ return;
+
/* Calc the number of the pages we need flush for space reservation */
if (to_reclaim == U64_MAX) {
items = U64_MAX;
@@ -500,22 +505,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
/*
* to_reclaim is set to however much metadata we need to
* reclaim, but reclaiming that much data doesn't really track
- * exactly, so increase the amount to reclaim by 2x in order to
- * make sure we're flushing enough delalloc to hopefully reclaim
- * some metadata reservations.
+ * exactly. What we really want to do is reclaim a full inode's
+ * worth of reservations, but that's not available to us
+ * here. We will take a fraction of the delalloc bytes for our
+ * flushing loops and hope for the best. Delalloc will expand
+ * the amount we write to cover an entire dirty extent, which
+ * will reclaim the metadata reservation for that range. If
+ * it's not enough, subsequent flush stages will be more
+ * aggressive.
*/
+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
}
trans = (struct btrfs_trans_handle *)current->journal_info;
- delalloc_bytes = percpu_counter_sum_positive(
- &fs_info->delalloc_bytes);
- ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
- if (delalloc_bytes == 0 && ordered_bytes == 0)
- return;
-
/*
* If we are doing more ordered than delalloc we need to just wait on
* ordered extents, otherwise we'll waste time trying to flush delalloc
@@ -528,9 +532,49 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
while ((delalloc_bytes || ordered_bytes) && loops < 3) {
u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
long nr_pages = min_t(u64, temp, LONG_MAX);
+ int async_pages;
btrfs_start_delalloc_roots(fs_info, nr_pages, true);
+ /*
+ * We need to make sure any outstanding async pages are now
+ * processed before we continue. This is because things like
+ * sync_inode() try to be smart and skip writing if the inode is
+ * marked clean. We don't use the filemap write-out helpers for flushing
+ * because we want to control how many pages we write out at a
+ * time, thus this is the only safe way to make sure we've
+ * waited for outstanding compressed workers to have started
+ * their jobs and thus have ordered extents set up properly.
+ *
+ * This exists because we do not want to wait for each
+ * individual inode to finish its async work, we simply want to
+ * start the IO on everybody, and then come back here and wait
+ * for all of the async work to catch up. Once we're done with
+ * that we know we'll have ordered extents for everything and we
+ * can decide if we wait for that or not.
+ *
+ * If we choose to replace this in the future, make absolutely
+ * sure that the proper waiting is being done in the async case,
+ * as there have been bugs in that area before.
+ */
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
+ goto skip_async;
+
+ /*
+ * We don't want to wait forever, if we wrote less pages in this
+ * loop than we have outstanding, only wait for that number of
+ * pages, otherwise we can wait for all async pages to finish
+ * before continuing.
+ */
+ if (async_pages > nr_pages)
+ async_pages -= nr_pages;
+ else
+ async_pages = 0;
+ wait_event(fs_info->async_submit_wait,
+ atomic_read(&fs_info->async_delalloc_pages) <=
+ async_pages);
+skip_async:
loops++;
if (wait_ordered && !trans) {
btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
@@ -595,8 +639,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
+ case FLUSH_DELALLOC_FULL:
+ if (state == FLUSH_DELALLOC_FULL)
+ num_bytes = U64_MAX;
shrink_delalloc(fs_info, space_info, num_bytes,
- state == FLUSH_DELALLOC_WAIT, for_preempt);
+ state != FLUSH_DELALLOC, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
@@ -686,7 +733,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
{
u64 global_rsv_size = fs_info->global_block_rsv.reserved;
u64 ordered, delalloc;
- u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+ u64 thresh = div_factor_fine(space_info->total_bytes, 90);
u64 used;
/* If we're just plain full then async reclaim just slows us down. */
@@ -694,6 +741,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
global_rsv_size) >= thresh)
return false;
+ used = space_info->bytes_may_use + space_info->bytes_pinned;
+
+ /* The total flushable belongs to the global rsv, don't flush. */
+ if (global_rsv_size >= used)
+ return false;
+
+ /*
+ * 128MiB is 1/4 of the maximum global rsv size. If we have less than
+ * that devoted to other reservations then there's no sense in flushing,
+ * we don't have a lot of things that need flushing.
+ */
+ if (used - global_rsv_size <= SZ_128M)
+ return false;
+
/*
* We have tickets queued, bail so we don't compete with the async
* flushers.
@@ -824,6 +885,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
+ trace_btrfs_fail_all_tickets(fs_info, space_info);
+
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
__btrfs_dump_space_info(fs_info, space_info);
@@ -905,6 +968,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
/*
+ * We do not want to empty the system of delalloc unless we're
+ * under heavy pressure, so allow one trip through the flushing
+ * logic before we start doing a FLUSH_DELALLOC_FULL.
+ */
+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
+ flush_state++;
+
+ /*
* We don't want to force a chunk allocation until we've tried
* pretty hard to reclaim space. Think of the case where we
* freed up a bunch of space and so have a lot of pinned space
@@ -1067,7 +1138,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* so if we now have space to allocate do the force chunk allocation.
*/
static const enum btrfs_flush_state data_flush_states[] = {
- FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
ALLOC_CHUNK_FORCE,
@@ -1156,6 +1227,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELAYED_REFS,
FLUSH_DELALLOC,
FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
};
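
The preemptive reclaim changes above lower the "plain full" threshold from 98% to 90% and add two early-bail checks before kicking background flushing. A hedged standalone sketch of just that decision (the "space_used" parameter stands in for the usage expression not visible in the hunk, and the field names and sample numbers are illustrative, not the real struct btrfs_space_info):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_128M (128ULL * 1024 * 1024)

/* Roughly div_factor_fine(total, 90): 90% of the total. */
static uint64_t pct(uint64_t total, unsigned int factor)
{
	return total / 100 * factor;
}

/* Mirrors only the early-bail checks added in this patch. */
static bool worth_preemptive_flush(uint64_t total_bytes, uint64_t space_used,
				   uint64_t bytes_may_use, uint64_t bytes_pinned,
				   uint64_t global_rsv_size)
{
	uint64_t thresh = pct(total_bytes, 90);
	uint64_t used;

	/* Plain full: background reclaim would only slow us down. */
	if (space_used + global_rsv_size >= thresh)
		return false;

	used = bytes_may_use + bytes_pinned;
	/* Everything flushable belongs to the global reserve: nothing to gain. */
	if (global_rsv_size >= used)
		return false;
	/* Less than 128MiB outside the global reserve: not worth flushing. */
	if (used - global_rsv_size <= SZ_128M)
		return false;
	return true;
}

int main(void)
{
	const uint64_t GiB = 1024ULL * 1024 * 1024;

	/* 10GiB fs, 2GiB used, 1GiB may_use, 512MiB global reserve -> flush. */
	printf("%d\n", worth_preemptive_flush(10 * GiB, 2 * GiB, 1 * GiB, 0,
					      512ULL * 1024 * 1024));
	return 0;
}
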
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 8260f8bb3ff0..f429256f56db 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -73,7 +73,7 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
} \
token->kaddr = page_address(token->eb->pages[idx]); \
token->offset = idx << PAGE_SHIFT; \
- if (oip + size <= PAGE_SIZE) \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
return get_unaligned_le##bits(token->kaddr + oip); \
\
memcpy(lebytes, token->kaddr + oip, part); \
@@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (oip + size <= PAGE_SIZE) \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
return get_unaligned_le##bits(kaddr + oip); \
\
memcpy(lebytes, kaddr + oip, part); \
@@ -124,7 +124,7 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
} \
token->kaddr = page_address(token->eb->pages[idx]); \
token->offset = idx << PAGE_SHIFT; \
- if (oip + size <= PAGE_SIZE) { \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
put_unaligned_le##bits(val, token->kaddr + oip); \
return; \
} \
@@ -146,7 +146,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (oip + size <= PAGE_SIZE) { \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
put_unaligned_le##bits(val, kaddr + oip); \
return; \
} \
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 640bcd21bf28..cb10e56ee31e 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -435,8 +435,10 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
subpage->writeback_bitmap &= ~tmp;
- if (subpage->writeback_bitmap == 0)
+ if (subpage->writeback_bitmap == 0) {
+ ASSERT(PageWriteback(page));
end_page_writeback(page);
+ }
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -559,3 +561,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
PageOrdered);
+
+/*
+ * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
+ * is cleared.
+ */
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ ASSERT(!PageDirty(page));
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(subpage->dirty_bitmap == 0);
+}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 4d7aca85d915..0120948f37a1 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -126,4 +126,7 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
+void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d07b18b2b250..537d90bf5d84 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1201,21 +1201,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0) {
goto err;
} else if (ret > 0) {
- ret = btrfs_previous_item(root, path, subvol_objectid,
- BTRFS_ROOT_BACKREF_KEY);
- if (ret < 0) {
- goto err;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto err;
- }
+ ret = -ENOENT;
+ goto err;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
subvol_objectid = key.offset;
root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1248,21 +1241,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(fs_root, &key, path);
if (ret < 0) {
goto err;
} else if (ret > 0) {
- ret = btrfs_previous_item(fs_root, path, dirid,
- BTRFS_INODE_REF_KEY);
- if (ret < 0) {
- goto err;
- } else if (ret > 0) {
- ret = -ENOENT;
- goto err;
- }
+ ret = -ENOENT;
+ goto err;
}
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
dirid = key.offset;
inode_ref = btrfs_item_ptr(path->nodes[0],
@@ -1353,6 +1339,9 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_op = &btrfs_super_ops;
sb->s_d_op = &btrfs_dentry_operations;
sb->s_export_op = &btrfs_export_ops;
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &btrfs_verityops;
+#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
@@ -2041,13 +2030,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
ret = -EINVAL;
goto restore;
}
- if (fs_info->sectorsize < PAGE_SIZE) {
- btrfs_warn(fs_info,
- "read-write mount is not yet allowed for sectorsize %u page size %lu",
- fs_info->sectorsize, PAGE_SIZE);
- ret = -EINVAL;
- goto restore;
- }
/*
* NOTE: when remounting with a change that does writes, don't
@@ -2096,16 +2078,15 @@ restore:
}
/* Used to sort the devices by max_avail(descending sort) */
-static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
- const void *dev_info2)
+static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
{
- if (((struct btrfs_device_info *)dev_info1)->max_avail >
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ const struct btrfs_device_info *dev_info1 = a;
+ const struct btrfs_device_info *dev_info2 = b;
+
+ if (dev_info1->max_avail > dev_info2->max_avail)
return -1;
- else if (((struct btrfs_device_info *)dev_info1)->max_avail <
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ else if (dev_info1->max_avail < dev_info2->max_avail)
return 1;
- else
return 0;
}
@@ -2381,7 +2362,7 @@ static struct file_system_type btrfs_root_fs_type = {
.name = "btrfs",
.mount = btrfs_mount_root,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("btrfs");
@@ -2572,6 +2553,11 @@ static void __init btrfs_print_mod_info(void)
#else
", zoned=no"
#endif
+#ifdef CONFIG_FS_VERITY
+ ", fsverity=yes"
+#else
+ ", fsverity=no"
+#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
}
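
btrfs_search_backwards() (added to ctree.c earlier in this series, not shown here) folds the open-coded search-plus-btrfs_previous_item() pattern that the two hunks above remove. Judging from the code it replaces, the intent is "position on the last item with the given objectid and type"; a hedged userspace sketch of that idea over a flat sorted key array (not the real btree walk):

#include <stdio.h>
#include <stdint.h>

struct key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

/* Keys sorted by (objectid, type, offset), like items in a btrfs leaf. */
static int find_backwards(const struct key *keys, int nr,
			  uint64_t objectid, uint8_t type)
{
	/*
	 * Search for (objectid, type, (u64)-1): that exact key never exists,
	 * so step back one slot and check it still matches.
	 */
	int slot = nr;	/* position where (objectid, type, -1) would be inserted */

	for (int i = 0; i < nr; i++) {
		if (keys[i].objectid > objectid ||
		    (keys[i].objectid == objectid && keys[i].type > type)) {
			slot = i;
			break;
		}
	}
	if (slot == 0)
		return -1;			/* nothing before the search key */
	slot--;
	if (keys[slot].objectid != objectid || keys[slot].type != type)
		return -1;			/* -ENOENT in the kernel code */
	return slot;
}

int main(void)
{
	const struct key keys[] = {
		{ 256, 12, 5 }, { 256, 12, 9 }, { 256, 13, 1 }, { 300, 12, 2 },
	};

	printf("%d\n", find_backwards(keys, 4, 256, 12));	/* prints 1 */
	return 0;
}
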
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 9d1d140118ff..25a6f587852b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -22,6 +22,26 @@
#include "block-group.h"
#include "qgroup.h"
+/*
+ * Structure name Path
+ * --------------------------------------------------------------------------
+ * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features
+ * btrfs_supported_feature_attrs /sys/fs/btrfs/features and
+ * /sys/fs/btrfs/<uuid>/features
+ * btrfs_attrs /sys/fs/btrfs/<uuid>
+ * devid_attrs /sys/fs/btrfs/<uuid>/devinfo/<devid>
+ * allocation_attrs /sys/fs/btrfs/<uuid>/allocation
+ * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
+ * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>
+ * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ *
+ * When built with BTRFS_CONFIG_DEBUG:
+ *
+ * btrfs_debug_feature_attrs /sys/fs/btrfs/debug
+ * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug
+ * discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard
+ */
+
struct btrfs_feature_attr {
struct kobj_attribute kobj_attr;
enum btrfs_feature_set feature_set;
@@ -267,7 +287,17 @@ BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
+#ifdef CONFIG_FS_VERITY
+BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
+#endif
+/*
+ * Features which depend on feature bits and may differ between each fs.
+ *
+ * /sys/fs/btrfs/features - all available features implemented by this version
+ * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
+ * can be changed on a mounted filesystem.
+ */
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
BTRFS_FEAT_ATTR_PTR(default_subvol),
@@ -285,16 +315,12 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
+#ifdef CONFIG_FS_VERITY
+ BTRFS_FEAT_ATTR_PTR(verity),
+#endif
NULL
};
-/*
- * Features which depend on feature bits and may differ between each fs.
- *
- * /sys/fs/btrfs/features lists all available features of this kernel while
- * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or
- * can be changed online.
- */
static const struct attribute_group btrfs_feature_attr_group = {
.name = "features",
.is_visible = btrfs_feature_visible,
@@ -366,6 +392,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
ssize_t ret = 0;
+ /* 4K sector size is also supported with 64K page size */
+ if (PAGE_SIZE == SZ_64K)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+
/* Only sectorsize == PAGE_SIZE is now supported */
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
@@ -374,6 +404,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
BTRFS_ATTR(static_feature, supported_sectorsizes,
supported_sectorsizes_show);
+/*
+ * Features which only depend on kernel version.
+ *
+ * These are listed in /sys/fs/btrfs/features along with
+ * btrfs_supported_feature_attrs.
+ */
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
@@ -383,12 +419,6 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
NULL
};
-/*
- * Features which only depend on kernel version.
- *
- * These are listed in /sys/fs/btrfs/features along with
- * btrfs_feature_attr_group
- */
static const struct attribute_group btrfs_static_feature_attr_group = {
.name = "features",
.attrs = btrfs_supported_static_feature_attrs,
@@ -547,6 +577,11 @@ static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
btrfs_discard_max_discard_size_store);
+/*
+ * Per-filesystem debugging of discard (when mounted with discard=async).
+ *
+ * Path: /sys/fs/btrfs/<uuid>/debug/discard/
+ */
static const struct attribute *discard_debug_attrs[] = {
BTRFS_ATTR_PTR(discard, discardable_bytes),
BTRFS_ATTR_PTR(discard, discardable_extents),
@@ -560,15 +595,19 @@ static const struct attribute *discard_debug_attrs[] = {
};
/*
- * Runtime debugging exported via sysfs
+ * Per-filesystem runtime debugging exported via sysfs.
*
- * /sys/fs/btrfs/debug - applies to module or all filesystems
- * /sys/fs/btrfs/UUID - applies only to the given filesystem
+ * Path: /sys/fs/btrfs/UUID/debug/
*/
static const struct attribute *btrfs_debug_mount_attrs[] = {
NULL,
};
+/*
+ * Runtime debugging exported via sysfs, applies to all mounted filesystems.
+ *
+ * Path: /sys/fs/btrfs/debug
+ */
static struct attribute *btrfs_debug_feature_attrs[] = {
NULL
};
@@ -637,6 +676,11 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}
+/*
+ * Allocation information about block group profiles.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/
+ */
static struct attribute *raid_attrs[] = {
BTRFS_ATTR_PTR(raid, total_bytes),
BTRFS_ATTR_PTR(raid, used_bytes),
@@ -676,6 +720,11 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+/*
+ * Allocation information about block group types.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/
+ */
static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, flags),
BTRFS_ATTR_PTR(space_info, total_bytes),
@@ -703,6 +752,11 @@ static struct kobj_type space_info_ktype = {
.default_groups = space_info_groups,
};
+/*
+ * Allocation information about block groups.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/allocation/
+ */
static const struct attribute *allocation_attrs[] = {
BTRFS_ATTR_PTR(allocation, global_rsv_reserved),
BTRFS_ATTR_PTR(allocation, global_rsv_size),
@@ -974,7 +1028,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
ssize_t ret;
- ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n",
+ READ_ONCE(fs_info->bg_reclaim_threshold));
return ret;
}
@@ -991,16 +1046,21 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
if (ret)
return ret;
- if (thresh <= 50 || thresh > 100)
+ if (thresh != 0 && (thresh <= 50 || thresh > 100))
return -EINVAL;
- fs_info->bg_reclaim_threshold = thresh;
+ WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);
return len;
}
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
+/*
+ * Per-filesystem information and stats.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/
+ */
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -1510,6 +1570,11 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
}
BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
+/*
+ * Information about one device.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/
+ */
static struct attribute *devid_attrs[] = {
BTRFS_ATTR_PTR(devid, error_stats),
BTRFS_ATTR_PTR(devid, in_fs_metadata),
@@ -1799,6 +1864,11 @@ QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
+/*
+ * Qgroup information.
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/
+ */
static struct attribute *qgroup_attrs[] = {
BTRFS_ATTR_PTR(qgroup, referenced),
BTRFS_ATTR_PTR(qgroup, exclusive),
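
One behavioral change in the sysfs hunks above is that bg_reclaim_threshold now accepts 0 (disabling automatic reclaim) in addition to the previous 51-100 range, and the value is accessed with READ_ONCE/WRITE_ONCE. A small standalone sketch of just the accepted-value check (the sysfs plumbing itself is omitted):

#include <stdio.h>
#include <stdbool.h>

/* Mirrors the updated validation: 0 disables, otherwise only (50, 100] is allowed. */
static bool bg_reclaim_threshold_valid(int thresh)
{
	if (thresh != 0 && (thresh <= 50 || thresh > 100))
		return false;
	return true;
}

int main(void)
{
	int samples[] = { 0, 25, 50, 51, 75, 100, 101 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%3d -> %s\n", samples[i],
		       bg_reclaim_threshold_valid(samples[i]) ? "accepted" : "-EINVAL");
	return 0;
}
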
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index f3137285a9e2..19ba7d5b7d8f 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -223,8 +223,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
* we can only call btrfs_qgroup_account_extent() directly to test
* quota.
*/
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -236,8 +235,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -260,8 +258,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
old_roots = NULL;
new_roots = NULL;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -272,8 +269,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
if (ret)
return -EINVAL;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -324,8 +320,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -337,8 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -359,8 +353,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -372,8 +365,7 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
@@ -400,8 +392,7 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
@@ -413,8 +404,7 @@ static int test_multiple_refs(struct btrfs_root *root,
if (ret)
return ret;
- ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
- false);
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
if (ret) {
ulist_free(old_roots);
ulist_free(new_roots);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 50318231c1a8..14b9fdc8aaa9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -254,23 +254,21 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
}
/*
- * To be called after all the new block groups attached to the transaction
- * handle have been created (btrfs_create_pending_block_groups()).
+ * To be called after doing the chunk btree updates right after allocating a new
+ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
+ * chunk after all chunk btree updates and after finishing the second phase of
+ * chunk allocation (btrfs_create_pending_block_groups()) in case some block
+ * group had its chunk item insertion delayed to the second phase.
*/
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_transaction *cur_trans = trans->transaction;
if (!trans->chunk_bytes_reserved)
return;
- WARN_ON_ONCE(!list_empty(&trans->new_bgs));
-
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
trans->chunk_bytes_reserved, NULL);
- atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
- cond_wake_up(&cur_trans->chunk_reserve_wait);
trans->chunk_bytes_reserved = 0;
}
@@ -386,8 +384,6 @@ loop:
spin_lock_init(&cur_trans->dropped_roots_lock);
INIT_LIST_HEAD(&cur_trans->releasing_ebs);
spin_lock_init(&cur_trans->releasing_ebs_lock);
- atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
- init_waitqueue_head(&cur_trans->chunk_reserve_wait);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@@ -701,7 +697,6 @@ again:
h->fs_info = root->fs_info;
h->type = type;
- h->can_flush_pending_bgs = true;
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 07d76029f598..ba45065f9451 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -96,13 +96,6 @@ struct btrfs_transaction {
spinlock_t releasing_ebs_lock;
struct list_head releasing_ebs;
-
- /*
- * The number of bytes currently reserved, by all transaction handles
- * attached to this transaction, for metadata extents of the chunk tree.
- */
- atomic64_t chunk_bytes_reserved;
- wait_queue_head_t chunk_reserve_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -139,7 +132,7 @@ struct btrfs_trans_handle {
short aborted;
bool adding_csums;
bool allocating_chunk;
- bool can_flush_pending_bgs;
+ bool removing_chunk;
bool reloc_reserved;
bool in_fsync;
struct btrfs_root *root;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a8b2e0d2c025..7733e8ac0a69 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -24,6 +24,7 @@
#include "compression.h"
#include "volumes.h"
#include "misc.h"
+#include "btrfs_inode.h"
/*
* Error message should follow the following format:
@@ -873,13 +874,22 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
}
}
- if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+ if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
+ sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 &&
+ num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 &&
+ num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
+ (type & BTRFS_BLOCK_GROUP_DUP &&
+ num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
- num_stripes != 1))) {
+ num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
chunk_err(leaf, chunk, logical,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
@@ -999,6 +1009,8 @@ static int check_inode_item(struct extent_buffer *leaf,
u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
u32 mode;
int ret;
+ u32 flags;
+ u32 ro_flags;
ret = check_inode_key(leaf, key, slot);
if (unlikely(ret < 0))
@@ -1054,11 +1066,17 @@ static int check_inode_item(struct extent_buffer *leaf,
btrfs_inode_nlink(leaf, iitem));
return -EUCLEAN;
}
- if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) {
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags);
+ if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) {
inode_item_err(leaf, slot,
- "unknown flags detected: 0x%llx",
- btrfs_inode_flags(leaf, iitem) &
- ~BTRFS_INODE_FLAG_MASK);
+ "unknown incompat flags detected: 0x%x", flags);
+ return -EUCLEAN;
+ }
+ if (unlikely(!sb_rdonly(fs_info->sb) &&
+ (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) {
+ inode_item_err(leaf, slot,
+ "unknown ro-compat flags detected on writeable mount: 0x%x",
+ ro_flags);
return -EUCLEAN;
}
return 0;
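
The chunk validation hunk above replaces hard-coded stripe counts with lookups into btrfs_raid_array, so the checker stays in sync with the profile definitions. A simplified userspace sketch of the same table-driven idea (the profile names and minimums below are illustrative only, not copied from btrfs_raid_array, and the real check distinguishes devs_min, dev_stripes and sub_stripes per profile):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative profile table; values are examples, not the btrfs ones. */
enum { PROF_SINGLE, PROF_DUP, PROF_RAID1, PROF_RAID5, PROF_RAID6, NR_PROF };

struct raid_attr {
	const char *name;
	int min_stripes;	/* minimum acceptable num_stripes */
};

static const struct raid_attr raid_array[NR_PROF] = {
	[PROF_SINGLE] = { "single", 1 },
	[PROF_DUP]    = { "dup",    2 },
	[PROF_RAID1]  = { "raid1",  2 },
	[PROF_RAID5]  = { "raid5",  2 },
	[PROF_RAID6]  = { "raid6",  3 },
};

/* Table lookup instead of magic numbers scattered through the checker. */
static bool num_stripes_valid(int profile, int num_stripes)
{
	return num_stripes >= raid_array[profile].min_stripes;
}

int main(void)
{
	printf("raid6 with 2 stripes: %s\n",
	       num_stripes_valid(PROF_RAID6, 2) ? "ok" : "invalid");
	printf("raid5 with 3 stripes: %s\n",
	       num_stripes_valid(PROF_RAID5, 3) ? "ok" : "invalid");
	return 0;
}
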
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cab451d19547..f7efc26aa82a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -753,7 +753,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
- if (ret == 0) {
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
ins.objectid, ins.offset, 0);
@@ -3039,8 +3041,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
list_del_init(&ctx->list);
ctx->log_ret = error;
}
-
- INIT_LIST_HEAD(&root->log_ctxs[index]);
}
/*
@@ -3173,7 +3173,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
+ mutex_unlock(&fs_info->tree_root->log_mutex);
goto out;
}
}
@@ -3328,10 +3328,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
- mutex_lock(&root->log_mutex);
- if (root->last_log_commit < log_transid)
- root->last_log_commit = log_transid;
- mutex_unlock(&root->log_mutex);
+ /*
+ * We know there can only be one task here, since we have not yet set
+ * root->log_commit[index1] to 0 and any task attempting to sync the
+ * log must wait for the previous log transaction to commit if it's
+ * still in progress or wait for the current log transaction commit if
+ * someone else already started it. We use <= and not < because the
+ * first log transaction has an ID of 0.
+ */
+ ASSERT(root->last_log_commit <= log_transid);
+ root->last_log_commit = log_transid;
out_wake_log_root:
mutex_lock(&log_root_tree->log_mutex);
@@ -3417,14 +3423,10 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
}
/*
- * Check if an inode was logged in the current transaction. We can't always rely
- * on an inode's logged_trans value, because it's an in-memory only field and
- * therefore not persisted. This means that its value is lost if the inode gets
- * evicted and loaded again from disk (in which case it has a value of 0, and
- * certainly it is smaller then any possible transaction ID), when that happens
- * the full_sync flag is set in the inode's runtime flags, so on that case we
- * assume eviction happened and ignore the logged_trans value, assuming the
- * worst case, that the inode was logged before in the current transaction.
+ * Check if an inode was logged in the current transaction. This may often
+ * return some false positives, because logged_trans is an in memory only field,
+ * not persisted anywhere. This is meant to be used in contexts where a false
+ * positive has no functional consequences.
*/
static bool inode_logged(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
@@ -3432,8 +3434,17 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
if (inode->logged_trans == trans->transid)
return true;
- if (inode->last_trans == trans->transid &&
- test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ /*
+ * The inode's logged_trans is always 0 when we load it (because it is
+ * not persisted in the inode item or elsewhere). So if it is 0 and the
+ * inode was last modified in the current transaction, then the inode may
+ * have been logged before in the current transaction, then evicted and
+ * loaded again in the current transaction - or it may have never been
+ * logged in the current transaction. Since we cannot be sure, we have to
+ * assume it was, otherwise our callers could leave an inconsistent log.
+ */
+ if (inode->logged_trans == 0 &&
+ inode->last_trans == trans->transid &&
!test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
return true;
@@ -3913,6 +3924,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
u64 logged_isize)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
@@ -3962,20 +3974,49 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
- btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *log, struct btrfs_path *path,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, bool inode_item_dropped)
{
struct btrfs_inode_item *inode_item;
int ret;
- ret = btrfs_insert_empty_item(trans, log, path,
- &inode->location, sizeof(*inode_item));
- if (ret && ret != -EEXIST)
+ /*
+ * If we are doing a fast fsync and the inode was logged before in the
+ * current transaction, then we know the inode was previously logged and
+ * it exists in the log tree. For performance reasons, in this case use
+ * btrfs_search_slot() directly with ins_len set to 0 so that we never
+ * attempt a write lock on the leaf's parent, which adds unnecessary lock
+ * contention in case there are concurrent fsyncs for other inodes of the
+ * same subvolume. Using btrfs_insert_empty_item() when the inode item
+ * already exists can also result in unnecessarily splitting a leaf.
+ */
+ if (!inode_item_dropped && inode->logged_trans == trans->transid) {
+ ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+ ASSERT(ret <= 0);
+ if (ret > 0)
+ ret = -ENOENT;
+ } else {
+ /*
+ * This means it is the first fsync in the current transaction,
+ * so the inode item is not in the log and we need to insert it.
+ * We can never get -EEXIST because we are only called for a fast
+ * fsync, and if an inode eviction happens after the inode was
+ * logged before in the current transaction, then when we load the
+ * inode again we set BTRFS_INODE_NEEDS_FULL_SYNC in its runtime
+ * flags and set ->logged_trans to 0.
+ */
+ ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+ sizeof(*inode_item));
+ ASSERT(ret != -EEXIST);
+ }
+ if (ret)
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
@@ -4160,7 +4201,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
static int extent_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
- struct extent_map *em1, *em2;
+ const struct extent_map *em1, *em2;
em1 = list_entry(a, struct extent_map, list);
em2 = list_entry(b, struct extent_map, list);
@@ -5053,8 +5094,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
/*
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
- * the inode is not updated when we only log that it exists and
- * it has the full sync bit set (see btrfs_log_inode()).
+ * the inode is not updated when we only log that it exists (see
+ * btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5299,6 +5340,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool need_log_inode_item = true;
bool xattrs_logged = false;
bool recursive_logging = false;
+ bool inode_item_dropped = true;
path = btrfs_alloc_path();
if (!path)
@@ -5433,6 +5475,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
+ inode_item_dropped = false;
goto log_extents;
}
@@ -5466,7 +5509,7 @@ log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (need_log_inode_item) {
- err = log_inode_item(trans, log, dst_path, inode);
+ err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
if (err)
goto out_unlock;
/*
@@ -5526,16 +5569,29 @@ log_extents:
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
/*
- * Don't update last_log_commit if we logged that an inode exists
- * after it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode,
- * then the inode gets evicted after all delalloc was flushed,
- * then we log it exists (due to a rename for example) and then
- * fsync it. This last fsync would do nothing (not logging the
- * extents previously written).
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for two reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
*/
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ if (inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
}
@@ -5560,6 +5616,13 @@ static bool need_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
/*
+ * If a directory was not modified, no dentries added or removed, we can
+ * and should avoid logging it.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+ return false;
+
+ /*
* If this inode does not have new/updated/deleted xattrs since the last
* time it was logged and is flagged as logged in the current transaction,
* we can skip logging it. As for new/deleted names, those are updated in
@@ -6490,8 +6553,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans < trans->transid &&
- (!old_dir || old_dir->logged_trans < trans->transid))
+ if (!inode_logged(trans, inode) &&
+ (!old_dir || !inode_logged(trans, old_dir)))
return;
/*
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
new file mode 100644
index 000000000000..28d443d3ef93
--- /dev/null
+++ b/fs/btrfs/verity.c
@@ -0,0 +1,811 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/iversion.h>
+#include <linux/fsverity.h>
+#include <linux/sched/mm.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+
+/*
+ * Implementation of the interface defined in struct fsverity_operations.
+ *
+ * The main question is how and where to store the verity descriptor and the
+ * Merkle tree. We store both in dedicated btree items in the filesystem tree,
+ * together with the rest of the inode metadata. This means we'll need to do
+ * extra work to encrypt them once encryption is supported in btrfs, but btrfs
+ * has a lot of careful code around i_size and it seems better to make a new key
+ * type than try and adjust all of our expectations for i_size.
+ *
+ * Note that this differs from the implementation in ext4 and f2fs, where
+ * this data is stored as if it were in the file, but past EOF. However, btrfs
+ * does not have a widespread mechanism for caching opaque metadata pages, so we
+ * do pretend that the Merkle tree pages themselves are past EOF for the
+ * purposes of caching them (as opposed to creating a virtual inode).
+ *
+ * fs verity items are stored under two different key types on disk.
+ * The descriptor items:
+ * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
+ *
+ * At offset 0, we store a btrfs_verity_descriptor_item which tracks the
+ * size of the descriptor item and some extra data for encryption.
+ * Starting at offset 1, these hold the generic fs verity descriptor.
+ * The latter are opaque to btrfs, we just read and write them as a blob for
+ * the higher level verity code. The most common descriptor size is 256 bytes.
+ *
+ * The merkle tree items:
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
+ *
+ * These also start at offset 0, and correspond to the merkle tree bytes.
+ * So when fsverity asks for page 0 of the merkle tree, we pull up one page
+ * starting at offset 0 for this key type. These are also opaque to btrfs,
+ * we're blindly storing whatever fsverity sends down.
+ *
+ * Another important consideration is the fact that the Merkle tree data scales
+ * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's
+ * ~1/127th the size) so for large files, writing the tree can be a lengthy
+ * operation. For that reason, we guard the whole enable verity operation
+ * (between begin_enable_verity and end_enable_verity) with an orphan item.
+ * Again, because the data can be pretty large, it's quite possible that we
+ * could run out of space writing it, so we try our best to handle errors by
+ * stopping and rolling back rather than aborting the victim transaction.
+ */
+
+#define MERKLE_START_ALIGN 65536
+
+/*
+ * Compute the logical file offset where we cache the Merkle tree.
+ *
+ * @inode: inode of the verity file
+ *
+ * For the purposes of caching the Merkle tree pages, as required by
+ * fs-verity, it is convenient to do size computations in terms of a file
+ * offset, rather than in terms of page indices.
+ *
+ * Use 64K to be sure it's past the last page in the file, even with 64K pages.
+ * That rounding operation itself can overflow loff_t, so we do it in u64 and
+ * check.
+ *
+ * Returns the file offset on success, negative error code on failure.
+ */
+static loff_t merkle_file_pos(const struct inode *inode)
+{
+ u64 sz = inode->i_size;
+ u64 rounded = round_up(sz, MERKLE_START_ALIGN);
+
+ if (rounded > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ return rounded;
+}
+
+/*
+ * Drop all the items for this inode with this key_type.
+ *
+ * @inode: inode to drop items for
+ * @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM or
+ * BTRFS_VERITY_MERKLE_ITEM)
+ *
+ * Before doing a verity enable we cleanup any existing verity items.
+ * This is also used to clean up if a verity enable failed half way through.
+ *
+ * Returns number of dropped items on success, negative error code on failure.
+ */
+static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int count = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ /* 1 for the item being dropped */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ /*
+ * Walk backwards through all the items until we find one that
+ * isn't from our key type or objectid
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ ret = 0;
+ /* No more keys of this type, we're done */
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ } else if (ret < 0) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ /* No more keys of this type, we're done */
+ if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+ break;
+
+ /*
+ * This shouldn't be a performance sensitive function because
+ * it's not used as part of truncate. If it ever becomes
+ * perf sensitive, change this to walk forward and bulk delete
+ * items
+ */
+ ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ count++;
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ }
+ ret = count;
+ btrfs_end_transaction(trans);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Drop all verity items
+ *
+ * @inode: inode to drop verity items for
+ *
+ * In most contexts where we are dropping verity items, we want to do it for all
+ * the types of verity items, not a particular one.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int btrfs_drop_verity_items(struct btrfs_inode *inode)
+{
+ int ret;
+
+ ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY);
+ if (ret < 0)
+ return ret;
+ ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Insert and write inode items with a given key type and offset.
+ *
+ * @inode: inode to insert for
+ * @key_type: key type to insert
+ * @offset: item offset to insert at
+ * @src: source data to write
+ * @len: length of source data to write
+ *
+ * Write len bytes from src into items of up to 2K length.
+ * The inserted items will have key (ino, key_type, offset + off) where off is
+ * consecutively increasing from 0 up to the last item ending at offset + len.
+ *
+ * Returns 0 on success and a negative error code on failure.
+ */
+static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+ const char *src, u64 len)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long copy_bytes;
+ unsigned long src_offset = 0;
+ void *data;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (len > 0) {
+ /* 1 for the new item being inserted */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = offset;
+
+ /*
+ * Insert 2K at a time mostly to be friendly for smaller leaf
+ * size filesystems
+ */
+ copy_bytes = min_t(u64, len, 2048);
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ break;
+ }
+
+ leaf = path->nodes[0];
+
+ data = btrfs_item_ptr(leaf, path->slots[0], void);
+ write_extent_buffer(leaf, src + src_offset,
+ (unsigned long)data, copy_bytes);
+ offset += copy_bytes;
+ src_offset += copy_bytes;
+ len -= copy_bytes;
+
+ btrfs_release_path(path);
+ btrfs_end_transaction(trans);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Read inode items of the given key type and offset from the btree.
+ *
+ * @inode: inode to read items of
+ * @key_type: key type to read
+ * @offset: item offset to read from
+ * @dest: Buffer to read into. This parameter has slightly tricky
+ * semantics. If it is NULL, the function will not do any copying
+ * and will just return the size of all the items up to len bytes.
+ * If dest_page is passed, then the function will kmap_local the
+ * page and ignore dest, but it must still be non-NULL to avoid the
+ * counting-only behavior.
+ * @len: length in bytes to read
+ * @dest_page: copy into this page instead of the dest buffer
+ *
+ * Helper function to read items from the btree. This returns the number of
+ * bytes read or < 0 for errors. We can return short reads if the items don't
+ * exist on disk or aren't big enough to fill the desired length. Supports
+ * reading into a provided buffer (dest) or into the page cache
+ *
+ * Returns number of bytes read or a negative error code on failure.
+ */
+static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
+ char *dest, u64 len, struct page *dest_page)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 item_end;
+ u64 copy_end;
+ int copied = 0;
+ u32 copy_offset;
+ unsigned long copy_bytes;
+ unsigned long dest_offset = 0;
+ void *data;
+ char *kaddr = dest;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (dest_page)
+ path->reada = READA_FORWARD;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = key_type;
+ key.offset = offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ goto out;
+ path->slots[0]--;
+ }
+
+ while (len > 0) {
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != btrfs_ino(inode) || key.type != key_type)
+ break;
+
+ item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;
+
+ if (copied > 0) {
+ /*
+ * Once we've copied something, we want all of the items
+ * to be sequential
+ */
+ if (key.offset != offset)
+ break;
+ } else {
+ /*
+ * Our initial offset might be in the middle of an
+ * item. Make sure it all makes sense.
+ */
+ if (key.offset > offset)
+ break;
+ if (item_end <= offset)
+ break;
+ }
+
+ /* dest = NULL to just sum all the item lengths */
+ if (!dest)
+ copy_end = item_end;
+ else
+ copy_end = min(offset + len, item_end);
+
+ /* Number of bytes in this item we want to copy */
+ copy_bytes = copy_end - offset;
+
+ /* Offset from the start of item for copying */
+ copy_offset = offset - key.offset;
+
+ if (dest) {
+ if (dest_page)
+ kaddr = kmap_local_page(dest_page);
+
+ data = btrfs_item_ptr(leaf, path->slots[0], void);
+ read_extent_buffer(leaf, kaddr + dest_offset,
+ (unsigned long)data + copy_offset,
+ copy_bytes);
+
+ if (dest_page)
+ kunmap_local(kaddr);
+ }
+
+ offset += copy_bytes;
+ dest_offset += copy_bytes;
+ len -= copy_bytes;
+ copied += copy_bytes;
+
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ /*
+ * We've reached the last slot in this leaf and we need
+ * to go to the next leaf.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ }
+out:
+ btrfs_free_path(path);
+ if (!ret)
+ ret = copied;
+ return ret;
+}
+
+/*
+ * Delete an fsverity orphan
+ *
+ * @trans: transaction to do the delete in
+ * @inode: inode to orphan
+ *
+ * Capture verity orphan specific logic that is repeated in the couple of places
+ * we delete verity orphans. Specifically, handling ENOENT and ignoring inodes
+ * with 0 links.
+ *
+ * Returns zero on success or a negative error code on failure.
+ */
+static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+ int ret;
+
+ /*
+ * If the inode has no links, it is either already unlinked, or was
+ * created with O_TMPFILE. In either case, it should have an orphan from
+ * that other operation. Rather than reference count the orphans, we
+ * simply ignore them here, because we only invoke the verity path in
+ * the orphan logic when i_nlink is 1.
+ */
+ if (!inode->vfs_inode.i_nlink)
+ return 0;
+
+ ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * Rollback in-progress verity if we encounter an error.
+ *
+ * @inode: inode verity had an error for
+ *
+ * We try to handle recoverable errors while enabling verity by rolling it back
+ * and just failing the operation, rather than having an fs level error no
+ * matter what. However, any error in rollback is unrecoverable.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int rollback_verity(struct btrfs_inode *inode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = inode->root;
+ int ret;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+ truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
+ clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ ret = btrfs_drop_verity_items(inode);
+ if (ret) {
+ btrfs_handle_fs_error(root->fs_info, ret,
+ "failed to drop verity items in rollback %llu",
+ (u64)inode->vfs_inode.i_ino);
+ goto out;
+ }
+
+ /*
+ * 1 for updating the inode flag
+ * 1 for deleting the orphan
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_handle_fs_error(root->fs_info, ret,
+ "failed to start transaction in verity rollback %llu",
+ (u64)inode->vfs_inode.i_ino);
+ goto out;
+ }
+ inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ ret = del_orphan(trans, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ btrfs_end_transaction(trans);
+out:
+ return ret;
+}
+
+/*
+ * Finalize making the file a valid verity file
+ *
+ * @inode: inode to be marked as verity
+ * @desc: contents of the verity descriptor to write (not NULL)
+ * @desc_size: size of the verity descriptor
+ *
+ * Do the actual work of finalizing verity after successfully writing the Merkle
+ * tree:
+ *
+ * - write out the descriptor items
+ * - mark the inode with the verity flag
+ * - delete the orphan item
+ * - mark the ro compat bit
+ * - clear the in progress bit
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int finish_verity(struct btrfs_inode *inode, const void *desc,
+ size_t desc_size)
+{
+ struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_verity_descriptor_item item;
+ int ret;
+
+ /* Write out the descriptor item */
+ memset(&item, 0, sizeof(item));
+ btrfs_set_stack_verity_descriptor_size(&item, desc_size);
+ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
+ (const char *)&item, sizeof(item));
+ if (ret)
+ goto out;
+
+ /* Write out the descriptor itself */
+ ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
+ desc, desc_size);
+ if (ret)
+ goto out;
+
+ /*
+ * 1 for updating the inode flag
+ * 1 for deleting the orphan
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ inode->ro_flags |= BTRFS_INODE_RO_VERITY;
+ btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto end_trans;
+ ret = del_orphan(trans, inode);
+ if (ret)
+ goto end_trans;
+ clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ btrfs_set_fs_compat_ro(root->fs_info, VERITY);
+end_trans:
+ btrfs_end_transaction(trans);
+out:
+ return ret;
+}
+
+/*
+ * fsverity op that begins enabling verity.
+ *
+ * @filp: file to enable verity on
+ *
+ * Begin enabling fsverity for the file. We drop any existing verity items, add
+ * an orphan and set the in progress bit.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int btrfs_begin_enable_verity(struct file *filp)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ASSERT(inode_is_locked(file_inode(filp)));
+
+ if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
+ return -EBUSY;
+
+ /*
+ * This should almost never do anything, but theoretically, it's
+ * possible that we failed to enable verity on a file, then were
+ * interrupted or failed while rolling back, failed to clean up the
+ * orphan, and are now attempting to enable verity again.
+ */
+ ret = btrfs_drop_verity_items(inode);
+ if (ret)
+ return ret;
+
+ /* 1 for the orphan item */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (!ret)
+ set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
+ btrfs_end_transaction(trans);
+
+ return 0;
+}
+
+/*
+ * fsverity op that ends enabling verity.
+ *
+ * @filp: file we are finishing enabling verity on
+ * @desc: verity descriptor to write out (NULL in error conditions)
+ * @desc_size: size of the verity descriptor (variable with signatures)
+ * @merkle_tree_size: size of the merkle tree in bytes
+ *
+ * If desc is NULL, the VFS is signaling that an error occurred during verity
+ * enable, and we should try to roll back. Otherwise, attempt to finish verity.
+ *
+ * Returns 0 on success, negative error code on error.
+ */
+static int btrfs_end_enable_verity(struct file *filp, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
+ int ret = 0;
+ int rollback_ret;
+
+ ASSERT(inode_is_locked(file_inode(filp)));
+
+ if (desc == NULL)
+ goto rollback;
+
+ ret = finish_verity(inode, desc, desc_size);
+ if (ret)
+ goto rollback;
+ return ret;
+
+rollback:
+ rollback_ret = rollback_verity(inode);
+ if (rollback_ret)
+ btrfs_err(inode->root->fs_info,
+ "failed to rollback verity items: %d", rollback_ret);
+ return ret;
+}
+
+/*
+ * fsverity op that gets the struct fsverity_descriptor.
+ *
+ * @inode: inode to get the descriptor of
+ * @buf: output buffer for the descriptor contents
+ * @buf_size: size of the output buffer. 0 to query the size
+ *
+ * fsverity does a two-pass setup for reading the descriptor: in the first pass
+ * it calls with buf_size = 0 to query the size of the descriptor, and then in
+ * the second pass it actually reads the descriptor off disk.
+ *
+ * Returns the size on success or a negative error code on failure.
+ */
+static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ u64 true_size;
+ int ret = 0;
+ struct btrfs_verity_descriptor_item item;
+
+ memset(&item, 0, sizeof(item));
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0,
+ (char *)&item, sizeof(item), NULL);
+ if (ret < 0)
+ return ret;
+
+ if (item.reserved[0] != 0 || item.reserved[1] != 0)
+ return -EUCLEAN;
+
+ true_size = btrfs_stack_verity_descriptor_size(&item);
+ if (true_size > INT_MAX)
+ return -EUCLEAN;
+
+ if (buf_size == 0)
+ return true_size;
+ if (buf_size < true_size)
+ return -ERANGE;
+
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1,
+ buf, buf_size, NULL);
+ if (ret < 0)
+ return ret;
+ if (ret != true_size)
+ return -EIO;
+
+ return true_size;
+}
+
+/*
+ * fsverity op that reads and caches a merkle tree page.
+ *
+ * @inode: inode to read a merkle tree page for
+ * @index: page index relative to the start of the merkle tree
+ * @num_ra_pages: number of pages to readahead. Optional, we ignore it
+ *
+ * The Merkle tree is stored in the filesystem btree, but its pages are cached
+ * with a logical position past EOF in the inode's mapping.
+ *
+ * Returns the page we read, or an ERR_PTR on error.
+ */
+static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
+ pgoff_t index,
+ unsigned long num_ra_pages)
+{
+ struct page *page;
+ u64 off = (u64)index << PAGE_SHIFT;
+ loff_t merkle_pos = merkle_file_pos(inode);
+ int ret;
+
+ if (merkle_pos < 0)
+ return ERR_PTR(merkle_pos);
+ if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
+ return ERR_PTR(-EFBIG);
+ index += merkle_pos >> PAGE_SHIFT;
+again:
+ page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+ if (page) {
+ if (PageUptodate(page))
+ return page;
+
+ lock_page(page);
+ /*
+ * We only insert uptodate pages, so !Uptodate has to be
+ * an error
+ */
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ unlock_page(page);
+ return page;
+ }
+
+ page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Merkle item keys are indexed from byte 0 in the merkle tree.
+ * They have the form:
+ *
+ * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset in bytes ]
+ */
+ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
+ page_address(page), PAGE_SIZE, page);
+ if (ret < 0) {
+ put_page(page);
+ return ERR_PTR(ret);
+ }
+ if (ret < PAGE_SIZE)
+ memzero_page(page, ret, PAGE_SIZE - ret);
+
+ SetPageUptodate(page);
+ ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);
+
+ if (!ret) {
+ /* Inserted and ready for fsverity */
+ unlock_page(page);
+ } else {
+ put_page(page);
+ /* Did someone race us into inserting this page? */
+ if (ret == -EEXIST)
+ goto again;
+ page = ERR_PTR(ret);
+ }
+ return page;
+}
+
+/*
+ * fsverity op that writes a Merkle tree block into the btree.
+ *
+ * @inode: inode to write a Merkle tree block for
+ * @buf: Merkle tree data block to write
+ * @index: index of the block in the Merkle tree
+ * @log_blocksize: log base 2 of the Merkle tree block size
+ *
+ * Note that the block size could be different from the page size, so it is not
+ * safe to assume that index is a page index.
+ *
+ * Returns 0 on success or negative error code on failure
+ */
+static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
+ u64 index, int log_blocksize)
+{
+ u64 off = index << log_blocksize;
+ u64 len = 1ULL << log_blocksize;
+ loff_t merkle_pos = merkle_file_pos(inode);
+
+ if (merkle_pos < 0)
+ return merkle_pos;
+ if (merkle_pos > inode->i_sb->s_maxbytes - off - len)
+ return -EFBIG;
+
+ return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY,
+ off, buf, len);
+}
+
+const struct fsverity_operations btrfs_verityops = {
+ .begin_enable_verity = btrfs_begin_enable_verity,
+ .end_enable_verity = btrfs_end_enable_verity,
+ .get_verity_descriptor = btrfs_get_verity_descriptor,
+ .read_merkle_tree_page = btrfs_read_merkle_tree_page,
+ .write_merkle_tree_block = btrfs_write_merkle_tree_block,
+};
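Aside (not part of the patch): write_key_bytes() and read_key_bytes() above split an
arbitrary byte stream into btree items of at most 2048 bytes, keyed by
(objectid, key type, byte offset), and stitch the stream back together on read. Below
is a minimal user-space sketch of that layout, with a plain array standing in for the
btree and a made-up key type value; the kernel versions additionally run one
transaction per inserted item and allow short reads, which the sketch omits.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ITEM_MAX 2048 /* mirrors the 2K cap used by write_key_bytes() */

struct item {                     /* stand-in for a btree item */
	unsigned long long objectid;
	unsigned char type;
	unsigned long long offset; /* byte offset within the stream */
	size_t len;
	unsigned char data[ITEM_MAX];
};

/* Split src into items of at most ITEM_MAX bytes, keyed by stream offset. */
static size_t write_key_bytes(struct item *items, unsigned long long ino,
			      unsigned char type, unsigned long long offset,
			      const unsigned char *src, size_t len)
{
	size_t n = 0;

	while (len > 0) {
		size_t copy = len < ITEM_MAX ? len : ITEM_MAX;

		items[n].objectid = ino;
		items[n].type = type;
		items[n].offset = offset;
		items[n].len = copy;
		memcpy(items[n].data, src, copy);
		src += copy;
		offset += copy;
		len -= copy;
		n++;
	}
	return n;
}

/* Reassemble the stream starting at offset; returns the bytes copied. */
static size_t read_key_bytes(const struct item *items, size_t n,
			     unsigned long long offset,
			     unsigned char *dest, size_t len)
{
	size_t copied = 0;

	for (size_t i = 0; i < n && len > 0; i++) {
		unsigned long long end = items[i].offset + items[i].len;

		if (items[i].offset > offset || end <= offset)
			continue;
		size_t off_in_item = offset - items[i].offset;
		size_t copy = end - offset < len ? end - offset : len;

		memcpy(dest + copied, items[i].data + off_in_item, copy);
		offset += copy;
		copied += copy;
		len -= copy;
	}
	return copied;
}

int main(void)
{
	unsigned char src[5000], back[5000];
	struct item items[8];

	for (size_t i = 0; i < sizeof(src); i++)
		src[i] = (unsigned char)i;
	size_t n = write_key_bytes(items, 257, 36 /* hypothetical key type */,
				   0, src, sizeof(src));
	size_t got = read_key_bytes(items, n, 0, back, sizeof(back));
	printf("%zu items, %zu bytes, match=%d\n", n, got,
	       memcmp(src, back, got) == 0);
	return 0;
}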
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 807502cd6510..ec3a874165de 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -38,7 +38,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.sub_stripes = 2,
.dev_stripes = 1,
.devs_max = 0, /* 0 == as many as possible */
- .devs_min = 4,
+ .devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
@@ -103,7 +103,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
- .devs_min = 2,
+ .devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
@@ -153,6 +153,32 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};
+/*
+ * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
+ * can be used as index to access btrfs_raid_array[].
+ */
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
+{
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
+ return BTRFS_RAID_RAID10;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+ return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
+ return BTRFS_RAID_RAID1C3;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
+ return BTRFS_RAID_RAID1C4;
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return BTRFS_RAID_DUP;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
+
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
@@ -404,44 +430,6 @@ void __exit btrfs_cleanup_fs_uuids(void)
}
}
-/*
- * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
- * Returned struct is not linked onto any lists and must be destroyed using
- * btrfs_free_device.
- */
-static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_device *dev;
-
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
- if (!dev)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Preallocate a bio that's always going to be used for flushing device
- * barriers and matches the device lifespan
- */
- dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
- if (!dev->flush_bio) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
- INIT_LIST_HEAD(&dev->dev_list);
- INIT_LIST_HEAD(&dev->dev_alloc_list);
- INIT_LIST_HEAD(&dev->post_commit_list);
-
- atomic_set(&dev->reada_in_flight, 0);
- atomic_set(&dev->dev_stats_ccnt, 0);
- btrfs_device_data_ordered_init(dev);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(fs_info, &dev->alloc_state,
- IO_TREE_DEVICE_ALLOC_STATE, NULL);
-
- return dev;
-}
-
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@@ -1078,6 +1066,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
@@ -1129,6 +1118,9 @@ static void btrfs_close_one_device(struct btrfs_device *device)
fs_devices->rw_devices--;
}
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
fs_devices->missing_devices--;
@@ -1227,7 +1219,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
static int devid_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
- struct btrfs_device *dev1, *dev2;
+ const struct btrfs_device *dev1, *dev2;
dev1 = list_entry(a, struct btrfs_device, dev_list);
dev2 = list_entry(b, struct btrfs_device, dev_list);
@@ -1597,14 +1589,9 @@ again:
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
- if (ret > 0) {
- ret = btrfs_previous_item(root, path, key.objectid, key.type);
- if (ret < 0)
- goto out;
- }
while (1) {
l = path->nodes[0];
@@ -1745,61 +1732,14 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
goto out;
}
*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to remove dev extent item");
- } else {
+ if (ret == 0)
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
- }
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 chunk_offset, u64 start, u64 num_bytes)
-{
- int ret;
- struct btrfs_path *path;
- struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_root *root = fs_info->dev_root;
- struct btrfs_dev_extent *extent;
- struct extent_buffer *leaf;
- struct btrfs_key key;
-
- WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
- WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = device->devid;
- key.offset = start;
- key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- sizeof(*extent));
- if (ret)
- goto out;
-
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_dev_extent);
- btrfs_set_dev_extent_chunk_tree(leaf, extent,
- BTRFS_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_objectid(leaf, extent,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
- btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
- btrfs_set_dev_extent_length(leaf, extent, num_bytes);
- btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
@@ -2007,12 +1947,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
if (!(all_avail & btrfs_raid_array[i].bg_flag))
continue;
- if (num_devices < btrfs_raid_array[i].devs_min) {
- int ret = btrfs_raid_array[i].mindev_error;
-
- if (ret)
- return ret;
- }
+ if (num_devices < btrfs_raid_array[i].devs_min)
+ return btrfs_raid_array[i].mindev_error;
}
return 0;
@@ -2141,7 +2077,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (IS_ERR(device)) {
if (PTR_ERR(device) == -ENOENT &&
- strcmp(device_path, "missing") == 0)
+ device_path && strcmp(device_path, "missing") == 0)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = PTR_ERR(device);
@@ -2942,7 +2878,7 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
u32 cur;
struct btrfs_key key;
- mutex_lock(&fs_info->chunk_mutex);
+ lockdep_assert_held(&fs_info->chunk_mutex);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
@@ -2972,7 +2908,6 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
cur += len;
}
}
- mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
@@ -3012,6 +2947,29 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
return em;
}
+static int remove_chunk_item(struct btrfs_trans_handle *trans,
+ struct map_lookup *map, u64 chunk_offset)
+{
+ int i;
+
+ /*
+ * Removing chunk items and updating the device items in the chunks btree
+ * requires holding the chunk_mutex.
+ * See the comment at btrfs_chunk_alloc() for the details.
+ */
+ lockdep_assert_held(&trans->fs_info->chunk_mutex);
+
+ for (i = 0; i < map->num_stripes; i++) {
+ int ret;
+
+ ret = btrfs_update_device(trans, map->stripes[i].dev);
+ if (ret)
+ return ret;
+ }
+
+ return btrfs_free_chunk(trans, chunk_offset);
+}
+
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -3032,14 +2990,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
return PTR_ERR(em);
}
map = em->map_lookup;
- mutex_lock(&fs_info->chunk_mutex);
- check_system_chunk(trans, map->type);
- mutex_unlock(&fs_info->chunk_mutex);
/*
- * Take the device list mutex to prevent races with the final phase of
- * a device replace operation that replaces the device object associated
- * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
+ * First delete the device extent items from the devices btree.
+ * We take the device_list_mutex to avoid racing with the finishing phase
+ * of a device replace operation. See the comment below before acquiring
+ * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
+ * because that can result in a deadlock when deleting the device extent
+ * items from the devices btree - COWing an extent buffer from the btree
+ * may result in allocating a new metadata chunk, which would attempt to
+ * lock fs_info->chunk_mutex again.
*/
mutex_lock(&fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
@@ -3061,18 +3021,73 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
}
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
- ret = btrfs_update_device(trans, device);
+ /*
+ * We acquire fs_info->chunk_mutex for 2 reasons:
+ *
+ * 1) Just like with the first phase of the chunk allocation, we must
+ * reserve system space, do all chunk btree updates and deletions, and
+ * update the system chunk array in the superblock while holding this
+ * mutex. This is for similar reasons as explained in the comment at
+ * the top of btrfs_chunk_alloc();
+ *
+ * 2) Prevent races with the final phase of a device replace operation
+ * that replaces the device object associated with the map's stripes,
+ * because the device object's id can change at any time during that
+ * final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of
+ * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
+ * the device item, which does not exist in the chunk btree.
+ * The finishing phase of device replace acquires both the
+ * device_list_mutex and the chunk_mutex, in that order, so we are
+ * safe by just acquiring the chunk_mutex.
+ */
+ trans->removing_chunk = true;
+ mutex_lock(&fs_info->chunk_mutex);
+
+ check_system_chunk(trans, map->type);
+
+ ret = remove_chunk_item(trans, map, chunk_offset);
+ /*
+ * Normally we should not get -ENOSPC since we reserved space before
+ * through the call to check_system_chunk().
+ *
+ * Despite our system space_info having enough free space, we may not
+ * be able to allocate extents from its block groups, because all have
+ * an incompatible profile, which will force us to allocate a new system
+ * block group with the right profile, or right after we called
+ * check_system_chunk() above, a scrub turned the only system block group
+ * with enough free space into RO mode.
+ * This is explained with more detail at do_chunk_alloc().
+ *
+ * So if we get -ENOSPC, allocate a new system chunk and retry once.
+ */
+ if (ret == -ENOSPC) {
+ const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
+ struct btrfs_block_group *sys_bg;
+
+ sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ if (IS_ERR(sys_bg)) {
+ ret = PTR_ERR(sys_bg);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
if (ret) {
- mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
}
- }
- mutex_unlock(&fs_devices->device_list_mutex);
- ret = btrfs_free_chunk(trans, chunk_offset);
- if (ret) {
+ ret = remove_chunk_item(trans, map, chunk_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -3087,6 +3102,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
}
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans->removing_chunk = false;
+
+ /*
+ * We are done with chunk btree updates and deletions, so release the
+ * system space we previously reserved (with check_system_chunk()).
+ */
+ btrfs_trans_release_chunk_metadata(trans);
+
ret = btrfs_remove_block_group(trans, chunk_offset, em);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3094,6 +3118,10 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
out:
+ if (trans->removing_chunk) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans->removing_chunk = false;
+ }
/* once for us */
free_extent_map(em);
return ret;
@@ -3534,10 +3562,7 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
const int ncopies = btrfs_raid_array[index].ncopies;
const int nparity = btrfs_raid_array[index].nparity;
- if (nparity)
- return num_stripes - nparity;
- else
- return num_stripes / ncopies;
+ return (num_stripes - nparity) / ncopies;
}
/* [pstart, pend) */
@@ -3937,6 +3962,13 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ btrfs_err(fs_info,
+ "RAID56 is not yet supported for sectorsize %u with page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ return false;
+ }
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
@@ -4860,13 +4892,12 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
u32 array_size;
u8 *ptr;
- mutex_lock(&fs_info->chunk_mutex);
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
- > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
- mutex_unlock(&fs_info->chunk_mutex);
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
return -EFBIG;
- }
ptr = super_copy->sys_chunk_array + array_size;
btrfs_cpu_key_to_disk(&disk_key, key);
@@ -4875,7 +4906,6 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
- mutex_unlock(&fs_info->chunk_mutex);
return 0;
}
@@ -5225,13 +5255,14 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
}
}
-static int create_chunk(struct btrfs_trans_handle *trans,
+static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = trans->fs_info;
struct map_lookup *map = NULL;
struct extent_map_tree *em_tree;
+ struct btrfs_block_group *block_group;
struct extent_map *em;
u64 start = ctl->start;
u64 type = ctl->type;
@@ -5241,7 +5272,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
if (!map)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
map->num_stripes = ctl->num_stripes;
for (i = 0; i < ctl->ndevs; ++i) {
@@ -5263,7 +5294,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
em = alloc_extent_map();
if (!em) {
kfree(map);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
@@ -5279,12 +5310,12 @@ static int create_chunk(struct btrfs_trans_handle *trans,
if (ret) {
write_unlock(&em_tree->lock);
free_extent_map(em);
- return ret;
+ return ERR_PTR(ret);
}
write_unlock(&em_tree->lock);
- ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
- if (ret)
+ block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+ if (IS_ERR(block_group))
goto error_del_extent;
for (i = 0; i < map->num_stripes; i++) {
@@ -5304,7 +5335,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
- return 0;
+ return block_group;
error_del_extent:
write_lock(&em_tree->lock);
@@ -5316,34 +5347,36 @@ error_del_extent:
/* One for the tree reference */
free_extent_map(em);
- return ret;
+ return block_group;
}
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
+struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
struct btrfs_device_info *devices_info = NULL;
struct alloc_chunk_ctl ctl;
+ struct btrfs_block_group *block_group;
int ret;
lockdep_assert_held(&info->chunk_mutex);
if (!alloc_profile_is_valid(type, 0)) {
ASSERT(0);
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
if (list_empty(&fs_devices->alloc_list)) {
if (btrfs_test_opt(info, ENOSPC_DEBUG))
btrfs_debug(info, "%s: no writable device", __func__);
- return -ENOSPC;
+ return ERR_PTR(-ENOSPC);
}
if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(info, "invalid chunk type 0x%llx requested", type);
ASSERT(0);
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
ctl.start = find_next_chunk(info);
@@ -5353,100 +5386,111 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
GFP_NOFS);
if (!devices_info)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
ret = gather_device_info(fs_devices, &ctl, devices_info);
- if (ret < 0)
+ if (ret < 0) {
+ block_group = ERR_PTR(ret);
goto out;
+ }
ret = decide_stripe_size(fs_devices, &ctl, devices_info);
- if (ret < 0)
+ if (ret < 0) {
+ block_group = ERR_PTR(ret);
goto out;
+ }
- ret = create_chunk(trans, &ctl, devices_info);
+ block_group = create_chunk(trans, &ctl, devices_info);
out:
kfree(devices_info);
- return ret;
+ return block_group;
}
/*
- * Chunk allocation falls into two parts. The first part does work
- * that makes the new allocated chunk usable, but does not do any operation
- * that modifies the chunk tree. The second part does the work that
- * requires modifying the chunk tree. This division is important for the
- * bootstrap process of adding storage to a seed btrfs.
+ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to
+ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
+ * chunks.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
*/
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
- u64 chunk_offset, u64 chunk_size)
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_key key;
- struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
struct extent_map *em;
struct map_lookup *map;
size_t item_size;
- u64 dev_offset;
- u64 stripe_size;
- int i = 0;
- int ret = 0;
+ int i;
+ int ret;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ /*
+ * We take the chunk_mutex for 2 reasons:
+ *
+ * 1) Updates and insertions in the chunk btree must be done while holding
+ * the chunk_mutex, as well as updating the system chunk array in the
+ * superblock. See the comment on top of btrfs_chunk_alloc() for the
+ * details;
+ *
+ * 2) To prevent races with the final phase of a device replace operation
+ * that replaces the device object associated with the map's stripes,
+ * because the device object's id can change at any time during that
+ * final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+ * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+ * which would cause a failure when updating the device item, which does
+ * not exist, or persisting a stripe of the chunk item with such ID.
+ * Here we can't use the device_list_mutex because our caller already
+ * has locked the chunk_mutex, and the final phase of device replace
+ * acquires both mutexes - first the device_list_mutex and then the
+ * chunk_mutex. Using any of those two mutexes protects us from a
+ * concurrent device replace.
+ */
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
- stripe_size = em->orig_block_len;
chunk = kzalloc(item_size, GFP_NOFS);
if (!chunk) {
ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
- /*
- * Take the device list mutex to prevent races with the final phase of
- * a device replace operation that replaces the device object associated
- * with the map's stripes, because the device object's id can change
- * at any time during that final phase of the device replace operation
- * (dev-replace.c:btrfs_dev_replace_finishing()).
- */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
+ struct btrfs_device *device = map->stripes[i].dev;
ret = btrfs_update_device(trans, device);
if (ret)
- break;
- ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
- dev_offset, stripe_size);
- if (ret)
- break;
- }
- if (ret) {
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- goto out;
+ goto out;
}
stripe = &chunk->stripe;
for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- dev_offset = map->stripes[i].physical;
+ struct btrfs_device *device = map->stripes[i].dev;
+ const u64 dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_set_stack_chunk_length(chunk, chunk_size);
+ btrfs_set_stack_chunk_length(chunk, bg->length);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
btrfs_set_stack_chunk_type(chunk, map->type);
@@ -5458,15 +5502,18 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.type = BTRFS_CHUNK_ITEM_KEY;
- key.offset = chunk_offset;
+ key.offset = bg->start;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
- if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
- /*
- * TODO: Cleanup of inserted chunk root in case of
- * failure.
- */
+ if (ret)
+ goto out;
+
+ bg->chunk_item_inserted = 1;
+
+ if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
+ if (ret)
+ goto out;
}
out:
@@ -5479,16 +5526,41 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
u64 alloc_profile;
- int ret;
+ struct btrfs_block_group *meta_bg;
+ struct btrfs_block_group *sys_bg;
+
+ /*
+ * When adding a new device for sprouting, the seed device is read-only
+ * so we must first allocate a metadata and a system chunk. But before
+ * adding the block group items to the extent, device and chunk btrees,
+ * we must first:
+ *
+ * 1) Create both chunks without doing any changes to the btrees, as
+ * otherwise we would get -ENOSPC since the block groups from the
+ * seed device are read-only;
+ *
+ * 2) Add the device item for the new sprout device - finishing the setup
+ * of a new block group requires updating the device item in the chunk
+ * btree, so it must exist when we attempt to do it. The previous step
+ * ensures this does not fail with -ENOSPC.
+ *
+ * After that we can add the block group items to their btrees:
+ * update existing device item in the chunk btree, add a new block group
+ * item to the extent btree, add a new chunk item to the chunk btree and
+ * finally add the new device extent items to the devices btree.
+ */
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- ret = btrfs_alloc_chunk(trans, alloc_profile);
- if (ret)
- return ret;
+ meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ if (IS_ERR(meta_bg))
+ return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- ret = btrfs_alloc_chunk(trans, alloc_profile);
- return ret;
+ sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ if (IS_ERR(sys_bg))
+ return PTR_ERR(sys_bg);
+
+ return 0;
}
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
@@ -6745,9 +6817,31 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device(fs_info);
- if (IS_ERR(dev))
- return dev;
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Preallocate a bio that's always going to be used for flushing device
+ * barriers and matches the device lifespan
+ */
+ dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
+ if (!dev->flush_bio) {
+ kfree(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&dev->dev_list);
+ INIT_LIST_HEAD(&dev->dev_alloc_list);
+ INIT_LIST_HEAD(&dev->post_commit_list);
+
+ atomic_set(&dev->reada_in_flight, 0);
+ atomic_set(&dev->dev_stats_ccnt, 0);
+ btrfs_device_data_ordered_init(dev);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
if (devid)
tmp = *devid;
@@ -6783,15 +6877,7 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
- int index = btrfs_bg_flags_to_raid_index(type);
- int ncopies = btrfs_raid_array[index].ncopies;
- const int nparity = btrfs_raid_array[index].nparity;
- int data_stripes;
-
- if (nparity)
- data_stripes = num_stripes - nparity;
- else
- data_stripes = num_stripes / ncopies;
+ const int data_stripes = calc_data_stripes(type, num_stripes);
return div_u64(chunk_len, data_stripes);
}
@@ -7415,10 +7501,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
total_dev++;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
+
+ /*
+ * We are only called at mount time, so no need to take
+ * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
+ * we always lock fs_info->chunk_mutex first, before
+ * acquiring any locks on the chunk tree. This is a
+ * requirement for chunk allocation, see the comment on
+ * top of btrfs_chunk_alloc() for details.
+ */
+ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
- mutex_lock(&fs_info->chunk_mutex);
ret = read_one_chunk(&found_key, leaf, chunk);
- mutex_unlock(&fs_info->chunk_mutex);
if (ret)
goto error;
}
@@ -7958,7 +8052,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
goto out;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_item(root, path);
+ ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
/* No dev extents at all? Not good */
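Aside (not part of the patch): calc_data_stripes() above folds the old
nparity/ncopies branch into a single expression, which agrees with the old form as
long as the parity profiles use ncopies == 1. The values below are illustrative,
mirroring btrfs_raid_array, and the small user-space check just confirms the two
formulas match over those pairs.

#include <stdio.h>

/* Illustrative (ncopies, nparity) pairs; values mirror btrfs_raid_array. */
struct profile {
	const char *name;
	int ncopies;
	int nparity;
};

static int old_calc(const struct profile *p, int num_stripes)
{
	if (p->nparity)
		return num_stripes - p->nparity;
	return num_stripes / p->ncopies;
}

static int new_calc(const struct profile *p, int num_stripes)
{
	return (num_stripes - p->nparity) / p->ncopies;
}

int main(void)
{
	const struct profile profiles[] = {
		{ "single", 1, 0 }, { "raid0", 1, 0 }, { "raid1", 2, 0 },
		{ "raid10", 2, 0 }, { "raid5", 1, 1 }, { "raid6", 1, 2 },
	};

	for (size_t i = 0; i < sizeof(profiles) / sizeof(profiles[0]); i++)
		for (int n = 2; n <= 8; n++)
			if (old_calc(&profiles[i], n) != new_calc(&profiles[i], n))
				printf("mismatch: %s num_stripes=%d\n",
				       profiles[i].name, n);
	printf("done\n");
	return 0;
}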
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index c7fc7caf575c..b082250b42e0 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -450,7 +450,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
+struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num);
@@ -507,8 +508,8 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
-int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
- u64 chunk_offset, u64 chunk_size);
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
@@ -565,32 +566,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
atomic_inc(&dev->dev_stats_ccnt);
}
-/*
- * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
- * can be used as index to access btrfs_raid_array[].
- */
-static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
-{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
- return BTRFS_RAID_RAID1C3;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
- return BTRFS_RAID_RAID1C4;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
@@ -600,6 +575,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
struct block_device *bdev,
const char *device_path);
+enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
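Aside (not part of the patch): btrfs_bg_flags_to_raid_index() moved out of the header
but keeps the same flag-bit to array-index mapping, and the relaxed devs_min values
earlier in this series (RAID10 down to 2, RAID0 down to 1) are consumed through that
index. A user-space sketch with hypothetical flag bits and an illustrative attribute
table, mirroring the simplified check in btrfs_check_raid_min_devices():

#include <stdio.h>

/* Hypothetical flag bits and indexes, laid out like the kernel's. */
#define BG_RAID0  (1ULL << 3)
#define BG_RAID1  (1ULL << 4)
#define BG_RAID10 (1ULL << 6)

enum raid_index { RAID_SINGLE, RAID_RAID0, RAID_RAID1, RAID_RAID10, NR_RAID };

struct raid_attr {
	const char *name;
	int devs_min;
};

/* Illustrative table: RAID10 now needs 2 devices, RAID0 only 1. */
static const struct raid_attr raid_array[NR_RAID] = {
	[RAID_SINGLE] = { "single", 1 },
	[RAID_RAID0]  = { "raid0",  1 },
	[RAID_RAID1]  = { "raid1",  2 },
	[RAID_RAID10] = { "raid10", 2 },
};

static enum raid_index bg_flags_to_raid_index(unsigned long long flags)
{
	if (flags & BG_RAID10)
		return RAID_RAID10;
	if (flags & BG_RAID1)
		return RAID_RAID1;
	if (flags & BG_RAID0)
		return RAID_RAID0;
	return RAID_SINGLE;
}

int main(void)
{
	unsigned long long flags = BG_RAID10;
	enum raid_index idx = bg_flags_to_raid_index(flags);
	int num_devices = 2;

	if (num_devices < raid_array[idx].devs_min)
		printf("%s needs at least %d devices\n",
		       raid_array[idx].name, raid_array[idx].devs_min);
	else
		printf("%s is allowed with %d devices\n",
		       raid_array[idx].name, num_devices);
	return 0;
}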
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c3fa7d3fa770..8afa90074891 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -121,12 +121,12 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -148,26 +148,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
int i;
for (i = 0; i < in_buf_pages; i++) {
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = page_address(in_page);
memcpy(workspace->buf + i * PAGE_SIZE,
data_in, PAGE_SIZE);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
} else {
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = page_address(in_page);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -196,18 +192,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -234,18 +229,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
} else if (workspace->strm.avail_out == 0) {
/* get another page for the stream end */
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -264,13 +258,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
-
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
return ret;
}
@@ -286,10 +275,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
struct page **pages_in = cb->compressed_pages;
- u64 disk_start = cb->start;
- struct bio *orig_bio = cb->orig_bio;
- data_in = kmap(pages_in[page_in_index]);
+ data_in = page_address(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -311,7 +298,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
- kunmap(pages_in[page_in_index]);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -326,9 +312,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (buf_start == total_out)
break;
- ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
- total_out, disk_start,
- orig_bio);
+ ret2 = btrfs_decompress_buf2page(workspace->buf,
+ total_out - buf_start, cb, buf_start);
if (ret2 == 0) {
ret = 0;
goto done;
@@ -339,17 +324,16 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
- kunmap(pages_in[page_in_index]);
+
page_in_index++;
if (page_in_index >= total_pages_in) {
data_in = NULL;
break;
}
- data_in = kmap(pages_in[page_in_index]);
+ data_in = page_address(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
- workspace->strm.avail_in = min(tmp,
- PAGE_SIZE);
+ workspace->strm.avail_in = min(tmp, PAGE_SIZE);
}
}
if (ret != Z_STREAM_END)
@@ -358,10 +342,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret = 0;
done:
zlib_inflateEnd(&workspace->strm);
- if (data_in)
- kunmap(pages_in[page_in_index]);
if (!ret)
- zero_fill_bio(orig_bio);
+ zero_fill_bio(cb->orig_bio);
return ret;
}
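Aside (not part of the patch): zlib_compress_pages() feeds deflate one page-sized
output buffer at a time and grabs a fresh destination page whenever avail_out reaches
zero; dropping __GFP_HIGHMEM is what allows page_address() to replace kmap(). The same
chunked-output pattern with the user-space zlib API is sketched below (assumes zlib is
installed; link with -lz), using a 4 KiB chunk as a stand-in for a page.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

#define CHUNK 4096 /* stand-in for PAGE_SIZE */

int main(void)
{
	unsigned char src[32768];
	unsigned char out[CHUNK];
	z_stream strm;
	int nr_chunks = 0;

	memset(src, 'a', sizeof(src));          /* highly compressible input */
	memset(&strm, 0, sizeof(strm));
	if (deflateInit(&strm, Z_DEFAULT_COMPRESSION) != Z_OK)
		return 1;

	strm.next_in = src;
	strm.avail_in = sizeof(src);

	/* Refill the output buffer each time it fills up, as the kernel
	 * helper does by allocating another destination page. */
	do {
		strm.next_out = out;
		strm.avail_out = CHUNK;
		if (deflate(&strm, Z_FINISH) == Z_STREAM_ERROR) {
			deflateEnd(&strm);
			return 1;
		}
		nr_chunks++; /* CHUNK - strm.avail_out bytes were produced */
	} while (strm.avail_out == 0);

	printf("%zu bytes -> %lu bytes in %d chunk(s)\n",
	       sizeof(src), strm.total_out, nr_chunks);
	deflateEnd(&strm);
	return 0;
}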
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 297c0b1c0634..47af1ab3bf12 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -245,7 +245,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
goto out;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_item(root, path);
+ ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
/* No dev extents at all? Not good */
@@ -296,7 +296,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
- struct request_queue *queue = bdev_get_queue(bdev);
sector_t nr_sectors;
sector_t sector = 0;
struct blk_zone *zones = NULL;
@@ -348,19 +347,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
nr_sectors = bdev_nr_sectors(bdev);
zone_info->zone_size_shift = ilog2(zone_info->zone_size);
- zone_info->max_zone_append_size =
- (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
- btrfs_err(fs_info, "zoned: device %pg does not support zone append",
- bdev);
- ret = -EINVAL;
- goto out;
- }
-
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;
@@ -529,7 +519,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
u64 zoned_devices = 0;
u64 nr_devices = 0;
u64 zone_size = 0;
- u64 max_zone_append_size = 0;
const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
int ret = 0;
@@ -565,11 +554,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
ret = -EINVAL;
goto out;
}
- if (!max_zone_append_size ||
- (zone_info->max_zone_append_size &&
- zone_info->max_zone_append_size < max_zone_append_size))
- max_zone_append_size =
- zone_info->max_zone_append_size;
}
nr_devices++;
}
@@ -619,7 +603,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
}
fs_info->zone_size = zone_size;
- fs_info->max_zone_append_size = max_zone_append_size;
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
/*
@@ -1318,9 +1301,6 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
if (!btrfs_is_zoned(fs_info))
return false;
- if (!fs_info->max_zone_append_size)
- return false;
-
if (!is_data_inode(&inode->vfs_inode))
return false;
@@ -1349,8 +1329,7 @@ void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
return;
ordered->physical = physical;
- ordered->disk = bio->bi_bdev->bd_disk;
- ordered->partno = bio->bi_bdev->bd_partno;
+ ordered->bdev = bio->bi_bdev;
btrfs_put_ordered_extent(ordered);
}
@@ -1362,18 +1341,16 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_ordered_sum *sum;
- struct block_device *bdev;
u64 orig_logical = ordered->disk_bytenr;
u64 *logical = NULL;
int nr, stripe_len;
/* Zoned devices should not have partitions. So, we can assume it is 0 */
- ASSERT(ordered->partno == 0);
- bdev = bdgrab(ordered->disk->part0);
- if (WARN_ON(!bdev))
+ ASSERT(!bdev_is_partition(ordered->bdev));
+ if (WARN_ON(!ordered->bdev))
return;
- if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev,
+ if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
ordered->physical, &logical, &nr,
&stripe_len)))
goto out;
@@ -1402,7 +1379,6 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
out:
kfree(logical);
- bdput(bdev);
}
bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b0ae2608cb6b..4b299705bb12 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -22,7 +22,6 @@ struct btrfs_zoned_device_info {
*/
u64 zone_size;
u8 zone_size_shift;
- u64 max_zone_append_size;
u32 nr_zones;
unsigned long *seq_zones;
unsigned long *empty_zones;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 3e26b466476a..56dce9f00988 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -399,19 +399,19 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = page_address(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
/* Allocate and map in the output buffer */
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -446,19 +446,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -473,13 +472,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
- kunmap(in_page);
put_page(in_page);
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = page_address(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -506,19 +504,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -534,12 +531,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
out:
*out_pages = nr_pages;
/* Cleanup */
- if (in_page) {
- kunmap(in_page);
+ if (in_page)
put_page(in_page);
- }
- if (out_page)
- kunmap(out_page);
return ret;
}
@@ -547,8 +540,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct page **pages_in = cb->compressed_pages;
- u64 disk_start = cb->start;
- struct bio *orig_bio = cb->orig_bio;
size_t srclen = cb->compressed_len;
ZSTD_DStream *stream;
int ret = 0;
@@ -565,7 +556,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = page_address(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -589,7 +580,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->out_buf.pos = 0;
ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
- buf_start, total_out, disk_start, orig_bio);
+ total_out - buf_start, cb, buf_start);
if (ret == 0)
break;
@@ -601,23 +592,21 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- kunmap(pages_in[page_in_index++]);
+ page_in_index++;
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = page_address(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
}
ret = 0;
- zero_fill_bio(orig_bio);
+ zero_fill_bio(cb->orig_bio);
done:
- if (workspace->in_buf.src)
- kunmap(pages_in[page_in_index]);
return ret;
}
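Aside (not part of the patch): zstd_decompress_bio() walks the compressed pages as
successive ZSTD_inBuffer windows and drains the frame through one reusable output
buffer per destination page. The same streaming pattern with the user-space libzstd
API is sketched below (assumes libzstd is installed; link with -lzstd).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zstd.h>

#define CHUNK 4096 /* stand-in for the per-page buffers in the kernel */

int main(void)
{
	unsigned char src[32768], compressed[ZSTD_COMPRESSBOUND(32768)];
	unsigned char out[CHUNK];
	size_t csize, total_out = 0;

	memset(src, 'b', sizeof(src));
	csize = ZSTD_compress(compressed, sizeof(compressed),
			      src, sizeof(src), 1);
	if (ZSTD_isError(csize))
		return 1;

	ZSTD_DStream *ds = ZSTD_createDStream();
	if (!ds)
		return 1;
	ZSTD_initDStream(ds);

	ZSTD_inBuffer in = { compressed, csize, 0 };
	size_t ret;

	/* Drain the frame one fixed-size output buffer at a time, the way
	 * the kernel helper reuses workspace->out_buf for each page. */
	do {
		ZSTD_outBuffer obuf = { out, CHUNK, 0 };

		ret = ZSTD_decompressStream(ds, &obuf, &in);
		if (ZSTD_isError(ret)) {
			ZSTD_freeDStream(ds);
			return 1;
		}
		total_out += obuf.pos; /* obuf.pos bytes are ready to copy out */
	} while (ret != 0);

	printf("decompressed %zu -> %zu bytes\n", csize, total_out);
	ZSTD_freeDStream(ds);
	return 0;
}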