summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent_map.c123
-rw-r--r--fs/btrfs/fs.h1
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/btrfs/ioctl.c2
5 files changed, 99 insertions, 31 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 38cdb8875e8e..cabb558dbdaa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
+ spin_lock_init(&fs_info->extent_map_shrinker_lock);
+
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 744e8952abb0..b4c9a6aa118c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1028,7 +1028,14 @@ out_free_pre:
return ret;
}
-static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+struct btrfs_em_shrink_ctx {
+ long nr_to_scan;
+ long scanned;
+ u64 last_ino;
+ u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
@@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
- write_lock(&tree->lock);
+ /*
+ * We want to be fast because we can be called from any path trying to
+ * allocate memory, so if the lock is busy we don't want to spend time
+ * waiting for it - either some task is about to do IO for the inode or
+ * we may have another task shrinking extent maps, here in this code, so
+ * skip this inode.
+ */
+ if (!write_trylock(&tree->lock)) {
+ up_read(&inode->i_mmap_lock);
+ return 0;
+ }
+
node = rb_first_cached(&tree->map);
while (node) {
struct extent_map *em;
em = rb_entry(node, struct extent_map, rb_node);
node = rb_next(node);
- (*scanned)++;
+ ctx->scanned++;
if (em->flags & EXTENT_FLAG_PINNED)
goto next;
@@ -1085,16 +1103,18 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
free_extent_map(em);
nr_dropped++;
next:
- if (*scanned >= nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan)
break;
/*
- * Restart if we had to reschedule, and any extent maps that were
- * pinned before may have become unpinned after we released the
- * lock and took it again.
+ * Stop if we need to reschedule or there's contention on the
+ * lock. This is to avoid slowing other tasks trying to take the
+ * lock and because the shrinker might be called during a memory
+ * allocation path and we want to avoid taking a very long time
+ * and slowing down all sorts of tasks.
*/
- if (cond_resched_rwlock_write(&tree->lock))
- node = rb_first_cached(&tree->map);
+ if (need_resched() || rwlock_needbreak(&tree->lock))
+ break;
}
write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
@@ -1102,25 +1122,30 @@ next:
return nr_dropped;
}
-static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
- u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+ u64 min_ino = ctx->last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+ nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1;
- fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
- iput(&inode->vfs_inode);
+ ctx->last_ino = btrfs_ino(inode);
+ btrfs_add_delayed_iput(inode);
- if (*scanned >= nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan)
+ break;
+
+ /*
+ * We may be called from memory allocation paths, so we don't
+ * want to take too much time and slowdown tasks.
+ */
+ if (need_resched())
break;
- cond_resched();
inode = btrfs_find_first_inode(root, min_ino);
}
@@ -1132,14 +1157,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
* inode if there is one or we will find out this was the last
* one and move to the next root.
*/
- fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+ ctx->last_root = btrfs_root_id(root);
} else {
/*
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
* that when processing the next root we start from its first inode.
*/
- fs_info->extent_map_shrinker_last_ino = 0;
- fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+ ctx->last_ino = 0;
+ ctx->last_root = btrfs_root_id(root) + 1;
}
return nr_dropped;
@@ -1147,19 +1172,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
- const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
- u64 next_root_id = start_root_id;
+ struct btrfs_em_shrink_ctx ctx;
+ u64 start_root_id;
+ u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
- long scanned = 0;
+
+ ctx.scanned = 0;
+ ctx.nr_to_scan = nr_to_scan;
+
+ /*
+ * In case we have multiple tasks running this shrinker, make the next
+ * one start from the next inode in case it starts before we finish.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+ fs_info->extent_map_shrinker_last_ino++;
+ ctx.last_root = fs_info->extent_map_shrinker_last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ start_root_id = ctx.last_root;
+ next_root_id = ctx.last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+ nr, ctx.last_root,
+ ctx.last_ino);
}
- while (scanned < nr_to_scan) {
+ /*
+ * We may be called from memory allocation paths, so we don't want to
+ * take too much time and slowdown tasks, so stop if we need reschedule.
+ */
+ while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
struct btrfs_root *root;
unsigned long count;
@@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) {
next_root_id = 0;
- fs_info->extent_map_shrinker_last_root = 0;
- fs_info->extent_map_shrinker_last_ino = 0;
+ ctx.last_root = 0;
+ ctx.last_ino = 0;
cycled = true;
continue;
}
@@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
continue;
if (is_fstree(btrfs_root_id(root)))
- nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+ nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
}
+ /*
+ * In case of multiple tasks running this extent map shrinking code this
+ * isn't perfect but it's simple and silences things like KCSAN. It's
+ * not possible to know which task made more progress because we can
+ * cycle back to the first root and first inode if it's not the first
+ * time the shrinker ran, see the above logic. Also a task that started
+ * later may finish ealier than another task and made less progress. So
+ * make this simple and update to the progress of the last task that
+ * finished, with the occasional possiblity of having two consecutive
+ * runs of the shrinker process the same inodes.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+ fs_info->extent_map_shrinker_last_root = ctx.last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+ nr, ctx.last_root,
+ ctx.last_ino);
}
return nr_dropped;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 89f0650631cd..833dc3fe0a38 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -630,6 +630,7 @@ struct btrfs_fs_info {
s32 delalloc_batch;
struct percpu_counter evictable_extent_maps;
+ spinlock_t extent_map_shrinker_lock;
u64 extent_map_shrinker_last_root;
u64 extent_map_shrinker_last_ino;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3a2b902b2d1f..d62c96f00ff8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5587,7 +5587,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
args.ino = ino;
args.root = root;
- inode = iget5_locked(s, hashval, btrfs_find_actor,
+ inode = iget5_locked_rcu(s, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index efd5d6e9589e..6ad524b894fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4627,7 +4627,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
goto out_iov;
init_sync_kiocb(&kiocb, file);
- ret = kiocb_set_rw_flags(&kiocb, 0);
+ ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
if (ret)
goto out_iov;
kiocb.ki_pos = pos;