author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-12 15:01:38 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-12 15:01:38 -0700
commit | 26935fb06ee88f1188789807687c03041f3c70d9 (patch)
tree | 381c487716540b52348d78bee6555f8fa61d77ef /fs
parent | 3cc69b638e11bfda5d013c2b75b60934aa0e88a1 (diff)
parent | bf2ba3bc185269eca274b458aac46ba1ad7c1121 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull vfs pile 4 from Al Viro:
"list_lru pile, mostly"
This came out of Andrew's pile; Al ended up doing the merge work so that
Andrew didn't have to.
Additionally, a few fixes.
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (42 commits)
super: fix for destroy lrus
list_lru: dynamically adjust node arrays
shrinker: Kill old ->shrink API.
shrinker: convert remaining shrinkers to count/scan API
staging/lustre/libcfs: cleanup linux-mem.h
staging/lustre/ptlrpc: convert to new shrinker API
staging/lustre/obdclass: convert lu_object shrinker to count/scan API
staging/lustre/ldlm: convert to shrinkers to count/scan API
hugepage: convert huge zero page shrinker to new shrinker API
i915: bail out earlier when shrinker cannot acquire mutex
drivers: convert shrinkers to new count/scan API
fs: convert fs shrinkers to new scan/count API
xfs: fix dquot isolation hang
xfs-convert-dquot-cache-lru-to-list_lru-fix
xfs: convert dquot cache lru to list_lru
xfs: rework buffer dispose list tracking
xfs-convert-buftarg-lru-to-generic-code-fix
xfs: convert buftarg LRU to generic code
fs: convert inode and dentry shrinking to be node aware
vmscan: per-node deferred work
...
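The common thread in the shortlog above is the shrinker API change: the single ->shrink callback is removed and every shrinker is converted to a count_objects/scan_objects pair, with SHRINK_STOP replacing the old -1 return and SHRINKER_NUMA_AWARE enabling per-node scanning. Below is a minimal sketch of what a converted shrinker ends up looking like; my_cache_count(), my_cache_scan(), my_cache_lru, my_cache_nr and my_cache_lock are illustrative names, not anything defined by this series.

```c
#include <linux/shrinker.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/gfp.h>

/* Hypothetical cache state; stand-ins for a real filesystem's structures. */
static LIST_HEAD(my_cache_lru);
static DEFINE_SPINLOCK(my_cache_lock);
static unsigned long my_cache_nr;

/* count_objects: report how many objects could be freed; no reclaim here. */
static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	/* only an estimate is needed, so the unlocked read is acceptable */
	return my_cache_nr;
}

/* scan_objects: free up to sc->nr_to_scan objects, return the number freed. */
static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long freed = 0;

	if (!(sc->gfp_mask & __GFP_FS))
		return SHRINK_STOP;	/* cannot make progress in this context */

	spin_lock(&my_cache_lock);
	while (freed < sc->nr_to_scan && !list_empty(&my_cache_lru)) {
		/* a real cache would also free the object unlinked here */
		list_del_init(my_cache_lru.prev);
		my_cache_nr--;
		freed++;
	}
	spin_unlock(&my_cache_lock);

	return freed;
}

static struct shrinker my_cache_shrinker = {
	.count_objects	= my_cache_count,
	.scan_objects	= my_cache_scan,
	.seeks		= DEFAULT_SEEKS,
	/* caches backed by a list_lru also set .flags = SHRINKER_NUMA_AWARE */
};
```

The old interface overloaded one callback for both jobs (a zero nr_to_scan meant "just return the count"), which is what "shrinker: Kill old ->shrink API." removes once every caller is converted; registration still happens through register_shrinker()/unregister_shrinker() as before.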
Diffstat (limited to 'fs')
-rw-r--r-- | fs/dcache.c | 276
-rw-r--r-- | fs/drop_caches.c | 1
-rw-r--r-- | fs/ext4/extents_status.c | 33
-rw-r--r-- | fs/gfs2/glock.c | 30
-rw-r--r-- | fs/gfs2/main.c | 3
-rw-r--r-- | fs/gfs2/quota.c | 18
-rw-r--r-- | fs/gfs2/quota.h | 6
-rw-r--r-- | fs/inode.c | 193
-rw-r--r-- | fs/internal.h | 6
-rw-r--r-- | fs/mbcache.c | 49
-rw-r--r-- | fs/namei.c | 49
-rw-r--r-- | fs/nfs/dir.c | 16
-rw-r--r-- | fs/nfs/internal.h | 6
-rw-r--r-- | fs/nfs/super.c | 3
-rw-r--r-- | fs/nfsd/nfscache.c | 32
-rw-r--r-- | fs/quota/dquot.c | 34
-rw-r--r-- | fs/super.c | 111
-rw-r--r-- | fs/ubifs/shrinker.c | 29
-rw-r--r-- | fs/ubifs/super.c | 3
-rw-r--r-- | fs/ubifs/ubifs.h | 5
-rw-r--r-- | fs/xfs/xfs_buf.c | 253
-rw-r--r-- | fs/xfs/xfs_buf.h | 17
-rw-r--r-- | fs/xfs/xfs_dquot.c | 7
-rw-r--r-- | fs/xfs/xfs_icache.c | 4
-rw-r--r-- | fs/xfs/xfs_icache.h | 2
-rw-r--r-- | fs/xfs/xfs_qm.c | 287
-rw-r--r-- | fs/xfs/xfs_qm.h | 4
-rw-r--r-- | fs/xfs/xfs_super.c | 12
28 files changed, 823 insertions, 666 deletions
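Most of the fs/ churn in the diff below comes from moving the per-superblock dentry and inode LRUs (and the XFS buffer and dquot LRUs) onto the generic list_lru infrastructure from <linux/list_lru.h>. As a hedged sketch of the usage pattern those hunks follow — struct my_obj, my_lru, my_isolate() and my_cache_shrink() are illustrative names only — a cache embeds a list_head in its objects, adds and removes them with list_lru_add()/list_lru_del(), and reclaims by walking the LRU with an isolate callback:

```c
#include <linux/list_lru.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>

/* Hypothetical cached object; d_lru, i_lru and b_lru play this role below. */
struct my_obj {
	struct list_head	lru;		/* linkage owned by the list_lru */
	int			refcount;
};

static struct list_lru my_lru;	/* per-node lists and counters live in here */

/* Isolate callback, invoked under the list_lru's internal per-node lock. */
static enum lru_status my_isolate(struct list_head *item,
				  spinlock_t *lru_lock, void *arg)
{
	struct list_head *dispose = arg;
	struct my_obj *obj = container_of(item, struct my_obj, lru);

	if (obj->refcount) {
		/* still in use: just drop it from the LRU, keep the object */
		list_del_init(item);
		return LRU_REMOVED;
	}

	/* move to a private dispose list; the caller frees it after the walk */
	list_move(item, dispose);
	return LRU_REMOVED;
}

static unsigned long my_cache_shrink(unsigned long nr_to_scan)
{
	LIST_HEAD(dispose);
	unsigned long freed;

	freed = list_lru_walk(&my_lru, my_isolate, &dispose, nr_to_scan);
	/* ... free everything left on @dispose here ... */
	return freed;
}

/*
 * Elsewhere in such a cache:
 *   list_lru_init(&my_lru);            - returns non-zero on failure
 *   list_lru_add(&my_lru, &obj->lru);  - true if the object was added
 *   list_lru_del(&my_lru, &obj->lru);  - true if the object was removed
 *   list_lru_count(&my_lru);           - feeds a shrinker's count_objects
 *   list_lru_destroy(&my_lru);         - on teardown
 */
```

The isolate callbacks in the diff (dentry_lru_isolate, inode_lru_isolate, xfs_buftarg_isolate) follow this pattern, additionally returning LRU_ROTATE to give recently referenced objects another pass and LRU_SKIP when a trylock on the object fails; the NUMA-aware shrinkers then use list_lru_count_node() and list_lru_walk_node() with sc->nid to operate per node.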
diff --git a/fs/dcache.c b/fs/dcache.c index dddc67fed732..1bd4614ce93b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -37,6 +37,7 @@ #include <linux/rculist_bl.h> #include <linux/prefetch.h> #include <linux/ratelimit.h> +#include <linux/list_lru.h> #include "internal.h" #include "mount.h" @@ -48,7 +49,7 @@ * - the dcache hash table * s_anon bl list spinlock protects: * - the s_anon list (see __d_drop) - * dcache_lru_lock protects: + * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags @@ -63,7 +64,7 @@ * Ordering: * dentry->d_inode->i_lock * dentry->d_lock - * dcache_lru_lock + * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_anon lock * @@ -81,7 +82,6 @@ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); @@ -146,23 +146,47 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static DEFINE_PER_CPU(unsigned int, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry_unused); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -static int get_nr_dentry(void) + +/* + * Here we resort to our own counters instead of using generic per-cpu counters + * for consistency with what the vfs inode code does. We are expected to harvest + * better code and performance by having our own specialized counters. + * + * Please note that the loop is done over all possible CPUs, not over all online + * CPUs. The reason for this is that we don't want to play games with CPUs going + * on and off. If one of them goes off, we will just keep their counters. + * + * glommer: See cffbc8a for details, and if you ever intend to change this, + * please update all vfs counters to match. + */ +static long get_nr_dentry(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } +static long get_nr_dentry_unused(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_unused, i); + return sum < 0 ? 0 : sum; +} + int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); - return proc_dointvec(table, write, buffer, lenp, ppos); + dentry_stat.nr_unused = get_nr_dentry_unused(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -333,52 +357,35 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. + * dentry_lru_(add|del)_list) must be called with d_lock held. */ static void dentry_lru_add(struct dentry *dentry) { if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) { - spin_lock(&dcache_lru_lock); + if (list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)) + this_cpu_inc(nr_dentry_unused); dentry->d_flags |= DCACHE_LRU_LIST; - list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; - spin_unlock(&dcache_lru_lock); } } -static void __dentry_lru_del(struct dentry *dentry) -{ - list_del_init(&dentry->d_lru); - dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); - dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; -} - /* * Remove a dentry with references from the LRU. 
+ * + * If we are on the shrink list, then we can get to try_prune_one_dentry() and + * lose our last reference through the parent walk. In this case, we need to + * remove ourselves from the shrink list, not the LRU. */ static void dentry_lru_del(struct dentry *dentry) { - if (!list_empty(&dentry->d_lru)) { - spin_lock(&dcache_lru_lock); - __dentry_lru_del(dentry); - spin_unlock(&dcache_lru_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; + return; } -} -static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) -{ - spin_lock(&dcache_lru_lock); - if (list_empty(&dentry->d_lru)) { - dentry->d_flags |= DCACHE_LRU_LIST; - list_add_tail(&dentry->d_lru, list); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; - } else { - list_move_tail(&dentry->d_lru, list); - } - spin_unlock(&dcache_lru_lock); + if (list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)) + this_cpu_dec(nr_dentry_unused); + dentry->d_flags &= ~DCACHE_LRU_LIST; } /** @@ -474,7 +481,8 @@ EXPORT_SYMBOL(d_drop); * If ref is non-zero, then decrement the refcount too. * Returns dentry requiring refcount drop, or NULL if we're done. */ -static inline struct dentry *dentry_kill(struct dentry *dentry) +static inline struct dentry * +dentry_kill(struct dentry *dentry, int unlock_on_failure) __releases(dentry->d_lock) { struct inode *inode; @@ -483,8 +491,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry) inode = dentry->d_inode; if (inode && !spin_trylock(&inode->i_lock)) { relock: - spin_unlock(&dentry->d_lock); - cpu_relax(); + if (unlock_on_failure) { + spin_unlock(&dentry->d_lock); + cpu_relax(); + } return dentry; /* try again with same dentry */ } if (IS_ROOT(dentry)) @@ -567,7 +577,7 @@ repeat: return; kill_it: - dentry = dentry_kill(dentry); + dentry = dentry_kill(dentry, 1); if (dentry) goto repeat; } @@ -787,12 +797,12 @@ EXPORT_SYMBOL(d_prune_aliases); * * This may fail if locks cannot be acquired no problem, just try again. */ -static void try_prune_one_dentry(struct dentry *dentry) +static struct dentry * try_prune_one_dentry(struct dentry *dentry) __releases(dentry->d_lock) { struct dentry *parent; - parent = dentry_kill(dentry); + parent = dentry_kill(dentry, 0); /* * If dentry_kill returns NULL, we have nothing more to do. * if it returns the same dentry, trylocks failed. In either @@ -804,17 +814,18 @@ static void try_prune_one_dentry(struct dentry *dentry) * fragmentation. */ if (!parent) - return; + return NULL; if (parent == dentry) - return; + return dentry; /* Prune ancestors. */ dentry = parent; while (dentry) { if (lockref_put_or_lock(&dentry->d_lockref)) - return; - dentry = dentry_kill(dentry); + return NULL; + dentry = dentry_kill(dentry, 1); } + return NULL; } static void shrink_dentry_list(struct list_head *list) @@ -833,76 +844,143 @@ static void shrink_dentry_list(struct list_head *list) } /* + * The dispose list is isolated and dentries are not accounted + * to the LRU here, so we can simply remove it from the list + * here regardless of whether it is referenced or not. + */ + list_del_init(&dentry->d_lru); + dentry->d_flags &= ~DCACHE_SHRINK_LIST; + + /* * We found an inuse dentry which was not removed from - * the LRU because of laziness during lookup. Do not free - * it - just keep it off the LRU list. + * the LRU because of laziness during lookup. Do not free it. 
*/ if (dentry->d_lockref.count) { - dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; } - rcu_read_unlock(); - try_prune_one_dentry(dentry); + dentry = try_prune_one_dentry(dentry); rcu_read_lock(); + if (dentry) { + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_add(&dentry->d_lru, list); + spin_unlock(&dentry->d_lock); + } } rcu_read_unlock(); } +static enum lru_status +dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + /* + * Referenced dentries are still in use. If they have active + * counts, just remove them from the LRU. Otherwise give them + * another pass through the LRU. + */ + if (dentry->d_lockref.count) { + list_del_init(&dentry->d_lru); + spin_unlock(&dentry->d_lock); + return LRU_REMOVED; + } + + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + spin_unlock(&dentry->d_lock); + + /* + * The list move itself will be made by the common LRU code. At + * this point, we've dropped the dentry->d_lock but keep the + * lru lock. This is safe to do, since every list movement is + * protected by the lru lock even if both locks are held. + * + * This is guaranteed by the fact that all LRU management + * functions are intermediated by the LRU API calls like + * list_lru_add and list_lru_del. List movement in this file + * only ever occur through this functions or through callbacks + * like this one, that are called from the LRU API. + * + * The only exceptions to this are functions like + * shrink_dentry_list, and code that first checks for the + * DCACHE_SHRINK_LIST flag. Those are guaranteed to be + * operating only with stack provided lists after they are + * properly isolated from the main list. It is thus, always a + * local access. + */ + return LRU_ROTATE; + } + + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_move_tail(&dentry->d_lru, freeable); + this_cpu_dec(nr_dentry_unused); + spin_unlock(&dentry->d_lock); + + return LRU_REMOVED; +} + /** * prune_dcache_sb - shrink the dcache * @sb: superblock - * @count: number of entries to try to free + * @nr_to_scan : number of entries to try to free + * @nid: which node to scan for freeable entities * - * Attempt to shrink the superblock dcache LRU by @count entries. This is + * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is * done when we need more memory an called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. 
*/ -void prune_dcache_sb(struct super_block *sb, int count) +long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid) { - struct dentry *dentry; - LIST_HEAD(referenced); - LIST_HEAD(tmp); + LIST_HEAD(dispose); + long freed; -relock: - spin_lock(&dcache_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - dentry = list_entry(sb->s_dentry_lru.prev, - struct dentry, d_lru); - BUG_ON(dentry->d_sb != sb); - - if (!spin_trylock(&dentry->d_lock)) { - spin_unlock(&dcache_lru_lock); - cpu_relax(); - goto relock; - } + freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, + &dispose, &nr_to_scan); + shrink_dentry_list(&dispose); + return freed; +} - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_move(&dentry->d_lru, &referenced); - spin_unlock(&dentry->d_lock); - } else { - list_move_tail(&dentry->d_lru, &tmp); - dentry->d_flags |= DCACHE_SHRINK_LIST; - spin_unlock(&dentry->d_lock); - if (!--count) - break; - } - cond_resched_lock(&dcache_lru_lock); - } - if (!list_empty(&referenced)) - list_splice(&referenced, &sb->s_dentry_lru); - spin_unlock(&dcache_lru_lock); +static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, + spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); - shrink_dentry_list(&tmp); + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_move_tail(&dentry->d_lru, freeable); + this_cpu_dec(nr_dentry_unused); + spin_unlock(&dentry->d_lock); + + return LRU_REMOVED; } + /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -912,16 +990,17 @@ relock: */ void shrink_dcache_sb(struct super_block *sb) { - LIST_HEAD(tmp); + long freed; - spin_lock(&dcache_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - list_splice_init(&sb->s_dentry_lru, &tmp); - spin_unlock(&dcache_lru_lock); - shrink_dentry_list(&tmp); - spin_lock(&dcache_lru_lock); - } - spin_unlock(&dcache_lru_lock); + do { + LIST_HEAD(dispose); + + freed = list_lru_walk(&sb->s_dentry_lru, + dentry_lru_isolate_shrink, &dispose, UINT_MAX); + + this_cpu_sub(nr_dentry_unused, freed); + shrink_dentry_list(&dispose); + } while (freed > 0); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -1283,7 +1362,8 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (dentry->d_lockref.count) { dentry_lru_del(dentry); } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - dentry_lru_move_list(dentry, &data->dispose); + dentry_lru_del(dentry); + list_add_tail(&dentry->d_lru, &data->dispose); dentry->d_flags |= DCACHE_SHRINK_LIST; data->found++; ret = D_WALK_NORETRY; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index c00e055b6282..9fd702f5bfb2 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -44,6 +44,7 @@ static void drop_slab(void) .gfp_mask = GFP_KERNEL, }; + nodes_setall(shrink.nodes_to_scan); do { nr_objects = shrink_slab(&shrink, 1000, 1000); } while (nr_objects > 10); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 2d1bdbe78c04..3981ff783950 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -931,13 +931,15 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *ei; struct list_head *cur, *tmp; LIST_HEAD(skipped); - int ret, nr_shrunk = 
0; + int nr_shrunk = 0; int retried = 0, skip_precached = 1, nr_skipped = 0; spin_lock(&sbi->s_es_lru_lock); retry: list_for_each_safe(cur, tmp, &sbi->s_es_lru) { + int shrunk; + /* * If we have already reclaimed all extents from extent * status tree, just stop the loop immediately. @@ -964,13 +966,13 @@ retry: continue; write_lock(&ei->i_es_lock); - ret = __es_try_to_reclaim_extents(ei, nr_to_scan); + shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); if (ei->i_es_lru_nr == 0) list_del_init(&ei->i_es_lru); write_unlock(&ei->i_es_lock); - nr_shrunk += ret; - nr_to_scan -= ret; + nr_shrunk += shrunk; + nr_to_scan -= shrunk; if (nr_to_scan == 0) break; } @@ -1007,7 +1009,20 @@ retry: return nr_shrunk; } -static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long ext4_es_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr; + struct ext4_sb_info *sbi; + + sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); + return nr; +} + +static unsigned long ext4_es_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); @@ -1022,9 +1037,8 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); - return ret; + return nr_shrunk; } void ext4_es_register_shrinker(struct ext4_sb_info *sbi) @@ -1032,7 +1046,8 @@ void ext4_es_register_shrinker(struct ext4_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); sbi->s_es_last_sorted = 0; - sbi->s_es_shrinker.shrink = ext4_es_shrink; + sbi->s_es_shrinker.scan_objects = ext4_es_scan; + sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&sbi->s_es_shrinker); } @@ -1076,7 +1091,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, struct ext4_es_tree *tree = &ei->i_es_tree; struct rb_node *node; struct extent_status *es; - int nr_shrunk = 0; + unsigned long nr_shrunk = 0; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 722329cac98f..c2f41b4d00b9 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1427,21 +1427,22 @@ __acquires(&lru_lock) * gfs2_dispose_glock_lru() above. 
*/ -static void gfs2_scan_glock_lru(int nr) +static long gfs2_scan_glock_lru(int nr) { struct gfs2_glock *gl; LIST_HEAD(skipped); LIST_HEAD(dispose); + long freed = 0; spin_lock(&lru_lock); - while(nr && !list_empty(&lru_list)) { + while ((nr-- >= 0) && !list_empty(&lru_list)) { gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); /* Test for being demotable */ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); - nr--; + freed++; continue; } @@ -1451,23 +1452,28 @@ static void gfs2_scan_glock_lru(int nr) if (!list_empty(&dispose)) gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); + + return freed; } -static int gfs2_shrink_glock_memory(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) { - if (sc->nr_to_scan) { - if (!(sc->gfp_mask & __GFP_FS)) - return -1; - gfs2_scan_glock_lru(sc->nr_to_scan); - } + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + return gfs2_scan_glock_lru(sc->nr_to_scan); +} - return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; +static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_read(&lru_count)); } static struct shrinker glock_shrinker = { - .shrink = gfs2_shrink_glock_memory, .seeks = DEFAULT_SEEKS, + .count_objects = gfs2_glock_shrink_count, + .scan_objects = gfs2_glock_shrink_scan, }; /** diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 7b0f5043cf24..351586e24e30 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -32,7 +32,8 @@ struct workqueue_struct *gfs2_control_wq; static struct shrinker qd_shrinker = { - .shrink = gfs2_shrink_qd_memory, + .count_objects = gfs2_qd_shrink_count, + .scan_objects = gfs2_qd_shrink_scan, .seeks = DEFAULT_SEEKS, }; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3768c2f40e43..db441359ee8c 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -75,17 +75,16 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc) +unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; int nr_to_scan = sc->nr_to_scan; - - if (nr_to_scan == 0) - goto out; + long freed = 0; if (!(sc->gfp_mask & __GFP_FS)) - return -1; + return SHRINK_STOP; spin_lock(&qd_lru_lock); while (nr_to_scan && !list_empty(&qd_lru_list)) { @@ -110,11 +109,16 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc) kmem_cache_free(gfs2_quotad_cachep, qd); spin_lock(&qd_lru_lock); nr_to_scan--; + freed++; } spin_unlock(&qd_lru_lock); + return freed; +} -out: - return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100; +unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_read(&qd_lru_count)); } static u64 qd2index(struct gfs2_quota_data *qd) diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 4f5e6e44ed83..0f64d9deb1b0 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -53,8 +53,10 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(struct shrinker *shrink, - struct shrink_control *sc); +extern unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +extern unsigned long 
gfs2_qd_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/inode.c b/fs/inode.c index 93a0625b46e4..b33ba8e021cc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -17,6 +17,7 @@ #include <linux/prefetch.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> +#include <linux/list_lru.h> #include "internal.h" /* @@ -24,7 +25,7 @@ * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget() - * inode->i_sb->s_inode_lru_lock protects: + * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list @@ -37,7 +38,7 @@ * * inode_sb_list_lock * inode->i_lock - * inode->i_sb->s_inode_lru_lock + * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock @@ -70,33 +71,33 @@ EXPORT_SYMBOL(empty_aops); */ struct inodes_stat_t inodes_stat; -static DEFINE_PER_CPU(unsigned int, nr_inodes); -static DEFINE_PER_CPU(unsigned int, nr_unused); +static DEFINE_PER_CPU(unsigned long, nr_inodes); +static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __read_mostly; -static int get_nr_inodes(void) +static long get_nr_inodes(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } -static inline int get_nr_inodes_unused(void) +static inline long get_nr_inodes_unused(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } -int get_nr_dirty_inodes(void) +long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ - int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } @@ -109,7 +110,7 @@ int proc_nr_inodes(ctl_table *table, int write, { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -401,13 +402,8 @@ EXPORT_SYMBOL(ihold); static void inode_lru_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (list_empty(&inode->i_lru)) { - list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); - inode->i_sb->s_nr_inodes_unused++; + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } /* @@ -425,13 +421,9 @@ void inode_add_lru(struct inode *inode) static void inode_lru_list_del(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (!list_empty(&inode->i_lru)) { - list_del_init(&inode->i_lru); - inode->i_sb->s_nr_inodes_unused--; + + if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } /** @@ -675,24 +667,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) return busy; } -static int can_unuse(struct inode *inode) -{ - if (inode->i_state & ~I_REFERENCED) - return 0; - if (inode_has_buffers(inode)) - return 0; - if (atomic_read(&inode->i_count)) - return 0; - if (inode->i_data.nrpages) - return 0; - return 1; -} - /* - * Walk the superblock inode LRU for freeable inodes and attempt to free them. - * This is called from the superblock shrinker function with a number of inodes - * to trim from the LRU. 
Inodes to be freed are moved to a temporary list and - * then are freed outside inode_lock by dispose_list(). + * Isolate the inode from the LRU in preparation for freeing it. * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -706,89 +682,82 @@ static int can_unuse(struct inode *inode) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -void prune_icache_sb(struct super_block *sb, int nr_to_scan) +static enum lru_status +inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) { - LIST_HEAD(freeable); - int nr_scanned; - unsigned long reap = 0; + struct list_head *freeable = arg; + struct inode *inode = container_of(item, struct inode, i_lru); - spin_lock(&sb->s_inode_lru_lock); - for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { - struct inode *inode; + /* + * we are inverting the lru lock/inode->i_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; - if (list_empty(&sb->s_inode_lru)) - break; + /* + * Referenced or dirty inodes are still in use. Give them another pass + * through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; + } - inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + spin_unlock(&inode->i_lock); + return LRU_ROTATE; + } - /* - * we are inverting the sb->s_inode_lru_lock/inode->i_lock here, - * so use a trylock. If we fail to get the lock, just move the - * inode to the back of the list so we don't spin on it. - */ - if (!spin_trylock(&inode->i_lock)) { - list_move(&inode->i_lru, &sb->s_inode_lru); - continue; + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lru_lock); + if (remove_inode_buffers(inode)) { + unsigned long reap; + reap = invalidate_mapping_pages(&inode->i_data, 0, -1); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; } + iput(inode); + spin_lock(lru_lock); + return LRU_RETRY; + } - /* - * Referenced or dirty inodes are still in use. Give them - * another pass through the LRU as we canot reclaim them now. 
- */ - if (atomic_read(&inode->i_count) || - (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_lru); - spin_unlock(&inode->i_lock); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - continue; - } + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + list_move(&inode->i_lru, freeable); + spin_unlock(&inode->i_lock); - /* recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { - inode->i_state &= ~I_REFERENCED; - list_move(&inode->i_lru, &sb->s_inode_lru); - spin_unlock(&inode->i_lock); - continue; - } - if (inode_has_buffers(inode) || inode->i_data.nrpages) { - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_lru_lock); - if (remove_inode_buffers(inode)) - reap += invalidate_mapping_pages(&inode->i_data, - 0, -1); - iput(inode); - spin_lock(&sb->s_inode_lru_lock); - - if (inode != list_entry(sb->s_inode_lru.next, - struct inode, i_lru)) - continue; /* wrong inode or list_empty */ - /* avoid lock inversions with trylock */ - if (!spin_trylock(&inode->i_lock)) - continue; - if (!can_unuse(inode)) { - spin_unlock(&inode->i_lock); - continue; - } - } - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; - spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; +} - list_move(&inode->i_lru, &freeable); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - } - if (current_is_kswapd()) - __count_vm_events(KSWAPD_INODESTEAL, reap); - else - __count_vm_events(PGINODESTEAL, reap); - spin_unlock(&sb->s_inode_lru_lock); - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += reap; +/* + * Walk the superblock inode LRU for freeable inodes and attempt to free them. + * This is called from the superblock shrinker function with a number of inodes + * to trim from the LRU. Inodes to be freed are moved to a temporary list and + * then are freed outside inode_lock by dispose_list(). 
+ */ +long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid) +{ + LIST_HEAD(freeable); + long freed; + freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, + &freeable, &nr_to_scan); dispose_list(&freeable); + return freed; } static void __wait_on_freeing_inode(struct inode *inode); diff --git a/fs/internal.h b/fs/internal.h index 2be46ea5dd0b..513e0d859a6c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -114,6 +114,8 @@ extern int open_check_o_direct(struct file *f); * inode.c */ extern spinlock_t inode_sb_list_lock; +extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid); extern void inode_add_lru(struct inode *inode); /* @@ -121,7 +123,7 @@ extern void inode_add_lru(struct inode *inode); */ extern void inode_wb_list_del(struct inode *inode); -extern int get_nr_dirty_inodes(void); +extern long get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); @@ -130,6 +132,8 @@ extern int invalidate_inodes(struct super_block *, bool); */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); +extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid); /* * read_write.c diff --git a/fs/mbcache.c b/fs/mbcache.c index 8c32ef3ba88e..e519e45bf673 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -86,18 +86,6 @@ static LIST_HEAD(mb_cache_list); static LIST_HEAD(mb_cache_lru_list); static DEFINE_SPINLOCK(mb_cache_spinlock); -/* - * What the mbcache registers as to get shrunk dynamically. - */ - -static int mb_cache_shrink_fn(struct shrinker *shrink, - struct shrink_control *sc); - -static struct shrinker mb_cache_shrinker = { - .shrink = mb_cache_shrink_fn, - .seeks = DEFAULT_SEEKS, -}; - static inline int __mb_cache_entry_is_hashed(struct mb_cache_entry *ce) { @@ -151,7 +139,7 @@ forget: /* - * mb_cache_shrink_fn() memory pressure callback + * mb_cache_shrink_scan() memory pressure callback * * This function is called by the kernel memory management when memory * gets low. @@ -159,17 +147,16 @@ forget: * @shrink: (ignored) * @sc: shrink_control passed from reclaim * - * Returns the number of objects which are present in the cache. + * Returns the number of objects freed. 
*/ -static int -mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long +mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(free_list); - struct mb_cache *cache; struct mb_cache_entry *entry, *tmp; - int count = 0; int nr_to_scan = sc->nr_to_scan; gfp_t gfp_mask = sc->gfp_mask; + unsigned long freed = 0; mb_debug("trying to free %d entries", nr_to_scan); spin_lock(&mb_cache_spinlock); @@ -179,19 +166,37 @@ mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) struct mb_cache_entry, e_lru_list); list_move_tail(&ce->e_lru_list, &free_list); __mb_cache_entry_unhash(ce); + freed++; + } + spin_unlock(&mb_cache_spinlock); + list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(entry, gfp_mask); } + return freed; +} + +static unsigned long +mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + struct mb_cache *cache; + unsigned long count = 0; + + spin_lock(&mb_cache_spinlock); list_for_each_entry(cache, &mb_cache_list, c_cache_list) { mb_debug("cache %s (%d)", cache->c_name, atomic_read(&cache->c_entry_count)); count += atomic_read(&cache->c_entry_count); } spin_unlock(&mb_cache_spinlock); - list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { - __mb_cache_entry_forget(entry, gfp_mask); - } - return (count / 100) * sysctl_vfs_cache_pressure; + + return vfs_pressure_ratio(count); } +static struct shrinker mb_cache_shrinker = { + .count_objects = mb_cache_shrink_count, + .scan_objects = mb_cache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; /* * mb_cache_create() create a new cache diff --git a/fs/namei.c b/fs/namei.c index 409a441ba2ae..0dc4cbf21f37 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -660,29 +660,6 @@ static __always_inline void set_root_rcu(struct nameidata *nd) } } -static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) -{ - int ret; - - if (IS_ERR(link)) - goto fail; - - if (*link == '/') { - set_root(nd); - path_put(&nd->path); - nd->path = nd->root; - path_get(&nd->root); - nd->flags |= LOOKUP_JUMPED; - } - nd->inode = nd->path.dentry->d_inode; - - ret = link_path_walk(link, nd); - return ret; -fail: - path_put(&nd->path); - return PTR_ERR(link); -} - static void path_put_conditional(struct path *path, struct nameidata *nd) { dput(path->dentry); @@ -874,7 +851,20 @@ follow_link(struct path *link, struct nameidata *nd, void **p) error = 0; s = nd_get_link(nd); if (s) { - error = __vfs_follow_link(nd, s); + if (unlikely(IS_ERR(s))) { + path_put(&nd->path); + put_link(nd, link, *p); + return PTR_ERR(s); + } + if (*s == '/') { + set_root(nd); + path_put(&nd->path); + nd->path = nd->root; + path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; + } + nd->inode = nd->path.dentry->d_inode; + error = link_path_walk(s, nd); if (unlikely(error)) put_link(nd, link, *p); } @@ -2271,12 +2261,15 @@ mountpoint_last(struct nameidata *nd, struct path *path) dentry = d_alloc(dir, &nd->last); if (!dentry) { error = -ENOMEM; + mutex_unlock(&dir->d_inode->i_mutex); goto out; } dentry = lookup_real(dir->d_inode, dentry, nd->flags); error = PTR_ERR(dentry); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + mutex_unlock(&dir->d_inode->i_mutex); goto out; + } } mutex_unlock(&dir->d_inode->i_mutex); @@ -4236,11 +4229,6 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) return res; } -int vfs_follow_link(struct nameidata *nd, const char *link) -{ - return __vfs_follow_link(nd, link); -} - /* get the link 
contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) { @@ -4352,7 +4340,6 @@ EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); EXPORT_SYMBOL(unlock_rename); EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_follow_link); EXPORT_SYMBOL(vfs_link); EXPORT_SYMBOL(vfs_mkdir); EXPORT_SYMBOL(vfs_mknod); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e79bc6ce828e..de434f309af0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2006,17 +2006,18 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(struct shrinker *shrink, - struct shrink_control *sc) +unsigned long +nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(head); struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; int nr_to_scan = sc->nr_to_scan; gfp_t gfp_mask = sc->gfp_mask; + long freed = 0; if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) - return (nr_to_scan == 0) ? 0 : -1; + return SHRINK_STOP; spin_lock(&nfs_access_lru_lock); list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { @@ -2032,6 +2033,7 @@ int nfs_access_cache_shrinker(struct shrinker *shrink, struct nfs_access_entry, lru); list_move(&cache->lru, &head); rb_erase(&cache->rb_node, &nfsi->access_cache); + freed++; if (!list_empty(&nfsi->access_cache_entry_lru)) list_move_tail(&nfsi->access_cache_inode_lru, &nfs_access_lru_list); @@ -2046,7 +2048,13 @@ remove_lru_entry: } spin_unlock(&nfs_access_lru_lock); nfs_access_free_list(&head); - return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; + return freed; +} + +unsigned long +nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries)); } static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d388302c005f..38da8c2b81ac 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -273,8 +273,10 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const char *ip_addr); /* dir.c */ -extern int nfs_access_cache_shrinker(struct shrinker *shrink, - struct shrink_control *sc); +extern unsigned long nfs_access_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); int nfs_create(struct inode *, struct dentry *, umode_t, bool); int nfs_mkdir(struct inode *, struct dentry *, umode_t); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5793f24613c8..a03b9c6f9489 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -360,7 +360,8 @@ static void unregister_nfs4_fs(void) #endif static struct shrinker acl_shrinker = { - .shrink = nfs_access_cache_shrinker, + .count_objects = nfs_access_cache_count, + .scan_objects = nfs_access_cache_scan, .seeks = DEFAULT_SEEKS, }; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index e76244edd748..9186c7ce0b14 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -59,11 +59,14 @@ static unsigned int longest_chain_cachesize; static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); static void cache_cleaner_func(struct work_struct *unused); -static int nfsd_reply_cache_shrink(struct shrinker *shrink, - struct shrink_control *sc); +static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, + struct shrink_control 
*sc); +static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); static struct shrinker nfsd_reply_cache_shrinker = { - .shrink = nfsd_reply_cache_shrink, + .scan_objects = nfsd_reply_cache_scan, + .count_objects = nfsd_reply_cache_count, .seeks = 1, }; @@ -232,16 +235,18 @@ nfsd_cache_entry_expired(struct svc_cacherep *rp) * Walk the LRU list and prune off entries that are older than RC_EXPIRE. * Also prune the oldest ones when the total exceeds the max number of entries. */ -static void +static long prune_cache_entries(void) { struct svc_cacherep *rp, *tmp; + long freed = 0; list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { if (!nfsd_cache_entry_expired(rp) && num_drc_entries <= max_drc_entries) break; nfsd_reply_cache_free_locked(rp); + freed++; } /* @@ -254,6 +259,7 @@ prune_cache_entries(void) cancel_delayed_work(&cache_cleaner); else mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); + return freed; } static void @@ -264,20 +270,28 @@ cache_cleaner_func(struct work_struct *unused) spin_unlock(&cache_lock); } -static int -nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long +nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - unsigned int num; + unsigned long num; spin_lock(&cache_lock); - if (sc->nr_to_scan) - prune_cache_entries(); num = num_drc_entries; spin_unlock(&cache_lock); return num; } +static unsigned long +nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long freed; + + spin_lock(&cache_lock); + freed = prune_cache_entries(); + spin_unlock(&cache_lock); + return freed; +} /* * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes */ diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9a702e193538..831d49a4111f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -687,45 +687,37 @@ int dquot_quota_sync(struct super_block *sb, int type) } EXPORT_SYMBOL(dquot_quota_sync); -/* Free unused dquots from cache */ -static void prune_dqcache(int count) +static unsigned long +dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct list_head *head; struct dquot *dquot; + unsigned long freed = 0; head = free_dquots.prev; - while (head != &free_dquots && count) { + while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); - count--; + sc->nr_to_scan--; + freed++; head = free_dquots.prev; } + return freed; } -/* - * This is called from kswapd when we think we need some - * more memory - */ -static int shrink_dqcache_memory(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long +dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - int nr = sc->nr_to_scan; - - if (nr) { - spin_lock(&dq_list_lock); - prune_dqcache(nr); - spin_unlock(&dq_list_lock); - } - return ((unsigned) - percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]) - /100) * sysctl_vfs_cache_pressure; + return vfs_pressure_ratio( + percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } static struct shrinker dqcache_shrinker = { - .shrink = shrink_dqcache_memory, + .count_objects = dqcache_shrink_count, + .scan_objects = dqcache_shrink_scan, .seeks = DEFAULT_SEEKS, }; diff --git a/fs/super.c b/fs/super.c index f6961ea84c56..3a96c9783a8b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -53,11 +53,15 @@ static char 
*sb_writers_name[SB_FREEZE_LEVELS] = { * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ -static int prune_super(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long super_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct super_block *sb; - int fs_objects = 0; - int total_objects; + long fs_objects = 0; + long total_objects; + long freed = 0; + long dentries; + long inodes; sb = container_of(shrink, struct super_block, s_shrink); @@ -65,46 +69,62 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ - if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS)) - return -1; + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; if (!grab_super_passive(sb)) - return -1; + return SHRINK_STOP; if (sb->s_op->nr_cached_objects) - fs_objects = sb->s_op->nr_cached_objects(sb); - - total_objects = sb->s_nr_dentry_unused + - sb->s_nr_inodes_unused + fs_objects + 1; - - if (sc->nr_to_scan) { - int dentries; - int inodes; - - /* proportion the scan between the caches */ - dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) / - total_objects; - inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) / - total_objects; - if (fs_objects) - fs_objects = (sc->nr_to_scan * fs_objects) / - total_objects; - /* - * prune the dcache first as the icache is pinned by it, then - * prune the icache, followed by the filesystem specific caches - */ - prune_dcache_sb(sb, dentries); - prune_icache_sb(sb, inodes); + fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); - if (fs_objects && sb->s_op->free_cached_objects) { - sb->s_op->free_cached_objects(sb, fs_objects); - fs_objects = sb->s_op->nr_cached_objects(sb); - } - total_objects = sb->s_nr_dentry_unused + - sb->s_nr_inodes_unused + fs_objects; + inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); + dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); + total_objects = dentries + inodes + fs_objects + 1; + + /* proportion the scan between the caches */ + dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); + inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); + + /* + * prune the dcache first as the icache is pinned by it, then + * prune the icache, followed by the filesystem specific caches + */ + freed = prune_dcache_sb(sb, dentries, sc->nid); + freed += prune_icache_sb(sb, inodes, sc->nid); + + if (fs_objects) { + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, + total_objects); + freed += sb->s_op->free_cached_objects(sb, fs_objects, + sc->nid); } - total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure; + drop_super(sb); + return freed; +} + +static unsigned long super_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct super_block *sb; + long total_objects = 0; + + sb = container_of(shrink, struct super_block, s_shrink); + + if (!grab_super_passive(sb)) + return 0; + + if (sb->s_op && sb->s_op->nr_cached_objects) + total_objects = sb->s_op->nr_cached_objects(sb, + sc->nid); + + total_objects += list_lru_count_node(&sb->s_dentry_lru, + sc->nid); + total_objects += list_lru_count_node(&sb->s_inode_lru, + sc->nid); + + total_objects = vfs_pressure_ratio(total_objects); drop_super(sb); return total_objects; } @@ -175,9 +195,12 @@ static struct super_block *alloc_super(struct file_system_type 
*type, int flags) INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); - INIT_LIST_HEAD(&s->s_dentry_lru); - INIT_LIST_HEAD(&s->s_inode_lru); - spin_lock_init(&s->s_inode_lru_lock); + + if (list_lru_init(&s->s_dentry_lru)) + goto err_out; + if (list_lru_init(&s->s_inode_lru)) + goto err_out_dentry_lru; + INIT_LIST_HEAD(&s->s_mounts); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -210,11 +233,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->cleancache_poolid = -1; s->s_shrink.seeks = DEFAULT_SEEKS; - s->s_shrink.shrink = prune_super; + s->s_shrink.scan_objects = super_cache_scan; + s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; + s->s_shrink.flags = SHRINKER_NUMA_AWARE; } out: return s; + +err_out_dentry_lru: + list_lru_destroy(&s->s_dentry_lru); err_out: security_sb_free(s); #ifdef CONFIG_SMP @@ -295,6 +323,9 @@ void deactivate_locked_super(struct super_block *s) /* caches are now gone, we can safely kill the shrinker now */ unregister_shrinker(&s->s_shrink); + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); + put_filesystem(fs); put_super(s); } else { diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 9e1d05666fed..f35135e28e96 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -277,18 +277,25 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc) +unsigned long ubifs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) { - int nr = sc->nr_to_scan; - int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); - if (nr == 0) - /* - * Due to the way UBIFS updates the clean znode counter it may - * temporarily be negative. - */ - return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; + /* + * Due to the way UBIFS updates the clean znode counter it may + * temporarily be negative. + */ + return clean_zn_cnt >= 0 ? 
clean_zn_cnt : 1; +} + +unsigned long ubifs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + int contention = 0; + unsigned long freed; + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); if (!clean_zn_cnt) { /* @@ -316,10 +323,10 @@ int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc) if (!freed && contention) { dbg_tnc("freed nothing, but contention"); - return -1; + return SHRINK_STOP; } out: - dbg_tnc("%d znodes were freed, requested %d", freed, nr); + dbg_tnc("%lu znodes were freed, requested %lu", freed, nr); return freed; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 879b9976c12b..3e4aa7281e04 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -49,7 +49,8 @@ struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ static struct shrinker ubifs_shrinker_info = { - .shrink = ubifs_shrinker, + .scan_objects = ubifs_shrink_scan, + .count_objects = ubifs_shrink_count, .seeks = DEFAULT_SEEKS, }; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index b2babce4d70f..e8c8cfe1435c 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1624,7 +1624,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc); +unsigned long ubifs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long ubifs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); /* commit.c */ int ubifs_bg_thread(void *info); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c06823fe10d3..263470075ea2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -81,54 +81,6 @@ xfs_buf_vmap_len( } /* - * xfs_buf_lru_add - add a buffer to the LRU. - * - * The LRU takes a new reference to the buffer so that it will only be freed - * once the shrinker takes the buffer off the LRU. - */ -STATIC void -xfs_buf_lru_add( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - spin_lock(&btp->bt_lru_lock); - if (list_empty(&bp->b_lru)) { - atomic_inc(&bp->b_hold); - list_add_tail(&bp->b_lru, &btp->bt_lru); - btp->bt_lru_nr++; - bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* - * xfs_buf_lru_del - remove a buffer from the LRU - * - * The unlocked check is safe here because it only occurs when there are not - * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there - * to optimise the shrinker removing the buffer from the LRU and calling - * xfs_buf_free(). i.e. it removes an unnecessary round trip on the - * bt_lru_lock. - */ -STATIC void -xfs_buf_lru_del( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - if (list_empty(&bp->b_lru)) - return; - - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. 
If the buffer is already on the LRU, we need @@ -151,20 +103,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; - atomic_set(&(bp)->b_lru_ref, 0); - if (!list_empty(&bp->b_lru)) { - struct xfs_buftarg *btp = bp->b_target; + spin_lock(&bp->b_lock); + atomic_set(&bp->b_lru_ref, 0); + if (!(bp->b_state & XFS_BSTATE_DISPOSE) && + (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + atomic_dec(&bp->b_hold); - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru) && - !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - atomic_dec(&bp->b_hold); - } - spin_unlock(&btp->bt_lru_lock); - } ASSERT(atomic_read(&bp->b_hold) >= 1); + spin_unlock(&bp->b_lock); } static int @@ -228,6 +174,7 @@ _xfs_buf_alloc( INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ + spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); bp->b_target = target; bp->b_flags = flags; @@ -917,12 +864,33 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (!(bp->b_flags & XBF_STALE) && - atomic_read(&bp->b_lru_ref)) { - xfs_buf_lru_add(bp); + spin_lock(&bp->b_lock); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new + * reference to the buffer for the LRU and clear the + * (now stale) dispose list state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&bp->b_lock); spin_unlock(&pag->pag_buf_lock); } else { - xfs_buf_lru_del(bp); + /* + * most of the time buffers will already be removed from + * the LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the + * buffer was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + spin_unlock(&bp->b_lock); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); @@ -1502,83 +1470,121 @@ xfs_buf_iomove( * returned. These buffers will have an elevated hold count, so wait on those * while freeing all the buffers only held by the LRU. */ +static enum lru_status +xfs_buftarg_wait_rele( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) + +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + if (atomic_read(&bp->b_hold) > 1) { + /* need to wait, so skip it this pass */ + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + return LRU_SKIP; + } + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + void xfs_wait_buftarg( struct xfs_buftarg *btp) { - struct xfs_buf *bp; + LIST_HEAD(dispose); + int loop = 0; -restart: - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - if (atomic_read(&bp->b_hold) > 1) { - trace_xfs_buf_wait_buftarg(bp, _RET_IP_); - list_move_tail(&bp->b_lru, &btp->bt_lru); - spin_unlock(&btp->bt_lru_lock); - delay(100); - goto restart; + /* loop until there is nothing left on the lru list. 
*/ + while (list_lru_count(&btp->bt_lru)) { + list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + &dispose, LONG_MAX); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); } - /* - * clear the LRU reference count so the buffer doesn't get - * ignored in xfs_buf_rele(). - */ - atomic_set(&bp->b_lru_ref, 0); - spin_unlock(&btp->bt_lru_lock); - xfs_buf_rele(bp); - spin_lock(&btp->bt_lru_lock); + if (loop++ != 0) + delay(100); } - spin_unlock(&btp->bt_lru_lock); } -int -xfs_buftarg_shrink( +static enum lru_status +xfs_buftarg_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + /* + * we are inverting the lru lock/bp->b_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + spin_unlock(&bp->b_lock); + return LRU_ROTATE; + } + + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +static unsigned long +xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); - struct xfs_buf *bp; - int nr_to_scan = sc->nr_to_scan; LIST_HEAD(dispose); + unsigned long freed; + unsigned long nr_to_scan = sc->nr_to_scan; - if (!nr_to_scan) - return btp->bt_lru_nr; - - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - if (nr_to_scan-- <= 0) - break; - - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - - /* - * Decrement the b_lru_ref count unless the value is already - * zero. If the value is already zero, we need to reclaim the - * buffer, otherwise it gets another trip through the LRU. - */ - if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { - list_move_tail(&bp->b_lru, &btp->bt_lru); - continue; - } - - /* - * remove the buffer from the LRU now to avoid needing another - * lock round trip inside xfs_buf_rele(). 
- */ - list_move(&bp->b_lru, &dispose); - btp->bt_lru_nr--; - bp->b_lru_flags |= _XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); + freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, + &dispose, &nr_to_scan); while (!list_empty(&dispose)) { + struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); xfs_buf_rele(bp); } - return btp->bt_lru_nr; + return freed; +} + +static unsigned long +xfs_buftarg_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + return list_lru_count_node(&btp->bt_lru, sc->nid); } void @@ -1587,6 +1593,7 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); @@ -1660,12 +1667,16 @@ xfs_alloc_buftarg( if (!btp->bt_bdi) goto error; - INIT_LIST_HEAD(&btp->bt_lru); - spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - btp->bt_shrinker.shrink = xfs_buftarg_shrink; + + if (list_lru_init(&btp->bt_lru)) + goto error; + + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; + btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&btp->bt_shrinker); return btp; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 433a12ed7b17..e65683361017 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/uio.h> +#include <linux/list_lru.h> /* * Base types @@ -59,7 +60,6 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ -#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */ typedef unsigned int xfs_buf_flags_t; @@ -78,8 +78,12 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" }, \ - { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } + { _XBF_COMPOUND, "COMPOUND" } + +/* + * Internal state flags. 
+ */ +#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ typedef struct xfs_buftarg { dev_t bt_dev; @@ -92,9 +96,7 @@ typedef struct xfs_buftarg { /* LRU control structures */ struct shrinker bt_shrinker; - struct list_head bt_lru; - spinlock_t bt_lru_lock; - unsigned int bt_lru_nr; + struct list_lru bt_lru; } xfs_buftarg_t; struct xfs_buf; @@ -137,7 +139,8 @@ typedef struct xfs_buf { * bt_lru_lock and not by b_sema */ struct list_head b_lru; /* lru list */ - xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ + spinlock_t b_lock; /* internal state lock */ + unsigned int b_state; /* internal state flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 251c66632e5e..71520e6e5d65 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -940,13 +940,8 @@ xfs_qm_dqput_final( trace_xfs_dqput_free(dqp); - mutex_lock(&qi->qi_lru_lock); - if (list_empty(&dqp->q_lru)) { - list_add_tail(&dqp->q_lru, &qi->qi_lru_list); - qi->qi_lru_count++; + if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(xs_qm_dquot_unused); - } - mutex_unlock(&qi->qi_lru_lock); /* * If we just added a udquot to the freelist, then we want to release diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 16219b9c6790..73b62a24ceac 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1167,7 +1167,7 @@ xfs_reclaim_inodes( * them to be cleaned, which we hope will not be very long due to the * background walker having already kicked the IO off on those dirty inodes. */ -void +long xfs_reclaim_inodes_nr( struct xfs_mount *mp, int nr_to_scan) @@ -1176,7 +1176,7 @@ xfs_reclaim_inodes_nr( xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); + return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); } /* diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 8a89f7d791bd..456f0144e1b6 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -46,7 +46,7 @@ void xfs_reclaim_worker(struct work_struct *work); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); -void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6218a0aeeeea..3e6c2e6c9cd2 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -51,8 +51,9 @@ */ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); + +STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it * currently is the only interface into the radix tree code that allows @@ -203,12 +204,9 @@ xfs_qm_dqpurge( * We move dquots to the freelist as soon as their reference count * hits zero, so it really should be on the freelist here. 
*/ - mutex_lock(&qi->qi_lru_lock); ASSERT(!list_empty(&dqp->q_lru)); - list_del_init(&dqp->q_lru); - qi->qi_lru_count--; + list_lru_del(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(xs_qm_dquot_unused); - mutex_unlock(&qi->qi_lru_lock); xfs_qm_dqdestroy(dqp); @@ -680,6 +678,143 @@ xfs_qm_calc_dquots_per_chunk( return ndquots; } +struct xfs_qm_isolate { + struct list_head buffers; + struct list_head dispose; +}; + +static enum lru_status +xfs_qm_dquot_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_dquot *dqp = container_of(item, + struct xfs_dquot, q_lru); + struct xfs_qm_isolate *isol = arg; + + if (!xfs_dqlock_nowait(dqp)) + goto out_miss_busy; + + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); + XFS_STATS_INC(xs_qm_dqwants); + + trace_xfs_dqreclaim_want(dqp); + list_del_init(&dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + return LRU_REMOVED; + } + + /* + * If the dquot is dirty, flush it. If it's already being flushed, just + * skip it so there is time for the IO to complete before we try to + * reclaim it again on the next LRU pass. + */ + if (!xfs_dqflock_nowait(dqp)) { + xfs_dqunlock(dqp); + goto out_miss_busy; + } + + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + trace_xfs_dqreclaim_dirty(dqp); + + /* we have to drop the LRU lock to flush the dquot */ + spin_unlock(lru_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: dquot %p flush failed", + __func__, dqp); + goto out_unlock_dirty; + } + + xfs_buf_delwri_queue(bp, &isol->buffers); + xfs_buf_relse(bp); + goto out_unlock_dirty; + } + xfs_dqfunlock(dqp); + + /* + * Prevent lookups now that we are past the point of no return. 
+ */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); + + ASSERT(dqp->q_nrefs == 0); + list_move_tail(&dqp->q_lru, &isol->dispose); + XFS_STATS_DEC(xs_qm_dquot_unused); + trace_xfs_dqreclaim_done(dqp); + XFS_STATS_INC(xs_qm_dqreclaims); + return LRU_REMOVED; + +out_miss_busy: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + return LRU_SKIP; + +out_unlock_dirty: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + xfs_dqunlock(dqp); + spin_lock(lru_lock); + return LRU_RETRY; +} + +static unsigned long +xfs_qm_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + struct xfs_qm_isolate isol; + unsigned long freed; + int error; + unsigned long nr_to_scan = sc->nr_to_scan; + + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + return 0; + + INIT_LIST_HEAD(&isol.buffers); + INIT_LIST_HEAD(&isol.dispose); + + freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, + &nr_to_scan); + + error = xfs_buf_delwri_submit(&isol.buffers); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + + while (!list_empty(&isol.dispose)) { + struct xfs_dquot *dqp; + + dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru); + list_del_init(&dqp->q_lru); + xfs_qm_dqfree_one(dqp); + } + + return freed; +} + +static unsigned long +xfs_qm_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + + return list_lru_count_node(&qi->qi_lru, sc->nid); +} + /* * This initializes all the quota information that's kept in the * mount structure @@ -696,11 +831,18 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + if ((error = list_lru_init(&qinf->qi_lru))) { + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; + } + /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. 
*/ if ((error = xfs_qm_init_quotainos(mp))) { + list_lru_destroy(&qinf->qi_lru); kmem_free(qinf); mp->m_quotainfo = NULL; return error; @@ -711,10 +853,6 @@ xfs_qm_init_quotainfo( INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); mutex_init(&qinf->qi_tree_lock); - INIT_LIST_HEAD(&qinf->qi_lru_list); - qinf->qi_lru_count = 0; - mutex_init(&qinf->qi_lru_lock); - /* mutex used to serialize quotaoffs */ mutex_init(&qinf->qi_quotaofflock); @@ -779,8 +917,10 @@ xfs_qm_init_quotainfo( qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; } - qinf->qi_shrinker.shrink = xfs_qm_shake; + qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; + qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; } @@ -801,6 +941,7 @@ xfs_qm_destroy_quotainfo( ASSERT(qi != NULL); unregister_shrinker(&qi->qi_shrinker); + list_lru_destroy(&qi->qi_lru); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -1599,132 +1740,6 @@ xfs_qm_dqfree_one( xfs_qm_dqdestroy(dqp); } -STATIC void -xfs_qm_dqreclaim_one( - struct xfs_dquot *dqp, - struct list_head *buffer_list, - struct list_head *dispose_list) -{ - struct xfs_mount *mp = dqp->q_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; - int error; - - if (!xfs_dqlock_nowait(dqp)) - goto out_move_tail; - - /* - * This dquot has acquired a reference in the meantime remove it from - * the freelist and try again. - */ - if (dqp->q_nrefs) { - xfs_dqunlock(dqp); - - trace_xfs_dqreclaim_want(dqp); - XFS_STATS_INC(xs_qm_dqwants); - - list_del_init(&dqp->q_lru); - qi->qi_lru_count--; - XFS_STATS_DEC(xs_qm_dquot_unused); - return; - } - - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) - goto out_unlock_move_tail; - - if (XFS_DQ_IS_DIRTY(dqp)) { - struct xfs_buf *bp = NULL; - - trace_xfs_dqreclaim_dirty(dqp); - - error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - goto out_unlock_move_tail; - } - - xfs_buf_delwri_queue(bp, buffer_list); - xfs_buf_relse(bp); - /* - * Give the dquot another try on the freelist, as the - * flushing will take some time. - */ - goto out_unlock_move_tail; - } - xfs_dqfunlock(dqp); - - /* - * Prevent lookups now that we are past the point of no return. - */ - dqp->dq_flags |= XFS_DQ_FREEING; - xfs_dqunlock(dqp); - - ASSERT(dqp->q_nrefs == 0); - list_move_tail(&dqp->q_lru, dispose_list); - qi->qi_lru_count--; - XFS_STATS_DEC(xs_qm_dquot_unused); - - trace_xfs_dqreclaim_done(dqp); - XFS_STATS_INC(xs_qm_dqreclaims); - return; - - /* - * Move the dquot to the tail of the list so that we don't spin on it. 
- */
-out_unlock_move_tail:
-	xfs_dqunlock(dqp);
-out_move_tail:
-	list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
-	trace_xfs_dqreclaim_busy(dqp);
-	XFS_STATS_INC(xs_qm_dqreclaim_misses);
-}
-
-STATIC int
-xfs_qm_shake(
-	struct shrinker *shrink,
-	struct shrink_control *sc)
-{
-	struct xfs_quotainfo *qi =
-		container_of(shrink, struct xfs_quotainfo, qi_shrinker);
-	int nr_to_scan = sc->nr_to_scan;
-	LIST_HEAD (buffer_list);
-	LIST_HEAD (dispose_list);
-	struct xfs_dquot *dqp;
-	int error;
-
-	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
-		return 0;
-	if (!nr_to_scan)
-		goto out;
-
-	mutex_lock(&qi->qi_lru_lock);
-	while (!list_empty(&qi->qi_lru_list)) {
-		if (nr_to_scan-- <= 0)
-			break;
-		dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
-				       q_lru);
-		xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
-	}
-	mutex_unlock(&qi->qi_lru_lock);
-
-	error = xfs_buf_delwri_submit(&buffer_list);
-	if (error)
-		xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
-
-	while (!list_empty(&dispose_list)) {
-		dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
-		list_del_init(&dqp->q_lru);
-		xfs_qm_dqfree_one(dqp);
-	}
-
-out:
-	return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
-}
-
 /*
  * Start a transaction and write the incore superblock changes to
  * disk. flags parameter indicates which fields have changed.
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 670cd4464070..2b602df9c242 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -49,9 +49,7 @@ typedef struct xfs_quotainfo {
 	struct xfs_inode *qi_uquotaip;	/* user quota inode */
 	struct xfs_inode *qi_gquotaip;	/* group quota inode */
 	struct xfs_inode *qi_pquotaip;	/* project quota inode */
-	struct list_head qi_lru_list;
-	struct mutex qi_lru_lock;
-	int qi_lru_count;
+	struct list_lru qi_lru;
 	int qi_dquots;
 	time_t qi_btimelimit;	/* limit for blks timer */
 	time_t qi_itimelimit;	/* limit for inodes timer */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 979a77d4b87d..15188cc99449 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1535,19 +1535,21 @@ xfs_fs_mount(
 	return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
 }
 
-static int
+static long
 xfs_fs_nr_cached_objects(
-	struct super_block *sb)
+	struct super_block *sb,
+	int nid)
 {
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
 
-static void
+static long
 xfs_fs_free_cached_objects(
 	struct super_block *sb,
-	int nr_to_scan)
+	long nr_to_scan,
+	int nid)
 {
-	xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+	return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
 }
 
 static const struct super_operations xfs_super_operations = {
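
The XFS hunks above follow the same shape as the other conversions in this pull: the subsystem's private LRU list, lock and counter are replaced by a struct list_lru, the reclaim decision moves into an isolate callback returning an enum lru_status, and the old ->shrink method is split into ->count_objects and ->scan_objects so the VM can account and shrink per node via sc->nid. The sketch below is only an illustration of that pattern; the "foo" cache and its fields are made up for the example and are not code from this series, and only the list_lru and count/scan shrinker interfaces visible in the hunks above are assumed.

/*
 * Illustrative only: "foo" is a made-up cache used to show the pattern,
 * not code from this series.
 */
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/shrinker.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo_cache {
	struct list_lru	lru;
	struct shrinker	shrinker;
};

struct foo_object {
	struct list_head lru;		/* linkage used by the list_lru */
	spinlock_t	 lock;
	int		 refcount;
};

/*
 * Isolate callback, run under the list_lru's internal node lock for each
 * object walked: move reclaimable objects to a private dispose list and
 * return LRU_REMOVED, rotate busy ones with LRU_ROTATE, or LRU_SKIP when
 * the object cannot be locked without inverting the lock order.
 */
static enum lru_status
foo_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
{
	struct foo_object *obj = container_of(item, struct foo_object, lru);
	struct list_head *dispose = arg;

	if (!spin_trylock(&obj->lock))
		return LRU_SKIP;

	if (obj->refcount) {
		/* still referenced: give it another trip around the LRU */
		spin_unlock(&obj->lock);
		return LRU_ROTATE;
	}

	list_move(item, dispose);
	spin_unlock(&obj->lock);
	return LRU_REMOVED;
}

static unsigned long
foo_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct foo_cache *cache = container_of(shrink, struct foo_cache,
					       shrinker);

	/* report only what is reclaimable on the node being shrunk */
	return list_lru_count_node(&cache->lru, sc->nid);
}

static unsigned long
foo_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct foo_cache *cache = container_of(shrink, struct foo_cache,
					       shrinker);
	unsigned long nr_to_scan = sc->nr_to_scan;
	unsigned long freed;
	LIST_HEAD(dispose);

	freed = list_lru_walk_node(&cache->lru, sc->nid, foo_isolate,
				   &dispose, &nr_to_scan);

	/* dispose of everything the walk isolated, outside the lru lock */
	while (!list_empty(&dispose)) {
		struct foo_object *obj;

		obj = list_first_entry(&dispose, struct foo_object, lru);
		list_del_init(&obj->lru);
		kfree(obj);
	}
	return freed;
}

static int foo_cache_init(struct foo_cache *cache)
{
	int error = list_lru_init(&cache->lru);

	if (error)
		return error;

	cache->shrinker.count_objects = foo_shrink_count;
	cache->shrinker.scan_objects = foo_shrink_scan;
	cache->shrinker.seeks = DEFAULT_SEEKS;
	cache->shrinker.flags = SHRINKER_NUMA_AWARE;
	register_shrinker(&cache->shrinker);
	return 0;
}

The count/scan split is what makes the per-node deferred work in this series possible: ->count_objects reports how many objects are reclaimable on the node being scanned, and ->scan_objects returns how many were actually freed, instead of the old ->shrink convention of answering both questions with the remaining cache size.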