diff options
Diffstat (limited to 'lib/rhashtable.c')
-rw-r--r-- | lib/rhashtable.c | 1034 |
1 files changed, 363 insertions, 671 deletions
diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 9cc4c4a90d00..4898442b837f 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -1,13 +1,13 @@ /* * Resizable, Scalable, Concurrent Hash Table * + * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * - * Based on the following paper: - * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf - * * Code partially derived from nft_hash + * Rewritten with rehash code from br_multicast plus single list + * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/log2.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> @@ -26,121 +27,18 @@ #include <linux/err.h> #define HASH_DEFAULT_SIZE 64UL -#define HASH_MIN_SIZE 4UL +#define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 128UL -/* Base bits plus 1 bit for nulls marker */ -#define HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) - -enum { - RHT_LOCK_NORMAL, - RHT_LOCK_NESTED, -}; - -/* The bucket lock is selected based on the hash and protects mutations - * on a group of hash buckets. - * - * A maximum of tbl->size/2 bucket locks is allocated. This ensures that - * a single lock always covers both buckets which may both contains - * entries which link to the same bucket of the old table during resizing. - * This allows to simplify the locking as locking the bucket in both - * tables during resize always guarantee protection. - * - * IMPORTANT: When holding the bucket lock of both the old and new table - * during expansions and shrinking, the old bucket lock must always be - * acquired first. - */ -static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash) -{ - return &tbl->locks[hash & tbl->locks_mask]; -} - -static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) -{ - return (void *) he - ht->p.head_offset; -} - -static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) -{ - return hash & (tbl->size - 1); -} - -static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) -{ - u32 hash; - - if (unlikely(!ht->p.key_len)) - hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd); - else - hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, - ht->p.hash_rnd); - - return hash >> HASH_RESERVED_SPACE; -} - -static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) -{ - return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE; -} - -static u32 head_hashfn(const struct rhashtable *ht, +static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { - return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he))); + return rht_head_hashfn(ht, tbl, he, ht->p); } #ifdef CONFIG_PROVE_LOCKING -static void debug_dump_buckets(const struct rhashtable *ht, - const struct bucket_table *tbl) -{ - struct rhash_head *he; - unsigned int i, hash; - - for (i = 0; i < tbl->size; i++) { - pr_warn(" [Bucket %d] ", i); - rht_for_each_rcu(he, tbl, i) { - hash = head_hashfn(ht, tbl, he); - pr_cont("[hash = %#x, lock = %p] ", - hash, bucket_lock(tbl, hash)); - } - pr_cont("\n"); - } - -} - -static void debug_dump_table(struct rhashtable *ht, - const struct bucket_table *tbl, - unsigned int hash) -{ - struct bucket_table *old_tbl, *future_tbl; - - pr_emerg("BUG: lock for hash %#x in table %p not held\n", - hash, tbl); - - rcu_read_lock(); - future_tbl = rht_dereference_rcu(ht->future_tbl, ht); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - if (future_tbl != old_tbl) { - pr_warn("Future table %p (size: %zd)\n", - future_tbl, future_tbl->size); - debug_dump_buckets(ht, future_tbl); - } - - pr_warn("Table %p (size: %zd)\n", old_tbl, old_tbl->size); - debug_dump_buckets(ht, old_tbl); - - rcu_read_unlock(); -} - #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) -#define ASSERT_BUCKET_LOCK(HT, TBL, HASH) \ - do { \ - if (unlikely(!lockdep_rht_bucket_is_held(TBL, HASH))) { \ - debug_dump_table(HT, TBL, HASH); \ - BUG(); \ - } \ - } while (0) int lockdep_rht_mutex_is_held(struct rhashtable *ht) { @@ -150,30 +48,18 @@ EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { - spinlock_t *lock = bucket_lock(tbl, hash); + spinlock_t *lock = rht_bucket_lock(tbl, hash); return (debug_locks) ? lockdep_is_held(lock) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #else #define ASSERT_RHT_MUTEX(HT) -#define ASSERT_BUCKET_LOCK(HT, TBL, HASH) #endif -static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) -{ - struct rhash_head __rcu **pprev; - - for (pprev = &tbl->buckets[n]; - !rht_is_a_nulls(rht_dereference_bucket(*pprev, tbl, n)); - pprev = &rht_dereference_bucket(*pprev, tbl, n)->next) - ; - - return pprev; -} - -static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) +static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, + gfp_t gfp) { unsigned int i, size; #if defined(CONFIG_PROVE_LOCKING) @@ -190,12 +76,13 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) if (sizeof(spinlock_t) != 0) { #ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) + if (size * sizeof(spinlock_t) > PAGE_SIZE && + gfp == GFP_KERNEL) tbl->locks = vmalloc(size * sizeof(spinlock_t)); else #endif tbl->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + gfp); if (!tbl->locks) return -ENOMEM; for (i = 0; i < size; i++) @@ -214,155 +101,181 @@ static void bucket_table_free(const struct bucket_table *tbl) kvfree(tbl); } +static void bucket_table_free_rcu(struct rcu_head *head) +{ + bucket_table_free(container_of(head, struct bucket_table, rcu)); +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - size_t nbuckets) + size_t nbuckets, + gfp_t gfp) { - struct bucket_table *tbl; + struct bucket_table *tbl = NULL; size_t size; int i; size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (tbl == NULL) + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || + gfp != GFP_KERNEL) + tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); + if (tbl == NULL && gfp == GFP_KERNEL) tbl = vzalloc(size); - if (tbl == NULL) return NULL; tbl->size = nbuckets; - if (alloc_bucket_locks(ht, tbl) < 0) { + if (alloc_bucket_locks(ht, tbl, gfp) < 0) { bucket_table_free(tbl); return NULL; } + INIT_LIST_HEAD(&tbl->walkers); + + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); return tbl; } -/** - * rht_grow_above_75 - returns true if nelems > 0.75 * table-size - * @ht: hash table - * @new_size: new table size - */ -bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) +static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, + struct bucket_table *tbl) { - /* Expand table when exceeding 75% load */ - return atomic_read(&ht->nelems) > (new_size / 4 * 3) && - (ht->p.max_shift && atomic_read(&ht->shift) < ht->p.max_shift); -} -EXPORT_SYMBOL_GPL(rht_grow_above_75); + struct bucket_table *new_tbl; -/** - * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size - * @ht: hash table - * @new_size: new table size - */ -bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) -{ - /* Shrink table beneath 30% load */ - return atomic_read(&ht->nelems) < (new_size * 3 / 10) && - (atomic_read(&ht->shift) > ht->p.min_shift); -} -EXPORT_SYMBOL_GPL(rht_shrink_below_30); + do { + new_tbl = tbl; + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + } while (tbl); -static void lock_buckets(struct bucket_table *new_tbl, - struct bucket_table *old_tbl, unsigned int hash) - __acquires(old_bucket_lock) -{ - spin_lock_bh(bucket_lock(old_tbl, hash)); - if (new_tbl != old_tbl) - spin_lock_bh_nested(bucket_lock(new_tbl, hash), - RHT_LOCK_NESTED); + return new_tbl; } -static void unlock_buckets(struct bucket_table *new_tbl, - struct bucket_table *old_tbl, unsigned int hash) - __releases(old_bucket_lock) +static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) { - if (new_tbl != old_tbl) - spin_unlock_bh(bucket_lock(new_tbl, hash)); - spin_unlock_bh(bucket_lock(old_tbl, hash)); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl = rhashtable_last_table(ht, + rht_dereference_rcu(old_tbl->future_tbl, ht)); + struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; + int err = -ENOENT; + struct rhash_head *head, *next, *entry; + spinlock_t *new_bucket_lock; + unsigned int new_hash; + + rht_for_each(entry, old_tbl, old_hash) { + err = 0; + next = rht_dereference_bucket(entry->next, old_tbl, old_hash); + + if (rht_is_a_nulls(next)) + break; + + pprev = &entry->next; + } + + if (err) + goto out; + + new_hash = head_hashfn(ht, new_tbl, entry); + + new_bucket_lock = rht_bucket_lock(new_tbl, new_hash); + + spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); + head = rht_dereference_bucket(new_tbl->buckets[new_hash], + new_tbl, new_hash); + + if (rht_is_a_nulls(head)) + INIT_RHT_NULLS_HEAD(entry->next, ht, new_hash); + else + RCU_INIT_POINTER(entry->next, head); + + rcu_assign_pointer(new_tbl->buckets[new_hash], entry); + spin_unlock(new_bucket_lock); + + rcu_assign_pointer(*pprev, next); + +out: + return err; } -/** - * Unlink entries on bucket which hash to different bucket. - * - * Returns true if no more work needs to be performed on the bucket. - */ -static bool hashtable_chain_unzip(struct rhashtable *ht, - const struct bucket_table *new_tbl, - struct bucket_table *old_tbl, - size_t old_hash) +static void rhashtable_rehash_chain(struct rhashtable *ht, + unsigned int old_hash) { - struct rhash_head *he, *p, *next; - unsigned int new_hash, new_hash2; + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + spinlock_t *old_bucket_lock; - ASSERT_BUCKET_LOCK(ht, old_tbl, old_hash); + old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); - /* Old bucket empty, no work needed. */ - p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, - old_hash); - if (rht_is_a_nulls(p)) - return false; - - new_hash = head_hashfn(ht, new_tbl, p); - ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); + spin_lock_bh(old_bucket_lock); + while (!rhashtable_rehash_one(ht, old_hash)) + ; + old_tbl->rehash++; + spin_unlock_bh(old_bucket_lock); +} - /* Advance the old bucket pointer one or more times until it - * reaches a node that doesn't hash to the same bucket as the - * previous node p. Call the previous node p; - */ - rht_for_each_continue(he, p->next, old_tbl, old_hash) { - new_hash2 = head_hashfn(ht, new_tbl, he); - ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash2); +static int rhashtable_rehash_attach(struct rhashtable *ht, + struct bucket_table *old_tbl, + struct bucket_table *new_tbl) +{ + /* Protect future_tbl using the first bucket lock. */ + spin_lock_bh(old_tbl->locks); - if (new_hash != new_hash2) - break; - p = he; + /* Did somebody beat us to it? */ + if (rcu_access_pointer(old_tbl->future_tbl)) { + spin_unlock_bh(old_tbl->locks); + return -EEXIST; } - rcu_assign_pointer(old_tbl->buckets[old_hash], p->next); - /* Find the subsequent node which does hash to the same - * bucket as node P, or NULL if no such node exists. + /* Make insertions go into the new, empty table right away. Deletions + * and lookups will be attempted in both tables until we synchronize. */ - INIT_RHT_NULLS_HEAD(next, ht, old_hash); - if (!rht_is_a_nulls(he)) { - rht_for_each_continue(he, he->next, old_tbl, old_hash) { - if (head_hashfn(ht, new_tbl, he) == new_hash) { - next = he; - break; - } - } - } + rcu_assign_pointer(old_tbl->future_tbl, new_tbl); - /* Set p's next pointer to that subsequent node pointer, - * bypassing the nodes which do not hash to p's bucket - */ - rcu_assign_pointer(p->next, next); + /* Ensure the new table is visible to readers. */ + smp_wmb(); - p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, - old_hash); + spin_unlock_bh(old_tbl->locks); - return !rht_is_a_nulls(p); + return 0; } -static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl, - unsigned int new_hash, struct rhash_head *entry) +static int rhashtable_rehash_table(struct rhashtable *ht) { - ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; + struct rhashtable_walker *walker; + unsigned int old_hash; + + new_tbl = rht_dereference(old_tbl->future_tbl, ht); + if (!new_tbl) + return 0; + + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) + rhashtable_rehash_chain(ht, old_hash); + + /* Publish the new table pointer. */ + rcu_assign_pointer(ht->tbl, new_tbl); - rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry); + spin_lock(&ht->lock); + list_for_each_entry(walker, &old_tbl->walkers, list) + walker->tbl = NULL; + spin_unlock(&ht->lock); + + /* Wait for readers. All new readers will see the new + * table, and thus no references to the old table will + * remain. + */ + call_rcu(&old_tbl->rcu, bucket_table_free_rcu); + + return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } /** * rhashtable_expand - Expand hash table while allowing concurrent lookups * @ht: the hash table to expand * - * A secondary bucket array is allocated and the hash entries are migrated - * while keeping them on both lists until the end of the RCU grace period. + * A secondary bucket array is allocated and the hash entries are migrated. * * This function may only be called in a context where it is safe to call * synchronize_rcu(), e.g. not within a rcu_read_lock() section. @@ -373,87 +286,32 @@ static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl, * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. */ -int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_expand(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); - struct rhash_head *he; - unsigned int new_hash, old_hash; - bool complete = false; + int err; ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); + old_tbl = rhashtable_last_table(ht, old_tbl); + + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; - atomic_inc(&ht->shift); + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); - /* Make insertions go into the new, empty table right away. Deletions - * and lookups will be attempted in both tables until we synchronize. - * The synchronize_rcu() guarantees for the new table to be picked up - * so no new additions go into the old table while we relink. - */ - rcu_assign_pointer(ht->future_tbl, new_tbl); - synchronize_rcu(); - - /* For each new bucket, search the corresponding old bucket for the - * first entry that hashes to the new bucket, and link the end of - * newly formed bucket chain (containing entries added to future - * table) to that entry. Since all the entries which will end up in - * the new bucket appear in the same old bucket, this constructs an - * entirely valid new hash table, but with multiple buckets - * "zipped" together into a single imprecise chain. - */ - for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { - old_hash = rht_bucket_index(old_tbl, new_hash); - lock_buckets(new_tbl, old_tbl, new_hash); - rht_for_each(he, old_tbl, old_hash) { - if (head_hashfn(ht, new_tbl, he) == new_hash) { - link_old_to_new(ht, new_tbl, new_hash, he); - break; - } - } - unlock_buckets(new_tbl, old_tbl, new_hash); - } - - /* Unzip interleaved hash chains */ - while (!complete && !ht->being_destroyed) { - /* Wait for readers. All new readers will see the new - * table, and thus no references to the old table will - * remain. - */ - synchronize_rcu(); - - /* For each bucket in the old table (each of which - * contains items from multiple buckets of the new - * table): ... - */ - complete = true; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { - lock_buckets(new_tbl, old_tbl, old_hash); - - if (hashtable_chain_unzip(ht, new_tbl, old_tbl, - old_hash)) - complete = false; - - unlock_buckets(new_tbl, old_tbl, old_hash); - } - } - - rcu_assign_pointer(ht->tbl, new_tbl); - synchronize_rcu(); - - bucket_table_free(old_tbl); - return 0; + return err; } -EXPORT_SYMBOL_GPL(rhashtable_expand); /** * rhashtable_shrink - Shrink hash table while allowing concurrent lookups * @ht: the hash table to shrink * - * This function may only be called in a context where it is safe to call - * synchronize_rcu(), e.g. not within a rcu_read_lock() section. + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. * * The caller must ensure that no concurrent resizing occurs by holding * ht->mutex. @@ -464,403 +322,146 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. */ -int rhashtable_shrink(struct rhashtable *ht) +static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); - unsigned int new_hash; + struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + unsigned int size; + int err; ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, tbl->size / 2); - if (new_tbl == NULL) - return -ENOMEM; - - rcu_assign_pointer(ht->future_tbl, new_tbl); - synchronize_rcu(); - - /* Link the first entry in the old bucket to the end of the - * bucket in the new table. As entries are concurrently being - * added to the new table, lock down the new bucket. As we - * always divide the size in half when shrinking, each bucket - * in the new table maps to exactly two buckets in the old - * table. - */ - for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { - lock_buckets(new_tbl, tbl, new_hash); - - rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), - tbl->buckets[new_hash]); - ASSERT_BUCKET_LOCK(ht, tbl, new_hash + new_tbl->size); - rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), - tbl->buckets[new_hash + new_tbl->size]); + size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); + if (size < ht->p.min_size) + size = ht->p.min_size; - unlock_buckets(new_tbl, tbl, new_hash); - } + if (old_tbl->size <= size) + return 0; - /* Publish the new, valid hash table */ - rcu_assign_pointer(ht->tbl, new_tbl); - atomic_dec(&ht->shift); + if (rht_dereference(old_tbl->future_tbl, ht)) + return -EEXIST; - /* Wait for readers. No new readers will have references to the - * old hash table. - */ - synchronize_rcu(); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (new_tbl == NULL) + return -ENOMEM; - bucket_table_free(tbl); + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); - return 0; + return err; } -EXPORT_SYMBOL_GPL(rhashtable_shrink); static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; - struct rhashtable_walker *walker; + int err = 0; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); - if (ht->being_destroyed) - goto unlock; tbl = rht_dereference(ht->tbl, ht); + tbl = rhashtable_last_table(ht, tbl); - list_for_each_entry(walker, &ht->walkers, list) - walker->resize = true; - - if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) + if (rht_grow_above_75(ht, tbl)) rhashtable_expand(ht); - else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) + else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) rhashtable_shrink(ht); -unlock: + err = rhashtable_rehash_table(ht); + mutex_unlock(&ht->mutex); -} -static void rhashtable_wakeup_worker(struct rhashtable *ht) -{ - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - size_t size = tbl->size; - - /* Only adjust the table if no resizing is currently in progress. */ - if (tbl == new_tbl && - ((ht->p.grow_decision && ht->p.grow_decision(ht, size)) || - (ht->p.shrink_decision && ht->p.shrink_decision(ht, size)))) + if (err) schedule_work(&ht->run_work); } -static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, - struct bucket_table *tbl, u32 hash) +static bool rhashtable_check_elasticity(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash) { + unsigned int elasticity = ht->elasticity; struct rhash_head *head; - hash = rht_bucket_index(tbl, hash); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); - - ASSERT_BUCKET_LOCK(ht, tbl, hash); - - if (rht_is_a_nulls(head)) - INIT_RHT_NULLS_HEAD(obj->next, ht, hash); - else - RCU_INIT_POINTER(obj->next, head); - - rcu_assign_pointer(tbl->buckets[hash], obj); + rht_for_each(head, tbl, hash) + if (!--elasticity) + return true; - atomic_inc(&ht->nelems); - - rhashtable_wakeup_worker(ht); + return false; } -/** - * rhashtable_insert - insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * - * Will take a per bucket spinlock to protect against mutual mutations - * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket lock. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) +int rhashtable_insert_rehash(struct rhashtable *ht) { - struct bucket_table *tbl, *old_tbl; - unsigned hash; - - rcu_read_lock(); - - tbl = rht_dereference_rcu(ht->future_tbl, ht); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); - - lock_buckets(tbl, old_tbl, hash); - __rhashtable_insert(ht, obj, tbl, hash); - unlock_buckets(tbl, old_tbl, hash); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(rhashtable_insert); - -/** - * rhashtable_remove - remove object from hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * - * Since the hash chain is single linked, the removal operation needs to - * walk the bucket chain upon removal. The removal operation is thus - * considerable slow if the hash table is not correctly sized. - * - * Will automatically shrink the table via rhashtable_expand() if the - * shrink_decision function specified at rhashtable_init() returns true. - * - * The caller must ensure that no concurrent table mutations occur. It is - * however valid to have concurrent lookups if they are RCU protected. - */ -bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) -{ - struct bucket_table *tbl, *new_tbl, *old_tbl; - struct rhash_head __rcu **pprev; - struct rhash_head *he, *he2; - unsigned int hash, new_hash; - bool ret = false; + struct bucket_table *old_tbl; + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int size; + int err; - rcu_read_lock(); old_tbl = rht_dereference_rcu(ht->tbl, ht); - tbl = new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); - - lock_buckets(new_tbl, old_tbl, new_hash); -restart: - hash = rht_bucket_index(tbl, new_hash); - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { - if (he != obj) { - pprev = &he->next; - continue; - } - - ASSERT_BUCKET_LOCK(ht, tbl, hash); - - if (old_tbl->size > new_tbl->size && tbl == old_tbl && - !rht_is_a_nulls(obj->next) && - head_hashfn(ht, tbl, obj->next) != hash) { - rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash)); - } else if (unlikely(old_tbl->size < new_tbl->size && tbl == new_tbl)) { - rht_for_each_continue(he2, obj->next, tbl, hash) { - if (head_hashfn(ht, tbl, he2) == hash) { - rcu_assign_pointer(*pprev, he2); - goto found; - } - } - - rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash)); - } else { - rcu_assign_pointer(*pprev, obj->next); - } - -found: - ret = true; - break; - } + tbl = rhashtable_last_table(ht, old_tbl); - /* The entry may be linked in either 'tbl', 'future_tbl', or both. - * 'future_tbl' only exists for a short period of time during - * resizing. Thus traversing both is fine and the added cost is - * very rare. - */ - if (tbl != old_tbl) { - tbl = old_tbl; - goto restart; - } + size = tbl->size; - unlock_buckets(new_tbl, old_tbl, new_hash); - - if (ret) { - atomic_dec(&ht->nelems); - rhashtable_wakeup_worker(ht); - } - - rcu_read_unlock(); + if (rht_grow_above_75(ht, tbl)) + size *= 2; + /* More than two rehashes (not resizes) detected. */ + else if (WARN_ON(old_tbl != tbl && old_tbl->size == size)) + return -EBUSY; - return ret; -} -EXPORT_SYMBOL_GPL(rhashtable_remove); - -struct rhashtable_compare_arg { - struct rhashtable *ht; - const void *key; -}; - -static bool rhashtable_compare(void *ptr, void *arg) -{ - struct rhashtable_compare_arg *x = arg; - struct rhashtable *ht = x->ht; - - return !memcmp(ptr + ht->p.key_offset, x->key, ht->p.key_len); -} - -/** - * rhashtable_lookup - lookup key in hash table - * @ht: hash table - * @key: pointer to key - * - * Computes the hash value for the key and traverses the bucket chain looking - * for a entry with an identical key. The first matching entry is returned. - * - * This lookup function may only be used for fixed key hash table (key_len - * parameter set). It will BUG() if used inappropriately. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - */ -void *rhashtable_lookup(struct rhashtable *ht, const void *key) -{ - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = key, - }; - - BUG_ON(!ht->p.key_len); - - return rhashtable_lookup_compare(ht, key, &rhashtable_compare, &arg); -} -EXPORT_SYMBOL_GPL(rhashtable_lookup); - -/** - * rhashtable_lookup_compare - search hash table with compare function - * @ht: hash table - * @key: the pointer to the key - * @compare: compare function, must return true on match - * @arg: argument passed on to compare function - * - * Traverses the bucket chain behind the provided hash value and calls the - * specified compare function for each entry. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - * - * Returns the first entry on which the compare function returned true. - */ -void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, - bool (*compare)(void *, void *), void *arg) -{ - const struct bucket_table *tbl, *old_tbl; - struct rhash_head *he; - u32 hash; - - rcu_read_lock(); - - old_tbl = rht_dereference_rcu(ht->tbl, ht); - tbl = rht_dereference_rcu(ht->future_tbl, ht); - hash = key_hashfn(ht, key, ht->p.key_len); -restart: - rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { - if (!compare(rht_obj(ht, he), arg)) - continue; - rcu_read_unlock(); - return rht_obj(ht, he); - } + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); + if (new_tbl == NULL) + return -ENOMEM; - if (unlikely(tbl != old_tbl)) { - tbl = old_tbl; - goto restart; - } - rcu_read_unlock(); + err = rhashtable_rehash_attach(ht, tbl, new_tbl); + if (err) { + bucket_table_free(new_tbl); + if (err == -EEXIST) + err = 0; + } else + schedule_work(&ht->run_work); - return NULL; + return err; } -EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); +EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -/** - * rhashtable_lookup_insert - lookup and insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * - * This lookup function may only be used for fixed key hash table (key_len - * parameter set). It will BUG() if used inappropriately. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) +int rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj, + struct bucket_table *tbl) { - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = rht_obj(ht, obj) + ht->p.key_offset, - }; + struct rhash_head *head; + unsigned int hash; + int err; - BUG_ON(!ht->p.key_len); + tbl = rhashtable_last_table(ht, tbl); + hash = head_hashfn(ht, tbl, obj); + spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); - return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare, - &arg); -} -EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); + err = -EEXIST; + if (key && rhashtable_lookup_fast(ht, key, ht->p)) + goto exit; -/** - * rhashtable_lookup_compare_insert - search and insert object to hash table - * with compare function - * @ht: hash table - * @obj: pointer to hash head inside object - * @compare: compare function, must return true on match - * @arg: argument passed on to compare function - * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -bool rhashtable_lookup_compare_insert(struct rhashtable *ht, - struct rhash_head *obj, - bool (*compare)(void *, void *), - void *arg) -{ - struct bucket_table *new_tbl, *old_tbl; - u32 new_hash; - bool success = true; + err = -EAGAIN; + if (rhashtable_check_elasticity(ht, tbl, hash) || + rht_grow_above_100(ht, tbl)) + goto exit; - BUG_ON(!ht->p.key_len); + err = 0; - rcu_read_lock(); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); - lock_buckets(new_tbl, old_tbl, new_hash); + RCU_INIT_POINTER(obj->next, head); - if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, - compare, arg)) { - success = false; - goto exit; - } + rcu_assign_pointer(tbl->buckets[hash], obj); - __rhashtable_insert(ht, obj, new_tbl, new_hash); + atomic_inc(&ht->nelems); exit: - unlock_buckets(new_tbl, old_tbl, new_hash); - rcu_read_unlock(); + spin_unlock(rht_bucket_lock(tbl, hash)); - return success; + return err; } -EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** * rhashtable_walk_init - Initialise an iterator @@ -895,7 +496,8 @@ int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) return -ENOMEM; mutex_lock(&ht->mutex); - list_add(&iter->walker->list, &ht->walkers); + iter->walker->tbl = rht_dereference(ht->tbl, ht); + list_add(&iter->walker->list, &iter->walker->tbl->walkers); mutex_unlock(&ht->mutex); return 0; @@ -911,7 +513,8 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); void rhashtable_walk_exit(struct rhashtable_iter *iter) { mutex_lock(&iter->ht->mutex); - list_del(&iter->walker->list); + if (iter->walker->tbl) + list_del(&iter->walker->list); mutex_unlock(&iter->ht->mutex); kfree(iter->walker); } @@ -932,13 +535,21 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit); * by calling rhashtable_walk_next. */ int rhashtable_walk_start(struct rhashtable_iter *iter) + __acquires(RCU) { + struct rhashtable *ht = iter->ht; + + mutex_lock(&ht->mutex); + + if (iter->walker->tbl) + list_del(&iter->walker->list); + rcu_read_lock(); - if (iter->walker->resize) { - iter->slot = 0; - iter->skip = 0; - iter->walker->resize = false; + mutex_unlock(&ht->mutex); + + if (!iter->walker->tbl) { + iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); return -EAGAIN; } @@ -960,13 +571,11 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { - const struct bucket_table *tbl; + struct bucket_table *tbl = iter->walker->tbl; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; void *obj = NULL; - tbl = rht_dereference_rcu(ht->tbl, ht); - if (p) { p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); goto next; @@ -992,17 +601,20 @@ next: iter->skip = 0; } - iter->p = NULL; + /* Ensure we see any new tables. */ + smp_rmb(); -out: - if (iter->walker->resize) { - iter->p = NULL; + iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker->tbl) { iter->slot = 0; iter->skip = 0; - iter->walker->resize = false; return ERR_PTR(-EAGAIN); } + iter->p = NULL; + +out: + return obj; } EXPORT_SYMBOL_GPL(rhashtable_walk_next); @@ -1014,16 +626,39 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_next); * Finish a hash table walk. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) { - rcu_read_unlock(); + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker->tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (tbl->rehash < tbl->size) + list_add(&iter->walker->list, &tbl->walkers); + else + iter->walker->tbl = NULL; + spin_unlock(&ht->lock); + iter->p = NULL; + +out: + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); -static size_t rounded_hashtable_size(struct rhashtable_params *params) +static size_t rounded_hashtable_size(const struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), - 1UL << params->min_shift); + (unsigned long)params->min_size); +} + +static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) +{ + return jhash2(key, length, seed); } /** @@ -1056,7 +691,7 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * struct rhash_head node; * }; * - * u32 my_hash_fn(const void *data, u32 seed) + * u32 my_hash_fn(const void *data, u32 len, u32 seed) * { * struct test_obj *obj = data; * @@ -1069,72 +704,129 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * .obj_hashfn = my_hash_fn, * }; */ -int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; size = HASH_DEFAULT_SIZE; - if ((params->key_len && !params->hashfn) || - (!params->key_len && !params->obj_hashfn)) + if ((!params->key_len && !params->obj_hashfn) || + (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) return -EINVAL; - params->min_shift = max_t(size_t, params->min_shift, - ilog2(HASH_MIN_SIZE)); - if (params->nelem_hint) size = rounded_hashtable_size(params); memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); + spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); - INIT_LIST_HEAD(&ht->walkers); + + if (params->min_size) + ht->p.min_size = roundup_pow_of_two(params->min_size); + + if (params->max_size) + ht->p.max_size = rounddown_pow_of_two(params->max_size); + + ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + + /* The maximum (not average) chain length grows with the + * size of the hash table, at a rate of (log N)/(log log N). + * The value of 16 is selected so that even if the hash + * table grew to 2^32 you would not expect the maximum + * chain length to exceed it unless we are under attack + * (or extremely unlucky). + * + * As this limit is only to detect attacks, we don't need + * to set it to a lower value as you'd need the chain + * length to vastly exceed 16 to have any real effect + * on the system. + */ + if (!params->insecure_elasticity) + ht->elasticity = 16; if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); else ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; - tbl = bucket_table_alloc(ht, size); + ht->key_len = ht->p.key_len; + if (!params->hashfn) { + ht->p.hashfn = jhash; + + if (!(ht->key_len & (sizeof(u32) - 1))) { + ht->key_len /= sizeof(u32); + ht->p.hashfn = rhashtable_jhash2; + } + } + + tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (tbl == NULL) return -ENOMEM; atomic_set(&ht->nelems, 0); - atomic_set(&ht->shift, ilog2(tbl->size)); - RCU_INIT_POINTER(ht->tbl, tbl); - RCU_INIT_POINTER(ht->future_tbl, tbl); - if (!ht->p.hash_rnd) - get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); + RCU_INIT_POINTER(ht->tbl, tbl); - if (ht->p.grow_decision || ht->p.shrink_decision) - INIT_WORK(&ht->run_work, rht_deferred_worker); + INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } EXPORT_SYMBOL_GPL(rhashtable_init); /** - * rhashtable_destroy - destroy hash table + * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to releasal resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array. * - * Frees the bucket array. This function is not rcu safe, therefore the caller - * has to make sure that no resizing may happen by unpublishing the hashtable - * and waiting for the quiescent cycle before releasing the bucket array. + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occurs in parallel. */ -void rhashtable_destroy(struct rhashtable *ht) +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg) { - ht->being_destroyed = true; + const struct bucket_table *tbl; + unsigned int i; - if (ht->p.grow_decision || ht->p.shrink_decision) - cancel_work_sync(&ht->run_work); + cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); - bucket_table_free(rht_dereference(ht->tbl, ht)); + tbl = rht_dereference(ht->tbl, ht); + if (free_fn) { + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos, *next; + + for (pos = rht_dereference(tbl->buckets[i], ht), + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL; + !rht_is_a_nulls(pos); + pos = next, + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL) + free_fn(rht_obj(ht, pos), arg); + } + } + + bucket_table_free(tbl); mutex_unlock(&ht->mutex); } +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ + return rhashtable_free_and_destroy(ht, NULL, NULL); +} EXPORT_SYMBOL_GPL(rhashtable_destroy); |