diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Kconfig.debug | 60 | ||||
-rw-r--r-- | lib/Makefile | 2 | ||||
-rw-r--r-- | lib/div64.c | 2 | ||||
-rw-r--r-- | lib/ioremap.c | 53 | ||||
-rw-r--r-- | lib/iov_iter.c | 851 | ||||
-rw-r--r-- | lib/kobject.c | 7 | ||||
-rw-r--r-- | lib/lcm.c | 11 | ||||
-rw-r--r-- | lib/lockref.c | 2 | ||||
-rw-r--r-- | lib/lz4/lz4_decompress.c | 21 | ||||
-rw-r--r-- | lib/nlattr.c | 2 | ||||
-rw-r--r-- | lib/rhashtable.c | 1034 | ||||
-rw-r--r-- | lib/seq_buf.c | 4 | ||||
-rw-r--r-- | lib/sha1.c | 1 | ||||
-rw-r--r-- | lib/test_rhashtable.c | 67 |
14 files changed, 1378 insertions, 739 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c5cefb3c009c..17670573dda8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -865,6 +865,19 @@ config SCHED_STACK_END_CHECK data corruption or a sporadic crash at a later stage once the region is examined. The runtime overhead introduced is minimal. +config DEBUG_TIMEKEEPING + bool "Enable extra timekeeping sanity checking" + help + This option will enable additional timekeeping sanity checks + which may be helpful when diagnosing issues where timekeeping + problems are suspected. + + This may include checks in the timekeeping hotpaths, so this + option may have a (very small) performance impact to some + workloads. + + If unsure, say N. + config TIMER_STATS bool "Collect kernel timers statistics" depends on DEBUG_KERNEL && PROC_FS @@ -1180,16 +1193,7 @@ config DEBUG_CREDENTIALS menu "RCU Debugging" config PROVE_RCU - bool "RCU debugging: prove RCU correctness" - depends on PROVE_LOCKING - default n - help - This feature enables lockdep extensions that check for correct - use of RCU APIs. This is currently under development. Say Y - if you want to debug RCU usage or help work on the PROVE_RCU - feature. - - Say N if you are unsure. + def_bool PROVE_LOCKING config PROVE_RCU_REPEATEDLY bool "RCU debugging: don't disable PROVE_RCU on first splat" @@ -1257,6 +1261,30 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. +config RCU_TORTURE_TEST_SLOW_INIT + bool "Slow down RCU grace-period initialization to expose races" + depends on RCU_TORTURE_TEST + help + This option makes grace-period initialization block for a + few jiffies between initializing each pair of consecutive + rcu_node structures. This helps to expose races involving + grace-period initialization, in other words, it makes your + kernel less stable. It can also greatly increase grace-period + latency, especially on systems with large numbers of CPUs. + This is useful when torture-testing RCU, but in almost no + other circumstance. + + Say Y here if you want your system to crash and hang more often. + Say N if you want a sane system. + +config RCU_TORTURE_TEST_SLOW_INIT_DELAY + int "How much to slow down RCU grace-period initialization" + range 0 5 + default 3 + help + This option specifies the number of jiffies to wait between + each rcu_node structure initialization. + config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON @@ -1732,6 +1760,18 @@ config TEST_UDELAY If unsure, say N. +config MEMTEST + bool "Memtest" + depends on HAVE_MEMBLOCK + ---help--- + This option adds a kernel parameter 'memtest', which allows memtest + to be set. + memtest=0, mean disabled; -- default + memtest=1, mean do 1 test pattern; + ... + memtest=17, mean do 17 test patterns. + If you are unsure how to answer this question, answer N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/lib/Makefile b/lib/Makefile index 87eb3bffc283..58f74d2dd396 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -24,7 +24,7 @@ obj-y += lockref.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ - gcd.o lcm.o list_sort.o uuid.o flex_array.o clz_ctz.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o obj-y += string_helpers.o diff --git a/lib/div64.c b/lib/div64.c index 4382ad77777e..19ea7ed4b948 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -127,7 +127,7 @@ EXPORT_SYMBOL(div64_u64_rem); * by the book 'Hacker's Delight'. The original source and full proof * can be found here and is available for use without restriction. * - * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' + * 'http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt' */ #ifndef div64_u64 u64 div64_u64(u64 dividend, u64 divisor) diff --git a/lib/ioremap.c b/lib/ioremap.c index 0c9216c48762..86c8911b0e3a 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -13,6 +13,43 @@ #include <asm/cacheflush.h> #include <asm/pgtable.h> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +static int __read_mostly ioremap_pud_capable; +static int __read_mostly ioremap_pmd_capable; +static int __read_mostly ioremap_huge_disabled; + +static int __init set_nohugeiomap(char *str) +{ + ioremap_huge_disabled = 1; + return 0; +} +early_param("nohugeiomap", set_nohugeiomap); + +void __init ioremap_huge_init(void) +{ + if (!ioremap_huge_disabled) { + if (arch_ioremap_pud_supported()) + ioremap_pud_capable = 1; + if (arch_ioremap_pmd_supported()) + ioremap_pmd_capable = 1; + } +} + +static inline int ioremap_pud_enabled(void) +{ + return ioremap_pud_capable; +} + +static inline int ioremap_pmd_enabled(void) +{ + return ioremap_pmd_capable; +} + +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static inline int ioremap_pud_enabled(void) { return 0; } +static inline int ioremap_pmd_enabled(void) { return 0; } +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot) { @@ -43,6 +80,14 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); + + if (ioremap_pmd_enabled() && + ((next - addr) == PMD_SIZE) && + IS_ALIGNED(phys_addr + addr, PMD_SIZE)) { + if (pmd_set_huge(pmd, phys_addr + addr, prot)) + continue; + } + if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot)) return -ENOMEM; } while (pmd++, addr = next, addr != end); @@ -61,6 +106,14 @@ static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); + + if (ioremap_pud_enabled() && + ((next - addr) == PUD_SIZE) && + IS_ALIGNED(phys_addr + addr, PUD_SIZE)) { + if (pud_set_huge(pud, phys_addr + addr, prot)) + continue; + } + if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot)) return -ENOMEM; } while (pud++, addr = next, addr != end); diff --git a/lib/iov_iter.c b/lib/iov_iter.c new file mode 100644 index 000000000000..75232ad0a5e7 --- /dev/null +++ b/lib/iov_iter.c @@ -0,0 +1,851 @@ +#include <linux/export.h> +#include <linux/uio.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <net/checksum.h> + +#define iterate_iovec(i, n, __v, __p, skip, STEP) { \ + size_t left; \ + size_t wanted = n; \ + __p = i->iov; \ + __v.iov_len = min(n, __p->iov_len - skip); \ + if (likely(__v.iov_len)) { \ + __v.iov_base = __p->iov_base + skip; \ + left = (STEP); \ + __v.iov_len -= left; \ + skip += __v.iov_len; \ + n -= __v.iov_len; \ + } else { \ + left = 0; \ + } \ + while (unlikely(!left && n)) { \ + __p++; \ + __v.iov_len = min(n, __p->iov_len); \ + if (unlikely(!__v.iov_len)) \ + continue; \ + __v.iov_base = __p->iov_base; \ + left = (STEP); \ + __v.iov_len -= left; \ + skip = __v.iov_len; \ + n -= __v.iov_len; \ + } \ + n = wanted - n; \ +} + +#define iterate_kvec(i, n, __v, __p, skip, STEP) { \ + size_t wanted = n; \ + __p = i->kvec; \ + __v.iov_len = min(n, __p->iov_len - skip); \ + if (likely(__v.iov_len)) { \ + __v.iov_base = __p->iov_base + skip; \ + (void)(STEP); \ + skip += __v.iov_len; \ + n -= __v.iov_len; \ + } \ + while (unlikely(n)) { \ + __p++; \ + __v.iov_len = min(n, __p->iov_len); \ + if (unlikely(!__v.iov_len)) \ + continue; \ + __v.iov_base = __p->iov_base; \ + (void)(STEP); \ + skip = __v.iov_len; \ + n -= __v.iov_len; \ + } \ + n = wanted; \ +} + +#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ + size_t wanted = n; \ + __p = i->bvec; \ + __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ + if (likely(__v.bv_len)) { \ + __v.bv_page = __p->bv_page; \ + __v.bv_offset = __p->bv_offset + skip; \ + (void)(STEP); \ + skip += __v.bv_len; \ + n -= __v.bv_len; \ + } \ + while (unlikely(n)) { \ + __p++; \ + __v.bv_len = min_t(size_t, n, __p->bv_len); \ + if (unlikely(!__v.bv_len)) \ + continue; \ + __v.bv_page = __p->bv_page; \ + __v.bv_offset = __p->bv_offset; \ + (void)(STEP); \ + skip = __v.bv_len; \ + n -= __v.bv_len; \ + } \ + n = wanted; \ +} + +#define iterate_all_kinds(i, n, v, I, B, K) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec; \ + struct bio_vec v; \ + iterate_bvec(i, n, v, bvec, skip, (B)) \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + } \ +} + +#define iterate_and_advance(i, n, v, I, B, K) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec; \ + struct bio_vec v; \ + iterate_bvec(i, n, v, bvec, skip, (B)) \ + if (skip == bvec->bv_len) { \ + bvec++; \ + skip = 0; \ + } \ + i->nr_segs -= bvec - i->bvec; \ + i->bvec = bvec; \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + if (skip == kvec->iov_len) { \ + kvec++; \ + skip = 0; \ + } \ + i->nr_segs -= kvec - i->kvec; \ + i->kvec = kvec; \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + if (skip == iov->iov_len) { \ + iov++; \ + skip = 0; \ + } \ + i->nr_segs -= iov - i->iov; \ + i->iov = iov; \ + } \ + i->count -= n; \ + i->iov_offset = skip; \ +} + +static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = from - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + from = kaddr + offset; + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + kunmap(page); +done: + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *to; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_readable(buf, copy)) { + kaddr = kmap_atomic(page); + to = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = to - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + to = kaddr + offset; + left = __copy_from_user(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + kunmap(page); +done: + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + if (!(i->type & (ITER_BVEC|ITER_KVEC))) { + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); + } + return 0; +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +/* + * Fault in one or more iovecs of the given iov_iter, to a maximum length of + * bytes. For each iovec, fault in each page that constitutes the iovec. + * + * Return 0 on success, or non-zero if the memory could not be accessed (i.e. + * because it is an invalid address). + */ +int iov_iter_fault_in_multipages_readable(struct iov_iter *i, size_t bytes) +{ + size_t skip = i->iov_offset; + const struct iovec *iov; + int err; + struct iovec v; + + if (!(i->type & (ITER_BVEC|ITER_KVEC))) { + iterate_iovec(i, bytes, v, iov, skip, ({ + err = fault_in_multipages_readable(v.iov_base, + v.iov_len); + if (unlikely(err)) + return err; + 0;})) + } + return 0; +} +EXPORT_SYMBOL(iov_iter_fault_in_multipages_readable); + +void iov_iter_init(struct iov_iter *i, int direction, + const struct iovec *iov, unsigned long nr_segs, + size_t count) +{ + /* It will get better. Eventually... */ + if (segment_eq(get_fs(), KERNEL_DS)) { + direction |= ITER_KVEC; + i->type = direction; + i->kvec = (struct kvec *)iov; + } else { + i->type = direction; + i->iov = iov; + } + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_init); + +static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) +{ + char *from = kmap_atomic(page); + memcpy(to, from + offset, len); + kunmap_atomic(from); +} + +static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len) +{ + char *to = kmap_atomic(page); + memcpy(to + offset, from, len); + kunmap_atomic(to); +} + +static void memzero_page(struct page *page, size_t offset, size_t len) +{ + char *addr = kmap_atomic(page); + memset(addr + offset, 0, len); + kunmap_atomic(addr); +} + +size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) +{ + char *from = addr; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + iterate_and_advance(i, bytes, v, + __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, + v.iov_len), + memcpy_to_page(v.bv_page, v.bv_offset, + (from += v.bv_len) - v.bv_len, v.bv_len), + memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) + ) + + return bytes; +} +EXPORT_SYMBOL(copy_to_iter); + +size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + iterate_and_advance(i, bytes, v, + __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, + v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + return bytes; +} +EXPORT_SYMBOL(copy_from_iter); + +size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + iterate_and_advance(i, bytes, v, + __copy_from_user_nocache((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + return bytes; +} +EXPORT_SYMBOL(copy_from_iter_nocache); + +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (i->type & (ITER_BVEC|ITER_KVEC)) { + void *kaddr = kmap_atomic(page); + size_t wanted = copy_to_iter(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; + } else + return copy_page_to_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_to_iter); + +size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (i->type & (ITER_BVEC|ITER_KVEC)) { + void *kaddr = kmap_atomic(page); + size_t wanted = copy_from_iter(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; + } else + return copy_page_from_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_from_iter); + +size_t iov_iter_zero(size_t bytes, struct iov_iter *i) +{ + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + iterate_and_advance(i, bytes, v, + __clear_user(v.iov_base, v.iov_len), + memzero_page(v.bv_page, v.bv_offset, v.bv_len), + memset(v.iov_base, 0, v.iov_len) + ) + + return bytes; +} +EXPORT_SYMBOL(iov_iter_zero); + +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr = kmap_atomic(page), *p = kaddr + offset; + iterate_all_kinds(i, bytes, v, + __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len), + memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + kunmap_atomic(kaddr); + return bytes; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +void iov_iter_advance(struct iov_iter *i, size_t size) +{ + iterate_and_advance(i, size, v, 0, 0, 0) +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(const struct iov_iter *i) +{ + if (i->nr_segs == 1) + return i->count; + else if (i->type & ITER_BVEC) + return min(i->count, i->bvec->bv_len - i->iov_offset); + else + return min(i->count, i->iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +void iov_iter_kvec(struct iov_iter *i, int direction, + const struct kvec *kvec, unsigned long nr_segs, + size_t count) +{ + BUG_ON(!(direction & ITER_KVEC)); + i->type = direction; + i->kvec = kvec; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_kvec); + +void iov_iter_bvec(struct iov_iter *i, int direction, + const struct bio_vec *bvec, unsigned long nr_segs, + size_t count) +{ + BUG_ON(!(direction & ITER_BVEC)); + i->type = direction; + i->bvec = bvec; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_bvec); + +unsigned long iov_iter_alignment(const struct iov_iter *i) +{ + unsigned long res = 0; + size_t size = i->count; + + if (!size) + return 0; + + iterate_all_kinds(i, size, v, + (res |= (unsigned long)v.iov_base | v.iov_len, 0), + res |= v.bv_offset | v.bv_len, + res |= (unsigned long)v.iov_base | v.iov_len + ) + return res; +} +EXPORT_SYMBOL(iov_iter_alignment); + +ssize_t iov_iter_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + if (maxsize > i->count) + maxsize = i->count; + + if (!maxsize) + return 0; + + iterate_all_kinds(i, maxsize, v, ({ + unsigned long addr = (unsigned long)v.iov_base; + size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); + int n; + int res; + + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; + addr &= ~(PAGE_SIZE - 1); + n = DIV_ROUND_UP(len, PAGE_SIZE); + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); + if (unlikely(res < 0)) + return res; + return (res == n ? len : res * PAGE_SIZE) - *start; + 0;}),({ + /* can't be more than PAGE_SIZE */ + *start = v.bv_offset; + get_page(*pages = v.bv_page); + return v.bv_len; + }),({ + return -EFAULT; + }) + ) + return 0; +} +EXPORT_SYMBOL(iov_iter_get_pages); + +static struct page **get_pages_array(size_t n) +{ + struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); + if (!p) + p = vmalloc(n * sizeof(struct page *)); + return p; +} + +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + struct page **p; + + if (maxsize > i->count) + maxsize = i->count; + + if (!maxsize) + return 0; + + iterate_all_kinds(i, maxsize, v, ({ + unsigned long addr = (unsigned long)v.iov_base; + size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); + int n; + int res; + + addr &= ~(PAGE_SIZE - 1); + n = DIV_ROUND_UP(len, PAGE_SIZE); + p = get_pages_array(n); + if (!p) + return -ENOMEM; + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + if (unlikely(res < 0)) { + kvfree(p); + return res; + } + *pages = p; + return (res == n ? len : res * PAGE_SIZE) - *start; + 0;}),({ + /* can't be more than PAGE_SIZE */ + *start = v.bv_offset; + *pages = p = get_pages_array(1); + if (!p) + return -ENOMEM; + get_page(*p = v.bv_page); + return v.bv_len; + }),({ + return -EFAULT; + }) + ) + return 0; +} +EXPORT_SYMBOL(iov_iter_get_pages_alloc); + +size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *to = addr; + __wsum sum, next; + size_t off = 0; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + sum = *csum; + iterate_and_advance(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_from_user(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0, &err); + if (!err) { + sum = csum_block_add(sum, next, off); + off += v.iov_len; + } + err ? v.iov_len : 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + return bytes; +} +EXPORT_SYMBOL(csum_and_copy_from_iter); + +size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *from = addr; + __wsum sum, next; + size_t off = 0; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + sum = *csum; + iterate_and_advance(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, + v.iov_base, + v.iov_len, 0, &err); + if (!err) { + sum = csum_block_add(sum, next, off); + off += v.iov_len; + } + err ? v.iov_len : 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len, + p + v.bv_offset, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len, + v.iov_base, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + return bytes; +} +EXPORT_SYMBOL(csum_and_copy_to_iter); + +int iov_iter_npages(const struct iov_iter *i, int maxpages) +{ + size_t size = i->count; + int npages = 0; + + if (!size) + return 0; + + iterate_all_kinds(i, size, v, ({ + unsigned long p = (unsigned long)v.iov_base; + npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) + - p / PAGE_SIZE; + if (npages >= maxpages) + return maxpages; + 0;}),({ + npages++; + if (npages >= maxpages) + return maxpages; + }),({ + unsigned long p = (unsigned long)v.iov_base; + npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) + - p / PAGE_SIZE; + if (npages >= maxpages) + return maxpages; + }) + ) + return npages; +} +EXPORT_SYMBOL(iov_iter_npages); + +const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) +{ + *new = *old; + if (new->type & ITER_BVEC) + return new->bvec = kmemdup(new->bvec, + new->nr_segs * sizeof(struct bio_vec), + flags); + else + /* iovec and kvec have identical layout */ + return new->iov = kmemdup(new->iov, + new->nr_segs * sizeof(struct iovec), + flags); +} +EXPORT_SYMBOL(dup_iter); + +int import_iovec(int type, const struct iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i) +{ + ssize_t n; + struct iovec *p; + n = rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, + *iov, &p); + if (n < 0) { + if (p != *iov) + kfree(p); + *iov = NULL; + return n; + } + iov_iter_init(i, type, p, nr_segs, n); + *iov = p == *iov ? NULL : p; + return 0; +} +EXPORT_SYMBOL(import_iovec); + +#ifdef CONFIG_COMPAT +#include <linux/compat.h> + +int compat_import_iovec(int type, const struct compat_iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i) +{ + ssize_t n; + struct iovec *p; + n = compat_rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, + *iov, &p); + if (n < 0) { + if (p != *iov) + kfree(p); + *iov = NULL; + return n; + } + iov_iter_init(i, type, p, nr_segs, n); + *iov = p == *iov ? NULL : p; + return 0; +} +#endif + +int import_single_range(int rw, void __user *buf, size_t len, + struct iovec *iov, struct iov_iter *i) +{ + if (len > MAX_RW_COUNT) + len = MAX_RW_COUNT; + if (unlikely(!access_ok(!rw, buf, len))) + return -EFAULT; + + iov->iov_base = buf; + iov->iov_len = len; + iov_iter_init(i, rw, iov, 1, len); + return 0; +} diff --git a/lib/kobject.c b/lib/kobject.c index 03d4ab349fa7..3b841b97fccd 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -576,8 +576,13 @@ void kobject_del(struct kobject *kobj) */ struct kobject *kobject_get(struct kobject *kobj) { - if (kobj) + if (kobj) { + if (!kobj->state_initialized) + WARN(1, KERN_WARNING "kobject: '%s' (%p): is not " + "initialized, yet kobject_get() is being " + "called.\n", kobject_name(kobj), kobj); kref_get(&kobj->kref); + } return kobj; } diff --git a/lib/lcm.c b/lib/lcm.c index e97dbd51e756..03d7fcb420b5 100644 --- a/lib/lcm.c +++ b/lib/lcm.c @@ -12,3 +12,14 @@ unsigned long lcm(unsigned long a, unsigned long b) return 0; } EXPORT_SYMBOL_GPL(lcm); + +unsigned long lcm_not_zero(unsigned long a, unsigned long b) +{ + unsigned long l = lcm(a, b); + + if (l) + return l; + + return (b ? : a); +} +EXPORT_SYMBOL_GPL(lcm_not_zero); diff --git a/lib/lockref.c b/lib/lockref.c index ecb9a665ec19..494994bf17c8 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -18,7 +18,7 @@ #define CMPXCHG_LOOP(CODE, SUCCESS) do { \ struct lockref old; \ BUILD_BUG_ON(sizeof(old) != 8); \ - old.lock_count = ACCESS_ONCE(lockref->lock_count); \ + old.lock_count = READ_ONCE(lockref->lock_count); \ while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \ struct lockref new = old, prev = old; \ CODE \ diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 7a85967060a5..26cc6029b280 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -47,6 +47,11 @@ #include "lz4defs.h" +static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 +static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + static int lz4_uncompress(const char *source, char *dest, int osize) { const BYTE *ip = (const BYTE *) source; @@ -56,10 +61,6 @@ static int lz4_uncompress(const char *source, char *dest, int osize) BYTE *cpy; unsigned token; size_t length; - size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; -#if LZ4_ARCH64 - size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; -#endif while (1) { @@ -116,7 +117,7 @@ static int lz4_uncompress(const char *source, char *dest, int osize) /* copy repeated sequence */ if (unlikely((op - ref) < STEPSIZE)) { #if LZ4_ARCH64 - size_t dec64 = dec64table[op - ref]; + int dec64 = dec64table[op - ref]; #else const int dec64 = 0; #endif @@ -139,6 +140,9 @@ static int lz4_uncompress(const char *source, char *dest, int osize) /* Error: request to write beyond destination buffer */ if (cpy > oend) goto _output_error; + if ((ref + COPYLENGTH) > oend || + (op + COPYLENGTH) > oend) + goto _output_error; LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); while (op < cpy) *op++ = *ref++; @@ -174,11 +178,6 @@ static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, BYTE * const oend = op + maxoutputsize; BYTE *cpy; - size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; -#if LZ4_ARCH64 - size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; -#endif - /* Main Loop */ while (ip < iend) { @@ -246,7 +245,7 @@ static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, /* copy repeated sequence */ if (unlikely((op - ref) < STEPSIZE)) { #if LZ4_ARCH64 - size_t dec64 = dec64table[op - ref]; + int dec64 = dec64table[op - ref]; #else const int dec64 = 0; #endif diff --git a/lib/nlattr.c b/lib/nlattr.c index 76a1b59523ab..f5907d23272d 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -279,6 +279,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); + if (count > minlen) + memset(dest + minlen, 0, count - minlen); return minlen; } diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 9cc4c4a90d00..4898442b837f 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -1,13 +1,13 @@ /* * Resizable, Scalable, Concurrent Hash Table * + * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * - * Based on the following paper: - * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf - * * Code partially derived from nft_hash + * Rewritten with rehash code from br_multicast plus single list + * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/log2.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> @@ -26,121 +27,18 @@ #include <linux/err.h> #define HASH_DEFAULT_SIZE 64UL -#define HASH_MIN_SIZE 4UL +#define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 128UL -/* Base bits plus 1 bit for nulls marker */ -#define HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) - -enum { - RHT_LOCK_NORMAL, - RHT_LOCK_NESTED, -}; - -/* The bucket lock is selected based on the hash and protects mutations - * on a group of hash buckets. - * - * A maximum of tbl->size/2 bucket locks is allocated. This ensures that - * a single lock always covers both buckets which may both contains - * entries which link to the same bucket of the old table during resizing. - * This allows to simplify the locking as locking the bucket in both - * tables during resize always guarantee protection. - * - * IMPORTANT: When holding the bucket lock of both the old and new table - * during expansions and shrinking, the old bucket lock must always be - * acquired first. - */ -static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash) -{ - return &tbl->locks[hash & tbl->locks_mask]; -} - -static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) -{ - return (void *) he - ht->p.head_offset; -} - -static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) -{ - return hash & (tbl->size - 1); -} - -static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) -{ - u32 hash; - - if (unlikely(!ht->p.key_len)) - hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd); - else - hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, - ht->p.hash_rnd); - - return hash >> HASH_RESERVED_SPACE; -} - -static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) -{ - return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE; -} - -static u32 head_hashfn(const struct rhashtable *ht, +static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { - return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he))); + return rht_head_hashfn(ht, tbl, he, ht->p); } #ifdef CONFIG_PROVE_LOCKING -static void debug_dump_buckets(const struct rhashtable *ht, - const struct bucket_table *tbl) -{ - struct rhash_head *he; - unsigned int i, hash; - - for (i = 0; i < tbl->size; i++) { - pr_warn(" [Bucket %d] ", i); - rht_for_each_rcu(he, tbl, i) { - hash = head_hashfn(ht, tbl, he); - pr_cont("[hash = %#x, lock = %p] ", - hash, bucket_lock(tbl, hash)); - } - pr_cont("\n"); - } - -} - -static void debug_dump_table(struct rhashtable *ht, - const struct bucket_table *tbl, - unsigned int hash) -{ - struct bucket_table *old_tbl, *future_tbl; - - pr_emerg("BUG: lock for hash %#x in table %p not held\n", - hash, tbl); - - rcu_read_lock(); - future_tbl = rht_dereference_rcu(ht->future_tbl, ht); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - if (future_tbl != old_tbl) { - pr_warn("Future table %p (size: %zd)\n", - future_tbl, future_tbl->size); - debug_dump_buckets(ht, future_tbl); - } - - pr_warn("Table %p (size: %zd)\n", old_tbl, old_tbl->size); - debug_dump_buckets(ht, old_tbl); - - rcu_read_unlock(); -} - #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) -#define ASSERT_BUCKET_LOCK(HT, TBL, HASH) \ - do { \ - if (unlikely(!lockdep_rht_bucket_is_held(TBL, HASH))) { \ - debug_dump_table(HT, TBL, HASH); \ - BUG(); \ - } \ - } while (0) int lockdep_rht_mutex_is_held(struct rhashtable *ht) { @@ -150,30 +48,18 @@ EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { - spinlock_t *lock = bucket_lock(tbl, hash); + spinlock_t *lock = rht_bucket_lock(tbl, hash); return (debug_locks) ? lockdep_is_held(lock) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #else #define ASSERT_RHT_MUTEX(HT) -#define ASSERT_BUCKET_LOCK(HT, TBL, HASH) #endif -static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) -{ - struct rhash_head __rcu **pprev; - - for (pprev = &tbl->buckets[n]; - !rht_is_a_nulls(rht_dereference_bucket(*pprev, tbl, n)); - pprev = &rht_dereference_bucket(*pprev, tbl, n)->next) - ; - - return pprev; -} - -static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) +static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, + gfp_t gfp) { unsigned int i, size; #if defined(CONFIG_PROVE_LOCKING) @@ -190,12 +76,13 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) if (sizeof(spinlock_t) != 0) { #ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) + if (size * sizeof(spinlock_t) > PAGE_SIZE && + gfp == GFP_KERNEL) tbl->locks = vmalloc(size * sizeof(spinlock_t)); else #endif tbl->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + gfp); if (!tbl->locks) return -ENOMEM; for (i = 0; i < size; i++) @@ -214,155 +101,181 @@ static void bucket_table_free(const struct bucket_table *tbl) kvfree(tbl); } +static void bucket_table_free_rcu(struct rcu_head *head) +{ + bucket_table_free(container_of(head, struct bucket_table, rcu)); +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - size_t nbuckets) + size_t nbuckets, + gfp_t gfp) { - struct bucket_table *tbl; + struct bucket_table *tbl = NULL; size_t size; int i; size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (tbl == NULL) + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || + gfp != GFP_KERNEL) + tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); + if (tbl == NULL && gfp == GFP_KERNEL) tbl = vzalloc(size); - if (tbl == NULL) return NULL; tbl->size = nbuckets; - if (alloc_bucket_locks(ht, tbl) < 0) { + if (alloc_bucket_locks(ht, tbl, gfp) < 0) { bucket_table_free(tbl); return NULL; } + INIT_LIST_HEAD(&tbl->walkers); + + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); return tbl; } -/** - * rht_grow_above_75 - returns true if nelems > 0.75 * table-size - * @ht: hash table - * @new_size: new table size - */ -bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) +static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, + struct bucket_table *tbl) { - /* Expand table when exceeding 75% load */ - return atomic_read(&ht->nelems) > (new_size / 4 * 3) && - (ht->p.max_shift && atomic_read(&ht->shift) < ht->p.max_shift); -} -EXPORT_SYMBOL_GPL(rht_grow_above_75); + struct bucket_table *new_tbl; -/** - * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size - * @ht: hash table - * @new_size: new table size - */ -bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) -{ - /* Shrink table beneath 30% load */ - return atomic_read(&ht->nelems) < (new_size * 3 / 10) && - (atomic_read(&ht->shift) > ht->p.min_shift); -} -EXPORT_SYMBOL_GPL(rht_shrink_below_30); + do { + new_tbl = tbl; + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + } while (tbl); -static void lock_buckets(struct bucket_table *new_tbl, - struct bucket_table *old_tbl, unsigned int hash) - __acquires(old_bucket_lock) -{ - spin_lock_bh(bucket_lock(old_tbl, hash)); - if (new_tbl != old_tbl) - spin_lock_bh_nested(bucket_lock(new_tbl, hash), - RHT_LOCK_NESTED); + return new_tbl; } -static void unlock_buckets(struct bucket_table *new_tbl, - struct bucket_table *old_tbl, unsigned int hash) - __releases(old_bucket_lock) +static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) { - if (new_tbl != old_tbl) - spin_unlock_bh(bucket_lock(new_tbl, hash)); - spin_unlock_bh(bucket_lock(old_tbl, hash)); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl = rhashtable_last_table(ht, + rht_dereference_rcu(old_tbl->future_tbl, ht)); + struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; + int err = -ENOENT; + struct rhash_head *head, *next, *entry; + spinlock_t *new_bucket_lock; + unsigned int new_hash; + + rht_for_each(entry, old_tbl, old_hash) { + err = 0; + next = rht_dereference_bucket(entry->next, old_tbl, old_hash); + + if (rht_is_a_nulls(next)) + break; + + pprev = &entry->next; + } + + if (err) + goto out; + + new_hash = head_hashfn(ht, new_tbl, entry); + + new_bucket_lock = rht_bucket_lock(new_tbl, new_hash); + + spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); + head = rht_dereference_bucket(new_tbl->buckets[new_hash], + new_tbl, new_hash); + + if (rht_is_a_nulls(head)) + INIT_RHT_NULLS_HEAD(entry->next, ht, new_hash); + else + RCU_INIT_POINTER(entry->next, head); + + rcu_assign_pointer(new_tbl->buckets[new_hash], entry); + spin_unlock(new_bucket_lock); + + rcu_assign_pointer(*pprev, next); + +out: + return err; } -/** - * Unlink entries on bucket which hash to different bucket. - * - * Returns true if no more work needs to be performed on the bucket. - */ -static bool hashtable_chain_unzip(struct rhashtable *ht, - const struct bucket_table *new_tbl, - struct bucket_table *old_tbl, - size_t old_hash) +static void rhashtable_rehash_chain(struct rhashtable *ht, + unsigned int old_hash) { - struct rhash_head *he, *p, *next; - unsigned int new_hash, new_hash2; + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + spinlock_t *old_bucket_lock; - ASSERT_BUCKET_LOCK(ht, old_tbl, old_hash); + old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); - /* Old bucket empty, no work needed. */ - p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, - old_hash); - if (rht_is_a_nulls(p)) - return false; - - new_hash = head_hashfn(ht, new_tbl, p); - ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); + spin_lock_bh(old_bucket_lock); + while (!rhashtable_rehash_one(ht, old_hash)) + ; + old_tbl->rehash++; + spin_unlock_bh(old_bucket_lock); +} - /* Advance the old bucket pointer one or more times until it - * reaches a node that doesn't hash to the same bucket as the - * previous node p. Call the previous node p; - */ - rht_for_each_continue(he, p->next, old_tbl, old_hash) { - new_hash2 = head_hashfn(ht, new_tbl, he); - ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash2); +static int rhashtable_rehash_attach(struct rhashtable *ht, + struct bucket_table *old_tbl, + struct bucket_table *new_tbl) +{ + /* Protect future_tbl using the first bucket lock. */ + spin_lock_bh(old_tbl->locks); - if (new_hash != new_hash2) - break; - p = he; + /* Did somebody beat us to it? */ + if (rcu_access_pointer(old_tbl->future_tbl)) { + spin_unlock_bh(old_tbl->locks); + return -EEXIST; } - rcu_assign_pointer(old_tbl->buckets[old_hash], p->next); - /* Find the subsequent node which does hash to the same - * bucket as node P, or NULL if no such node exists. + /* Make insertions go into the new, empty table right away. Deletions + * and lookups will be attempted in both tables until we synchronize. */ - INIT_RHT_NULLS_HEAD(next, ht, old_hash); - if (!rht_is_a_nulls(he)) { - rht_for_each_continue(he, he->next, old_tbl, old_hash) { - if (head_hashfn(ht, new_tbl, he) == new_hash) { - next = he; - break; - } - } - } + rcu_assign_pointer(old_tbl->future_tbl, new_tbl); - /* Set p's next pointer to that subsequent node pointer, - * bypassing the nodes which do not hash to p's bucket - */ - rcu_assign_pointer(p->next, next); + /* Ensure the new table is visible to readers. */ + smp_wmb(); - p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, - old_hash); + spin_unlock_bh(old_tbl->locks); - return !rht_is_a_nulls(p); + return 0; } -static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl, - unsigned int new_hash, struct rhash_head *entry) +static int rhashtable_rehash_table(struct rhashtable *ht) { - ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; + struct rhashtable_walker *walker; + unsigned int old_hash; + + new_tbl = rht_dereference(old_tbl->future_tbl, ht); + if (!new_tbl) + return 0; + + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) + rhashtable_rehash_chain(ht, old_hash); + + /* Publish the new table pointer. */ + rcu_assign_pointer(ht->tbl, new_tbl); - rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry); + spin_lock(&ht->lock); + list_for_each_entry(walker, &old_tbl->walkers, list) + walker->tbl = NULL; + spin_unlock(&ht->lock); + + /* Wait for readers. All new readers will see the new + * table, and thus no references to the old table will + * remain. + */ + call_rcu(&old_tbl->rcu, bucket_table_free_rcu); + + return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } /** * rhashtable_expand - Expand hash table while allowing concurrent lookups * @ht: the hash table to expand * - * A secondary bucket array is allocated and the hash entries are migrated - * while keeping them on both lists until the end of the RCU grace period. + * A secondary bucket array is allocated and the hash entries are migrated. * * This function may only be called in a context where it is safe to call * synchronize_rcu(), e.g. not within a rcu_read_lock() section. @@ -373,87 +286,32 @@ static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl, * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. */ -int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_expand(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); - struct rhash_head *he; - unsigned int new_hash, old_hash; - bool complete = false; + int err; ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); + old_tbl = rhashtable_last_table(ht, old_tbl); + + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; - atomic_inc(&ht->shift); + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); - /* Make insertions go into the new, empty table right away. Deletions - * and lookups will be attempted in both tables until we synchronize. - * The synchronize_rcu() guarantees for the new table to be picked up - * so no new additions go into the old table while we relink. - */ - rcu_assign_pointer(ht->future_tbl, new_tbl); - synchronize_rcu(); - - /* For each new bucket, search the corresponding old bucket for the - * first entry that hashes to the new bucket, and link the end of - * newly formed bucket chain (containing entries added to future - * table) to that entry. Since all the entries which will end up in - * the new bucket appear in the same old bucket, this constructs an - * entirely valid new hash table, but with multiple buckets - * "zipped" together into a single imprecise chain. - */ - for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { - old_hash = rht_bucket_index(old_tbl, new_hash); - lock_buckets(new_tbl, old_tbl, new_hash); - rht_for_each(he, old_tbl, old_hash) { - if (head_hashfn(ht, new_tbl, he) == new_hash) { - link_old_to_new(ht, new_tbl, new_hash, he); - break; - } - } - unlock_buckets(new_tbl, old_tbl, new_hash); - } - - /* Unzip interleaved hash chains */ - while (!complete && !ht->being_destroyed) { - /* Wait for readers. All new readers will see the new - * table, and thus no references to the old table will - * remain. - */ - synchronize_rcu(); - - /* For each bucket in the old table (each of which - * contains items from multiple buckets of the new - * table): ... - */ - complete = true; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { - lock_buckets(new_tbl, old_tbl, old_hash); - - if (hashtable_chain_unzip(ht, new_tbl, old_tbl, - old_hash)) - complete = false; - - unlock_buckets(new_tbl, old_tbl, old_hash); - } - } - - rcu_assign_pointer(ht->tbl, new_tbl); - synchronize_rcu(); - - bucket_table_free(old_tbl); - return 0; + return err; } -EXPORT_SYMBOL_GPL(rhashtable_expand); /** * rhashtable_shrink - Shrink hash table while allowing concurrent lookups * @ht: the hash table to shrink * - * This function may only be called in a context where it is safe to call - * synchronize_rcu(), e.g. not within a rcu_read_lock() section. + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. * * The caller must ensure that no concurrent resizing occurs by holding * ht->mutex. @@ -464,403 +322,146 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. */ -int rhashtable_shrink(struct rhashtable *ht) +static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); - unsigned int new_hash; + struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + unsigned int size; + int err; ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, tbl->size / 2); - if (new_tbl == NULL) - return -ENOMEM; - - rcu_assign_pointer(ht->future_tbl, new_tbl); - synchronize_rcu(); - - /* Link the first entry in the old bucket to the end of the - * bucket in the new table. As entries are concurrently being - * added to the new table, lock down the new bucket. As we - * always divide the size in half when shrinking, each bucket - * in the new table maps to exactly two buckets in the old - * table. - */ - for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { - lock_buckets(new_tbl, tbl, new_hash); - - rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), - tbl->buckets[new_hash]); - ASSERT_BUCKET_LOCK(ht, tbl, new_hash + new_tbl->size); - rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), - tbl->buckets[new_hash + new_tbl->size]); + size = roundup_pow_of_two(atomic_read(&ht->nelems) * 3 / 2); + if (size < ht->p.min_size) + size = ht->p.min_size; - unlock_buckets(new_tbl, tbl, new_hash); - } + if (old_tbl->size <= size) + return 0; - /* Publish the new, valid hash table */ - rcu_assign_pointer(ht->tbl, new_tbl); - atomic_dec(&ht->shift); + if (rht_dereference(old_tbl->future_tbl, ht)) + return -EEXIST; - /* Wait for readers. No new readers will have references to the - * old hash table. - */ - synchronize_rcu(); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (new_tbl == NULL) + return -ENOMEM; - bucket_table_free(tbl); + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); - return 0; + return err; } -EXPORT_SYMBOL_GPL(rhashtable_shrink); static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; - struct rhashtable_walker *walker; + int err = 0; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); - if (ht->being_destroyed) - goto unlock; tbl = rht_dereference(ht->tbl, ht); + tbl = rhashtable_last_table(ht, tbl); - list_for_each_entry(walker, &ht->walkers, list) - walker->resize = true; - - if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) + if (rht_grow_above_75(ht, tbl)) rhashtable_expand(ht); - else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) + else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) rhashtable_shrink(ht); -unlock: + err = rhashtable_rehash_table(ht); + mutex_unlock(&ht->mutex); -} -static void rhashtable_wakeup_worker(struct rhashtable *ht) -{ - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - size_t size = tbl->size; - - /* Only adjust the table if no resizing is currently in progress. */ - if (tbl == new_tbl && - ((ht->p.grow_decision && ht->p.grow_decision(ht, size)) || - (ht->p.shrink_decision && ht->p.shrink_decision(ht, size)))) + if (err) schedule_work(&ht->run_work); } -static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, - struct bucket_table *tbl, u32 hash) +static bool rhashtable_check_elasticity(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash) { + unsigned int elasticity = ht->elasticity; struct rhash_head *head; - hash = rht_bucket_index(tbl, hash); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); - - ASSERT_BUCKET_LOCK(ht, tbl, hash); - - if (rht_is_a_nulls(head)) - INIT_RHT_NULLS_HEAD(obj->next, ht, hash); - else - RCU_INIT_POINTER(obj->next, head); - - rcu_assign_pointer(tbl->buckets[hash], obj); + rht_for_each(head, tbl, hash) + if (!--elasticity) + return true; - atomic_inc(&ht->nelems); - - rhashtable_wakeup_worker(ht); + return false; } -/** - * rhashtable_insert - insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * - * Will take a per bucket spinlock to protect against mutual mutations - * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket lock. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) +int rhashtable_insert_rehash(struct rhashtable *ht) { - struct bucket_table *tbl, *old_tbl; - unsigned hash; - - rcu_read_lock(); - - tbl = rht_dereference_rcu(ht->future_tbl, ht); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); - - lock_buckets(tbl, old_tbl, hash); - __rhashtable_insert(ht, obj, tbl, hash); - unlock_buckets(tbl, old_tbl, hash); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(rhashtable_insert); - -/** - * rhashtable_remove - remove object from hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * - * Since the hash chain is single linked, the removal operation needs to - * walk the bucket chain upon removal. The removal operation is thus - * considerable slow if the hash table is not correctly sized. - * - * Will automatically shrink the table via rhashtable_expand() if the - * shrink_decision function specified at rhashtable_init() returns true. - * - * The caller must ensure that no concurrent table mutations occur. It is - * however valid to have concurrent lookups if they are RCU protected. - */ -bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) -{ - struct bucket_table *tbl, *new_tbl, *old_tbl; - struct rhash_head __rcu **pprev; - struct rhash_head *he, *he2; - unsigned int hash, new_hash; - bool ret = false; + struct bucket_table *old_tbl; + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int size; + int err; - rcu_read_lock(); old_tbl = rht_dereference_rcu(ht->tbl, ht); - tbl = new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); - - lock_buckets(new_tbl, old_tbl, new_hash); -restart: - hash = rht_bucket_index(tbl, new_hash); - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { - if (he != obj) { - pprev = &he->next; - continue; - } - - ASSERT_BUCKET_LOCK(ht, tbl, hash); - - if (old_tbl->size > new_tbl->size && tbl == old_tbl && - !rht_is_a_nulls(obj->next) && - head_hashfn(ht, tbl, obj->next) != hash) { - rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash)); - } else if (unlikely(old_tbl->size < new_tbl->size && tbl == new_tbl)) { - rht_for_each_continue(he2, obj->next, tbl, hash) { - if (head_hashfn(ht, tbl, he2) == hash) { - rcu_assign_pointer(*pprev, he2); - goto found; - } - } - - rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash)); - } else { - rcu_assign_pointer(*pprev, obj->next); - } - -found: - ret = true; - break; - } + tbl = rhashtable_last_table(ht, old_tbl); - /* The entry may be linked in either 'tbl', 'future_tbl', or both. - * 'future_tbl' only exists for a short period of time during - * resizing. Thus traversing both is fine and the added cost is - * very rare. - */ - if (tbl != old_tbl) { - tbl = old_tbl; - goto restart; - } + size = tbl->size; - unlock_buckets(new_tbl, old_tbl, new_hash); - - if (ret) { - atomic_dec(&ht->nelems); - rhashtable_wakeup_worker(ht); - } - - rcu_read_unlock(); + if (rht_grow_above_75(ht, tbl)) + size *= 2; + /* More than two rehashes (not resizes) detected. */ + else if (WARN_ON(old_tbl != tbl && old_tbl->size == size)) + return -EBUSY; - return ret; -} -EXPORT_SYMBOL_GPL(rhashtable_remove); - -struct rhashtable_compare_arg { - struct rhashtable *ht; - const void *key; -}; - -static bool rhashtable_compare(void *ptr, void *arg) -{ - struct rhashtable_compare_arg *x = arg; - struct rhashtable *ht = x->ht; - - return !memcmp(ptr + ht->p.key_offset, x->key, ht->p.key_len); -} - -/** - * rhashtable_lookup - lookup key in hash table - * @ht: hash table - * @key: pointer to key - * - * Computes the hash value for the key and traverses the bucket chain looking - * for a entry with an identical key. The first matching entry is returned. - * - * This lookup function may only be used for fixed key hash table (key_len - * parameter set). It will BUG() if used inappropriately. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - */ -void *rhashtable_lookup(struct rhashtable *ht, const void *key) -{ - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = key, - }; - - BUG_ON(!ht->p.key_len); - - return rhashtable_lookup_compare(ht, key, &rhashtable_compare, &arg); -} -EXPORT_SYMBOL_GPL(rhashtable_lookup); - -/** - * rhashtable_lookup_compare - search hash table with compare function - * @ht: hash table - * @key: the pointer to the key - * @compare: compare function, must return true on match - * @arg: argument passed on to compare function - * - * Traverses the bucket chain behind the provided hash value and calls the - * specified compare function for each entry. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - * - * Returns the first entry on which the compare function returned true. - */ -void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, - bool (*compare)(void *, void *), void *arg) -{ - const struct bucket_table *tbl, *old_tbl; - struct rhash_head *he; - u32 hash; - - rcu_read_lock(); - - old_tbl = rht_dereference_rcu(ht->tbl, ht); - tbl = rht_dereference_rcu(ht->future_tbl, ht); - hash = key_hashfn(ht, key, ht->p.key_len); -restart: - rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { - if (!compare(rht_obj(ht, he), arg)) - continue; - rcu_read_unlock(); - return rht_obj(ht, he); - } + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); + if (new_tbl == NULL) + return -ENOMEM; - if (unlikely(tbl != old_tbl)) { - tbl = old_tbl; - goto restart; - } - rcu_read_unlock(); + err = rhashtable_rehash_attach(ht, tbl, new_tbl); + if (err) { + bucket_table_free(new_tbl); + if (err == -EEXIST) + err = 0; + } else + schedule_work(&ht->run_work); - return NULL; + return err; } -EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); +EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -/** - * rhashtable_lookup_insert - lookup and insert object into hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * - * This lookup function may only be used for fixed key hash table (key_len - * parameter set). It will BUG() if used inappropriately. - * - * It is safe to call this function from atomic context. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) +int rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj, + struct bucket_table *tbl) { - struct rhashtable_compare_arg arg = { - .ht = ht, - .key = rht_obj(ht, obj) + ht->p.key_offset, - }; + struct rhash_head *head; + unsigned int hash; + int err; - BUG_ON(!ht->p.key_len); + tbl = rhashtable_last_table(ht, tbl); + hash = head_hashfn(ht, tbl, obj); + spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); - return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare, - &arg); -} -EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); + err = -EEXIST; + if (key && rhashtable_lookup_fast(ht, key, ht->p)) + goto exit; -/** - * rhashtable_lookup_compare_insert - search and insert object to hash table - * with compare function - * @ht: hash table - * @obj: pointer to hash head inside object - * @compare: compare function, must return true on match - * @arg: argument passed on to compare function - * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * - * Lookups may occur in parallel with hashtable mutations and resizing. - * - * Will trigger an automatic deferred table resizing if the size grows - * beyond the watermark indicated by grow_decision() which can be passed - * to rhashtable_init(). - */ -bool rhashtable_lookup_compare_insert(struct rhashtable *ht, - struct rhash_head *obj, - bool (*compare)(void *, void *), - void *arg) -{ - struct bucket_table *new_tbl, *old_tbl; - u32 new_hash; - bool success = true; + err = -EAGAIN; + if (rhashtable_check_elasticity(ht, tbl, hash) || + rht_grow_above_100(ht, tbl)) + goto exit; - BUG_ON(!ht->p.key_len); + err = 0; - rcu_read_lock(); - old_tbl = rht_dereference_rcu(ht->tbl, ht); - new_tbl = rht_dereference_rcu(ht->future_tbl, ht); - new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj)); + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); - lock_buckets(new_tbl, old_tbl, new_hash); + RCU_INIT_POINTER(obj->next, head); - if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, - compare, arg)) { - success = false; - goto exit; - } + rcu_assign_pointer(tbl->buckets[hash], obj); - __rhashtable_insert(ht, obj, new_tbl, new_hash); + atomic_inc(&ht->nelems); exit: - unlock_buckets(new_tbl, old_tbl, new_hash); - rcu_read_unlock(); + spin_unlock(rht_bucket_lock(tbl, hash)); - return success; + return err; } -EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** * rhashtable_walk_init - Initialise an iterator @@ -895,7 +496,8 @@ int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) return -ENOMEM; mutex_lock(&ht->mutex); - list_add(&iter->walker->list, &ht->walkers); + iter->walker->tbl = rht_dereference(ht->tbl, ht); + list_add(&iter->walker->list, &iter->walker->tbl->walkers); mutex_unlock(&ht->mutex); return 0; @@ -911,7 +513,8 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); void rhashtable_walk_exit(struct rhashtable_iter *iter) { mutex_lock(&iter->ht->mutex); - list_del(&iter->walker->list); + if (iter->walker->tbl) + list_del(&iter->walker->list); mutex_unlock(&iter->ht->mutex); kfree(iter->walker); } @@ -932,13 +535,21 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_exit); * by calling rhashtable_walk_next. */ int rhashtable_walk_start(struct rhashtable_iter *iter) + __acquires(RCU) { + struct rhashtable *ht = iter->ht; + + mutex_lock(&ht->mutex); + + if (iter->walker->tbl) + list_del(&iter->walker->list); + rcu_read_lock(); - if (iter->walker->resize) { - iter->slot = 0; - iter->skip = 0; - iter->walker->resize = false; + mutex_unlock(&ht->mutex); + + if (!iter->walker->tbl) { + iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); return -EAGAIN; } @@ -960,13 +571,11 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { - const struct bucket_table *tbl; + struct bucket_table *tbl = iter->walker->tbl; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; void *obj = NULL; - tbl = rht_dereference_rcu(ht->tbl, ht); - if (p) { p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); goto next; @@ -992,17 +601,20 @@ next: iter->skip = 0; } - iter->p = NULL; + /* Ensure we see any new tables. */ + smp_rmb(); -out: - if (iter->walker->resize) { - iter->p = NULL; + iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker->tbl) { iter->slot = 0; iter->skip = 0; - iter->walker->resize = false; return ERR_PTR(-EAGAIN); } + iter->p = NULL; + +out: + return obj; } EXPORT_SYMBOL_GPL(rhashtable_walk_next); @@ -1014,16 +626,39 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_next); * Finish a hash table walk. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) { - rcu_read_unlock(); + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker->tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (tbl->rehash < tbl->size) + list_add(&iter->walker->list, &tbl->walkers); + else + iter->walker->tbl = NULL; + spin_unlock(&ht->lock); + iter->p = NULL; + +out: + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); -static size_t rounded_hashtable_size(struct rhashtable_params *params) +static size_t rounded_hashtable_size(const struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), - 1UL << params->min_shift); + (unsigned long)params->min_size); +} + +static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) +{ + return jhash2(key, length, seed); } /** @@ -1056,7 +691,7 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * struct rhash_head node; * }; * - * u32 my_hash_fn(const void *data, u32 seed) + * u32 my_hash_fn(const void *data, u32 len, u32 seed) * { * struct test_obj *obj = data; * @@ -1069,72 +704,129 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * .obj_hashfn = my_hash_fn, * }; */ -int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; size = HASH_DEFAULT_SIZE; - if ((params->key_len && !params->hashfn) || - (!params->key_len && !params->obj_hashfn)) + if ((!params->key_len && !params->obj_hashfn) || + (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) return -EINVAL; - params->min_shift = max_t(size_t, params->min_shift, - ilog2(HASH_MIN_SIZE)); - if (params->nelem_hint) size = rounded_hashtable_size(params); memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); + spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); - INIT_LIST_HEAD(&ht->walkers); + + if (params->min_size) + ht->p.min_size = roundup_pow_of_two(params->min_size); + + if (params->max_size) + ht->p.max_size = rounddown_pow_of_two(params->max_size); + + ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + + /* The maximum (not average) chain length grows with the + * size of the hash table, at a rate of (log N)/(log log N). + * The value of 16 is selected so that even if the hash + * table grew to 2^32 you would not expect the maximum + * chain length to exceed it unless we are under attack + * (or extremely unlucky). + * + * As this limit is only to detect attacks, we don't need + * to set it to a lower value as you'd need the chain + * length to vastly exceed 16 to have any real effect + * on the system. + */ + if (!params->insecure_elasticity) + ht->elasticity = 16; if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); else ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; - tbl = bucket_table_alloc(ht, size); + ht->key_len = ht->p.key_len; + if (!params->hashfn) { + ht->p.hashfn = jhash; + + if (!(ht->key_len & (sizeof(u32) - 1))) { + ht->key_len /= sizeof(u32); + ht->p.hashfn = rhashtable_jhash2; + } + } + + tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (tbl == NULL) return -ENOMEM; atomic_set(&ht->nelems, 0); - atomic_set(&ht->shift, ilog2(tbl->size)); - RCU_INIT_POINTER(ht->tbl, tbl); - RCU_INIT_POINTER(ht->future_tbl, tbl); - if (!ht->p.hash_rnd) - get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); + RCU_INIT_POINTER(ht->tbl, tbl); - if (ht->p.grow_decision || ht->p.shrink_decision) - INIT_WORK(&ht->run_work, rht_deferred_worker); + INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } EXPORT_SYMBOL_GPL(rhashtable_init); /** - * rhashtable_destroy - destroy hash table + * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to releasal resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array. * - * Frees the bucket array. This function is not rcu safe, therefore the caller - * has to make sure that no resizing may happen by unpublishing the hashtable - * and waiting for the quiescent cycle before releasing the bucket array. + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occurs in parallel. */ -void rhashtable_destroy(struct rhashtable *ht) +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg) { - ht->being_destroyed = true; + const struct bucket_table *tbl; + unsigned int i; - if (ht->p.grow_decision || ht->p.shrink_decision) - cancel_work_sync(&ht->run_work); + cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); - bucket_table_free(rht_dereference(ht->tbl, ht)); + tbl = rht_dereference(ht->tbl, ht); + if (free_fn) { + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos, *next; + + for (pos = rht_dereference(tbl->buckets[i], ht), + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL; + !rht_is_a_nulls(pos); + pos = next, + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL) + free_fn(rht_obj(ht, pos), arg); + } + } + + bucket_table_free(tbl); mutex_unlock(&ht->mutex); } +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ + return rhashtable_free_and_destroy(ht, NULL, NULL); +} EXPORT_SYMBOL_GPL(rhashtable_destroy); diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 88c0854bd752..5c94e1012a91 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -61,7 +61,7 @@ int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args) if (s->len < s->size) { len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args); - if (seq_buf_can_fit(s, len)) { + if (s->len + len < s->size) { s->len += len; return 0; } @@ -118,7 +118,7 @@ int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary) if (s->len < s->size) { ret = bstr_printf(s->buffer + s->len, len, fmt, binary); - if (seq_buf_can_fit(s, ret)) { + if (s->len + ret < s->size) { s->len += ret; return 0; } diff --git a/lib/sha1.c b/lib/sha1.c index 1df191e04a24..5a56dfd7b99d 100644 --- a/lib/sha1.c +++ b/lib/sha1.c @@ -198,3 +198,4 @@ void sha_init(__u32 *buf) buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } +EXPORT_SYMBOL(sha_init); diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c index 1dfeba73fc74..b2957540d3c7 100644 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@ -38,6 +38,15 @@ struct test_obj { struct rhash_head node; }; +static const struct rhashtable_params test_rht_params = { + .nelem_hint = TEST_HT_SIZE, + .head_offset = offsetof(struct test_obj, node), + .key_offset = offsetof(struct test_obj, value), + .key_len = sizeof(int), + .hashfn = jhash, + .nulls_base = (3U << RHT_BASE_SHIFT), +}; + static int __init test_rht_lookup(struct rhashtable *ht) { unsigned int i; @@ -47,7 +56,7 @@ static int __init test_rht_lookup(struct rhashtable *ht) bool expected = !(i % 2); u32 key = i; - obj = rhashtable_lookup(ht, &key); + obj = rhashtable_lookup_fast(ht, &key, test_rht_params); if (expected && !obj) { pr_warn("Test failed: Could not find key %u\n", key); @@ -80,7 +89,7 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet) rcu_cnt = cnt = 0; if (!quiet) - pr_info(" [%#4x/%zu]", i, tbl->size); + pr_info(" [%#4x/%u]", i, tbl->size); rht_for_each_entry_rcu(obj, pos, tbl, i, node) { cnt++; @@ -133,7 +142,11 @@ static int __init test_rhashtable(struct rhashtable *ht) obj->ptr = TEST_PTR; obj->value = i * 2; - rhashtable_insert(ht, &obj->node); + err = rhashtable_insert_fast(ht, &obj->node, test_rht_params); + if (err) { + kfree(obj); + goto error; + } } rcu_read_lock(); @@ -141,30 +154,6 @@ static int __init test_rhashtable(struct rhashtable *ht) test_rht_lookup(ht); rcu_read_unlock(); - for (i = 0; i < TEST_NEXPANDS; i++) { - pr_info(" Table expansion iteration %u...\n", i); - mutex_lock(&ht->mutex); - rhashtable_expand(ht); - mutex_unlock(&ht->mutex); - - rcu_read_lock(); - pr_info(" Verifying lookups...\n"); - test_rht_lookup(ht); - rcu_read_unlock(); - } - - for (i = 0; i < TEST_NEXPANDS; i++) { - pr_info(" Table shrinkage iteration %u...\n", i); - mutex_lock(&ht->mutex); - rhashtable_shrink(ht); - mutex_unlock(&ht->mutex); - - rcu_read_lock(); - pr_info(" Verifying lookups...\n"); - test_rht_lookup(ht); - rcu_read_unlock(); - } - rcu_read_lock(); test_bucket_stats(ht, true); rcu_read_unlock(); @@ -173,10 +162,10 @@ static int __init test_rhashtable(struct rhashtable *ht) for (i = 0; i < TEST_ENTRIES; i++) { u32 key = i * 2; - obj = rhashtable_lookup(ht, &key); + obj = rhashtable_lookup_fast(ht, &key, test_rht_params); BUG_ON(!obj); - rhashtable_remove(ht, &obj->node); + rhashtable_remove_fast(ht, &obj->node, test_rht_params); kfree(obj); } @@ -191,24 +180,15 @@ error: return err; } +static struct rhashtable ht; + static int __init test_rht_init(void) { - struct rhashtable ht; - struct rhashtable_params params = { - .nelem_hint = TEST_HT_SIZE, - .head_offset = offsetof(struct test_obj, node), - .key_offset = offsetof(struct test_obj, value), - .key_len = sizeof(int), - .hashfn = jhash, - .nulls_base = (3U << RHT_BASE_SHIFT), - .grow_decision = rht_grow_above_75, - .shrink_decision = rht_shrink_below_30, - }; int err; pr_info("Running resizable hashtable tests...\n"); - err = rhashtable_init(&ht, ¶ms); + err = rhashtable_init(&ht, &test_rht_params); if (err < 0) { pr_warn("Test failed: Unable to initialize hashtable: %d\n", err); @@ -222,6 +202,11 @@ static int __init test_rht_init(void) return err; } +static void __exit test_rht_exit(void) +{ +} + module_init(test_rht_init); +module_exit(test_rht_exit); MODULE_LICENSE("GPL v2"); |