diff options
Diffstat (limited to 'lib')
46 files changed, 3164 insertions, 500 deletions
diff --git a/lib/Kconfig b/lib/Kconfig index d79909dc01ec..260a80e313b9 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -457,9 +457,6 @@ config NLATTR config GENERIC_ATOMIC64 bool -config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - def_bool y if GENERIC_ATOMIC64 - config LRU_CACHE tristate @@ -550,4 +547,7 @@ config STACKDEPOT bool select STACKTRACE +config SBITMAP + bool + endmenu diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cab7405f48d2..e6327d102184 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -13,7 +13,22 @@ config PRINTK_TIME be included, not that the timestamp is recorded. The behavior is also controlled by the kernel command line - parameter printk.time=1. See Documentation/kernel-parameters.txt + parameter printk.time=1. See Documentation/admin-guide/kernel-parameters.rst + +config CONSOLE_LOGLEVEL_DEFAULT + int "Default console loglevel (1-15)" + range 1 15 + default "7" + help + Default loglevel to determine what will be printed on the console. + + Setting a default here is equivalent to passing in loglevel=<x> in + the kernel bootargs. loglevel=<x> continues to override whatever + value is specified here as well. + + Note: This does not affect the log level of un-prefixed prink() + usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT + option. config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" @@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT that are auditing their logs closely may want to set it to a lower priority. + Note: This does not affect what message level gets printed on the console + by default. To change that, use loglevel=<x> in the kernel bootargs, + or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value. + config BOOT_PRINTK_DELAY bool "Delay each boot printk message by N milliseconds" depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY @@ -198,6 +217,7 @@ config FRAME_WARN int "Warn for stack frames larger than (needs gcc 4.4)" range 0 8192 default 0 if KASAN + default 2048 if GCC_PLUGIN_LATENT_ENTROPY default 1024 if !64BIT default 2048 if 64BIT help @@ -305,7 +325,7 @@ config DEBUG_SECTION_MISMATCH a larger kernel). - Run the section mismatch analysis for each module/built-in.o file. When we run the section mismatch analysis on vmlinux.o, we - lose valueble information about where the mismatch was + lose valuable information about where the mismatch was introduced. Running the analysis for each module/built-in.o file tells where the mismatch happens much closer to the @@ -1084,6 +1104,9 @@ config PROVE_LOCKING For more details, see Documentation/locking/lockdep-design.txt. +config PROVE_LOCKING_SMALL + bool + config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT @@ -1214,7 +1237,7 @@ config DEBUG_BUGVERBOSE config DEBUG_LIST bool "Debug linked list manipulation" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION help Enable this to turn on extended checks in the linked-list walking routines. @@ -1430,7 +1453,8 @@ config RCU_TRACE select TRACE_CLOCK help This option provides tracing in RCU which presents stats - in debugfs for debugging RCU implementation. + in debugfs for debugging RCU implementation. It also enables + additional tracepoints for ftrace-style event tracing. Say Y here if you want to enable RCU tracing Say N if you are unsure. @@ -1857,15 +1881,6 @@ config PROVIDE_OHCI1394_DMA_INIT See Documentation/debugging-via-ohci1394.txt for more information. -config BUILD_DOCSRC - bool "Build targets in Documentation/ tree" - depends on HEADERS_CHECK - help - This option attempts to build objects from the source files in the - kernel Documentation/ tree. - - Say N if you are unsure. - config DMA_API_DEBUG bool "Enable debugging of DMA-API usage" depends on HAVE_DMA_API_DEBUG @@ -1969,6 +1984,16 @@ config TEST_STATIC_KEYS If unsure, say N. +config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked + for validity. + + If unsure, say N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" @@ -1980,7 +2005,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED config STRICT_DEVMEM bool "Filter access to /dev/mem" - depends on MMU + depends on MMU && DEVMEM depends on ARCH_HAS_DEVMEM_IS_ALLOWED default y if TILE || PPC ---help--- diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 39494af9a84a..bc6e651df68c 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -1,6 +1,9 @@ config ARCH_HAS_UBSAN_SANITIZE_ALL bool +config ARCH_WANTS_UBSAN_NO_NULL + def_bool n + config UBSAN bool "Undefined behaviour sanity checker" help @@ -34,3 +37,11 @@ config UBSAN_ALIGNMENT This option enables detection of unaligned memory accesses. Enabling this option on architectures that support unaligned accesses may produce a lot of false positives. + +config UBSAN_NULL + bool "Enable checking of null pointers" + depends on UBSAN + default y if !ARCH_WANTS_UBSAN_NO_NULL + help + This option enables detection of memory accesses via a + null pointer. diff --git a/lib/Makefile b/lib/Makefile index 5dc77a8ec297..50144a3aeebd 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,7 +22,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o @@ -180,6 +180,7 @@ obj-$(CONFIG_IRQ_POLL) += irq_poll.o obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n +KCOV_INSTRUMENT_stackdepot.o := n libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ fdt_empty_tree.o @@ -227,3 +228,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o obj-$(CONFIG_UBSAN) += ubsan.o UBSAN_SANITIZE_ubsan.o := n + +obj-$(CONFIG_SBITMAP) += sbitmap.o diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index dbb369145dda..46042901130f 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -213,7 +213,6 @@ static __init void test_atomic64(void) r += one; BUG_ON(v.counter != r); -#ifdef CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE INIT(onestwos); BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1)); r -= one; @@ -226,9 +225,6 @@ static __init void test_atomic64(void) INIT(-one); BUG_ON(atomic64_dec_if_positive(&v) != (-one - one)); BUG_ON(v.counter != r); -#else -#warning Please implement atomic64_dec_if_positive for your architecture and select the above Kconfig symbol -#endif INIT(onestwos); BUG_ON(!atomic64_inc_not_zero(&v)); diff --git a/lib/bitmap.c b/lib/bitmap.c index eca88087fa8a..0b66f0e5eb6b 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -496,6 +496,11 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf); * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. + * Optionally each range can be postfixed to denote that only parts of it + * should be set. The range will divided to groups of specific size. + * From each group will be used only defined amount of bits. + * Syntax: range:used_size/group_size + * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * * Returns 0 on success, -errno on invalid input strings. * Error values: @@ -507,16 +512,20 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, int is_user, unsigned long *maskp, int nmaskbits) { - unsigned a, b; + unsigned int a, b, old_a, old_b; + unsigned int group_size, used_size; int c, old_c, totaldigits, ndigits; const char __user __force *ubuf = (const char __user __force *)buf; - int at_start, in_range; + int at_start, in_range, in_partial_range; totaldigits = c = 0; + old_a = old_b = 0; + group_size = used_size = 0; bitmap_zero(maskp, nmaskbits); do { at_start = 1; in_range = 0; + in_partial_range = 0; a = b = 0; ndigits = totaldigits; @@ -547,6 +556,24 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, if ((totaldigits != ndigits) && isspace(old_c)) return -EINVAL; + if (c == '/') { + used_size = a; + at_start = 1; + in_range = 0; + a = b = 0; + continue; + } + + if (c == ':') { + old_a = a; + old_b = b; + at_start = 1; + in_range = 0; + in_partial_range = 1; + a = b = 0; + continue; + } + if (c == '-') { if (at_start || in_range) return -EINVAL; @@ -567,15 +594,30 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, } if (ndigits == totaldigits) continue; + if (in_partial_range) { + group_size = a; + a = old_a; + b = old_b; + old_a = old_b = 0; + } /* if no digit is after '-', it's wrong*/ if (at_start && in_range) return -EINVAL; - if (!(a <= b)) + if (!(a <= b) || !(used_size <= group_size)) return -EINVAL; if (b >= nmaskbits) return -ERANGE; while (a <= b) { - set_bit(a, maskp); + if (in_partial_range) { + static int pos_in_group = 1; + + if (pos_in_group <= used_size) + set_bit(a, maskp); + + if (a == b || ++pos_in_group > group_size) + pos_in_group = 1; + } else + set_bit(a, maskp); a++; } } while (buflen && c == ','); diff --git a/lib/cpu-notifier-error-inject.c b/lib/cpu-notifier-error-inject.c index 707ca24f7b18..0e2c9a1e958a 100644 --- a/lib/cpu-notifier-error-inject.c +++ b/lib/cpu-notifier-error-inject.c @@ -8,16 +8,47 @@ static int priority; module_param(priority, int, 0); MODULE_PARM_DESC(priority, "specify cpu notifier priority"); +#define UP_PREPARE 0 +#define UP_PREPARE_FROZEN 0 +#define DOWN_PREPARE 0 +#define DOWN_PREPARE_FROZEN 0 + static struct notifier_err_inject cpu_notifier_err_inject = { .actions = { - { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_UP_PREPARE_FROZEN) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(CPU_DOWN_PREPARE_FROZEN) }, + { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE) }, + { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE_FROZEN) }, + { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE) }, + { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE_FROZEN) }, {} } }; +static int notf_err_handle(struct notifier_err_inject_action *action) +{ + int ret; + + ret = action->error; + if (ret) + pr_info("Injecting error (%d) to %s\n", ret, action->name); + return ret; +} + +static int notf_err_inj_up_prepare(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) + return notf_err_handle(&cpu_notifier_err_inject.actions[0]); + else + return notf_err_handle(&cpu_notifier_err_inject.actions[1]); +} + +static int notf_err_inj_dead(unsigned int cpu) +{ + if (!cpuhp_tasks_frozen) + return notf_err_handle(&cpu_notifier_err_inject.actions[2]); + else + return notf_err_handle(&cpu_notifier_err_inject.actions[3]); +} + static struct dentry *dir; static int err_inject_init(void) @@ -29,7 +60,10 @@ static int err_inject_init(void) if (IS_ERR(dir)) return PTR_ERR(dir); - err = register_hotcpu_notifier(&cpu_notifier_err_inject.nb); + err = cpuhp_setup_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE, + "cpu-err-notif:prepare", + notf_err_inj_up_prepare, + notf_err_inj_dead); if (err) debugfs_remove_recursive(dir); @@ -38,7 +72,7 @@ static int err_inject_init(void) static void err_inject_exit(void) { - unregister_hotcpu_notifier(&cpu_notifier_err_inject.nb); + cpuhp_remove_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE); debugfs_remove_recursive(dir); } diff --git a/lib/debugobjects.c b/lib/debugobjects.c index a8e12601eb37..04c1ef717fe0 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -199,7 +199,7 @@ static void free_object(struct debug_obj *obj) * initialized: */ if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache) - sched = keventd_up(); + sched = 1; hlist_add_head(&obj->node, &obj_pool); obj_pool_free++; obj_pool_used--; @@ -362,6 +362,7 @@ void debug_object_init(void *addr, struct debug_obj_descr *descr) __debug_object_init(addr, descr, 0); } +EXPORT_SYMBOL_GPL(debug_object_init); /** * debug_object_init_on_stack - debug checks when an object on stack is @@ -376,6 +377,7 @@ void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr) __debug_object_init(addr, descr, 1); } +EXPORT_SYMBOL_GPL(debug_object_init_on_stack); /** * debug_object_activate - debug checks when an object is activated @@ -449,6 +451,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) } return 0; } +EXPORT_SYMBOL_GPL(debug_object_activate); /** * debug_object_deactivate - debug checks when an object is deactivated @@ -496,6 +499,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_deactivate); /** * debug_object_destroy - debug checks when an object is destroyed @@ -542,6 +546,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) out_unlock: raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_destroy); /** * debug_object_free - debug checks when an object is freed @@ -582,6 +587,7 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr) out_unlock: raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_free); /** * debug_object_assert_init - debug checks when object should be init-ed @@ -626,6 +632,7 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_assert_init); /** * debug_object_active_state - debug checks object usage state machine @@ -673,6 +680,7 @@ debug_object_active_state(void *addr, struct debug_obj_descr *descr, raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_active_state); #ifdef CONFIG_DEBUG_OBJECTS_FREE static void __debug_check_no_obj_freed(const void *address, unsigned long size) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index fcfa1939ac41..8971370bfb16 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -22,6 +22,7 @@ #include <linux/stacktrace.h> #include <linux/dma-debug.h> #include <linux/spinlock.h> +#include <linux/vmalloc.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -43,6 +44,7 @@ enum { dma_debug_page, dma_debug_sg, dma_debug_coherent, + dma_debug_resource, }; enum map_err_types { @@ -150,8 +152,9 @@ static const char *const maperr2str[] = { [MAP_ERR_CHECKED] = "dma map error checked", }; -static const char *type2name[4] = { "single", "page", - "scather-gather", "coherent" }; +static const char *type2name[5] = { "single", "page", + "scather-gather", "coherent", + "resource" }; static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", "DMA_FROM_DEVICE", "DMA_NONE" }; @@ -399,6 +402,9 @@ static void hash_bucket_del(struct dma_debug_entry *entry) static unsigned long long phys_addr(struct dma_debug_entry *entry) { + if (entry->type == dma_debug_resource) + return __pfn_to_phys(entry->pfn) + entry->offset; + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; } @@ -1164,11 +1170,32 @@ static void check_unmap(struct dma_debug_entry *ref) put_hash_bucket(bucket, &flags); } -static void check_for_stack(struct device *dev, void *addr) +static void check_for_stack(struct device *dev, + struct page *page, size_t offset) { - if (object_is_on_stack(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from " - "stack [addr=%p]\n", addr); + void *addr; + struct vm_struct *stack_vm_area = task_stack_vm_area(current); + + if (!stack_vm_area) { + /* Stack is direct-mapped. */ + if (PageHighMem(page)) + return; + addr = page_address(page) + offset; + if (object_is_on_stack(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); + } else { + /* Stack is vmalloced. */ + int i; + + for (i = 0; i < stack_vm_area->nr_pages; i++) { + if (page != stack_vm_area->pages[i]) + continue; + + addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); + break; + } + } } static inline bool overlap(void *addr, unsigned long len, void *start, void *end) @@ -1291,10 +1318,11 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, if (map_single) entry->type = dma_debug_single; + check_for_stack(dev, page, offset); + if (!PageHighMem(page)) { void *addr = page_address(page) + offset; - check_for_stack(dev, addr); check_for_illegal_area(dev, addr, size); } @@ -1386,8 +1414,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; + check_for_stack(dev, sg_page(s), s->offset); + if (!PageHighMem(sg_page(s))) { - check_for_stack(dev, sg_virt(s)); check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); } @@ -1495,6 +1524,49 @@ void debug_dma_free_coherent(struct device *dev, size_t size, } EXPORT_SYMBOL(debug_dma_free_coherent); +void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, + int direction, dma_addr_t dma_addr) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_resource; + entry->dev = dev; + entry->pfn = PHYS_PFN(addr); + entry->offset = offset_in_page(addr); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_resource); + +void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + struct dma_debug_entry ref = { + .type = dma_debug_resource, + .dev = dev, + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_resource); + void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { diff --git a/lib/genalloc.c b/lib/genalloc.c index 0a1139644d32..144fe6b1a03e 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -292,7 +292,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, struct gen_pool_chunk *chunk; unsigned long addr = 0; int order = pool->min_alloc_order; - int nbits, start_bit = 0, end_bit, remain; + int nbits, start_bit, end_bit, remain; #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG BUG_ON(in_nmi()); @@ -307,6 +307,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, if (size > atomic_read(&chunk->avail)) continue; + start_bit = 0; end_bit = chunk_size(chunk) >> order; retry: start_bit = algo(chunk->bits, end_bit, start_bit, diff --git a/lib/idr.c b/lib/idr.c index 6098336df267..52d2979a05e8 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get); * and go back to the ida_pre_get() call. If the ida is full, it will * return %-ENOSPC. * + * Note that callers must ensure that concurrent access to @ida is not possible. + * See ida_simple_get() for a varaint which takes care of locking. + * * @p_id returns a value in the range @starting_id ... %0x7fffffff. */ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) @@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy); * Allocates an id in the range start <= id < end, or returns -ENOSPC. * On memory allocation failure, returns -ENOMEM. * + * Compared to ida_get_new_above() this function does its own locking, and + * should be used unless there are special requirements. + * * Use ida_simple_remove() to get rid of an id. */ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, @@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get); * ida_simple_remove - remove an allocated id. * @ida: the (initialized) ida. * @id: the id returned by ida_simple_get. + * + * Use to release an id allocated with ida_simple_get(). + * + * Compared to ida_remove() this function does its own locking, and should be + * used unless there are special requirements. */ void ida_simple_remove(struct ida *ida, unsigned int id) { diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 7e3138cfc8c9..691a52b634fe 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1,10 +1,14 @@ #include <linux/export.h> +#include <linux/bvec.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/splice.h> #include <net/checksum.h> +#define PIPE_PARANOIA /* for now */ + #define iterate_iovec(i, n, __v, __p, skip, STEP) { \ size_t left; \ size_t wanted = n; \ @@ -290,6 +294,93 @@ done: return wanted - bytes; } +#ifdef PIPE_PARANOIA +static bool sanity(const struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + int idx = i->idx; + int next = pipe->curbuf + pipe->nrbufs; + if (i->iov_offset) { + struct pipe_buffer *p; + if (unlikely(!pipe->nrbufs)) + goto Bad; // pipe must be non-empty + if (unlikely(idx != ((next - 1) & (pipe->buffers - 1)))) + goto Bad; // must be at the last buffer... + + p = &pipe->bufs[idx]; + if (unlikely(p->offset + p->len != i->iov_offset)) + goto Bad; // ... at the end of segment + } else { + if (idx != (next & (pipe->buffers - 1))) + goto Bad; // must be right after the last buffer + } + return true; +Bad: + printk(KERN_ERR "idx = %d, offset = %zd\n", i->idx, i->iov_offset); + printk(KERN_ERR "curbuf = %d, nrbufs = %d, buffers = %d\n", + pipe->curbuf, pipe->nrbufs, pipe->buffers); + for (idx = 0; idx < pipe->buffers; idx++) + printk(KERN_ERR "[%p %p %d %d]\n", + pipe->bufs[idx].ops, + pipe->bufs[idx].page, + pipe->bufs[idx].offset, + pipe->bufs[idx].len); + WARN_ON(1); + return false; +} +#else +#define sanity(i) true +#endif + +static inline int next_idx(int idx, struct pipe_inode_info *pipe) +{ + return (idx + 1) & (pipe->buffers - 1); +} + +static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + struct pipe_buffer *buf; + size_t off; + int idx; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + if (!sanity(i)) + return 0; + + off = i->iov_offset; + idx = i->idx; + buf = &pipe->bufs[idx]; + if (off) { + if (offset == off && buf->page == page) { + /* merge with the last one */ + buf->len += bytes; + i->iov_offset += bytes; + goto out; + } + idx = next_idx(idx, pipe); + buf = &pipe->bufs[idx]; + } + if (idx == pipe->curbuf && pipe->nrbufs) + return 0; + pipe->nrbufs++; + buf->ops = &page_cache_pipe_buf_ops; + get_page(buf->page = page); + buf->offset = offset; + buf->len = bytes; + i->iov_offset = offset + bytes; + i->idx = idx; +out: + i->count -= bytes; + return bytes; +} + /* * Fault in one or more iovecs of the given iov_iter, to a maximum length of * bytes. For each iovec, fault in each page that constitutes the iovec. @@ -306,8 +397,7 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) if (!(i->type & (ITER_BVEC|ITER_KVEC))) { iterate_iovec(i, bytes, v, iov, skip, ({ - err = fault_in_multipages_readable(v.iov_base, - v.iov_len); + err = fault_in_pages_readable(v.iov_base, v.iov_len); if (unlikely(err)) return err; 0;})) @@ -356,9 +446,98 @@ static void memzero_page(struct page *page, size_t offset, size_t len) kunmap_atomic(addr); } +static inline bool allocated(struct pipe_buffer *buf) +{ + return buf->ops == &default_pipe_buf_ops; +} + +static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp) +{ + size_t off = i->iov_offset; + int idx = i->idx; + if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) { + idx = next_idx(idx, i->pipe); + off = 0; + } + *idxp = idx; + *offp = off; +} + +static size_t push_pipe(struct iov_iter *i, size_t size, + int *idxp, size_t *offp) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t off; + int idx; + ssize_t left; + + if (unlikely(size > i->count)) + size = i->count; + if (unlikely(!size)) + return 0; + + left = size; + data_start(i, &idx, &off); + *idxp = idx; + *offp = off; + if (off) { + left -= PAGE_SIZE - off; + if (left <= 0) { + pipe->bufs[idx].len += size; + return size; + } + pipe->bufs[idx].len = PAGE_SIZE; + idx = next_idx(idx, pipe); + } + while (idx != pipe->curbuf || !pipe->nrbufs) { + struct page *page = alloc_page(GFP_USER); + if (!page) + break; + pipe->nrbufs++; + pipe->bufs[idx].ops = &default_pipe_buf_ops; + pipe->bufs[idx].page = page; + pipe->bufs[idx].offset = 0; + if (left <= PAGE_SIZE) { + pipe->bufs[idx].len = left; + return size; + } + pipe->bufs[idx].len = PAGE_SIZE; + left -= PAGE_SIZE; + idx = next_idx(idx, pipe); + } + return size - left; +} + +static size_t copy_pipe_to_iter(const void *addr, size_t bytes, + struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t n, off; + int idx; + + if (!sanity(i)) + return 0; + + bytes = n = push_pipe(i, bytes, &idx, &off); + if (unlikely(!n)) + return 0; + for ( ; n; idx = next_idx(idx, pipe), off = 0) { + size_t chunk = min_t(size_t, n, PAGE_SIZE - off); + memcpy_to_page(pipe->bufs[idx].page, off, addr, chunk); + i->idx = idx; + i->iov_offset = off + chunk; + n -= chunk; + addr += chunk; + } + i->count -= bytes; + return bytes; +} + size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; + if (unlikely(i->type & ITER_PIPE)) + return copy_pipe_to_iter(addr, bytes, i); iterate_and_advance(i, bytes, v, __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), @@ -374,6 +553,10 @@ EXPORT_SYMBOL(copy_to_iter); size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -389,6 +572,10 @@ EXPORT_SYMBOL(copy_from_iter); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, __copy_from_user_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -409,14 +596,20 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); return wanted; - } else + } else if (likely(!(i->type & ITER_PIPE))) return copy_page_to_iter_iovec(page, offset, bytes, i); + else + return copy_page_to_iter_pipe(page, offset, bytes, i); } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } if (i->type & (ITER_BVEC|ITER_KVEC)) { void *kaddr = kmap_atomic(page); size_t wanted = copy_from_iter(kaddr + offset, bytes, i); @@ -427,8 +620,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); +static size_t pipe_zero(size_t bytes, struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t n, off; + int idx; + + if (!sanity(i)) + return 0; + + bytes = n = push_pipe(i, bytes, &idx, &off); + if (unlikely(!n)) + return 0; + + for ( ; n; idx = next_idx(idx, pipe), off = 0) { + size_t chunk = min_t(size_t, n, PAGE_SIZE - off); + memzero_page(pipe->bufs[idx].page, off, chunk); + i->idx = idx; + i->iov_offset = off + chunk; + n -= chunk; + } + i->count -= bytes; + return bytes; +} + size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) + return pipe_zero(bytes, i); iterate_and_advance(i, bytes, v, __clear_user(v.iov_base, v.iov_len), memzero_page(v.bv_page, v.bv_offset, v.bv_len), @@ -443,6 +662,11 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { char *kaddr = kmap_atomic(page), *p = kaddr + offset; + if (unlikely(i->type & ITER_PIPE)) { + kunmap_atomic(kaddr); + WARN_ON(1); + return 0; + } iterate_all_kinds(i, bytes, v, __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -455,8 +679,51 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); +static void pipe_advance(struct iov_iter *i, size_t size) +{ + struct pipe_inode_info *pipe = i->pipe; + struct pipe_buffer *buf; + int idx = i->idx; + size_t off = i->iov_offset, orig_sz; + + if (unlikely(i->count < size)) + size = i->count; + orig_sz = size; + + if (size) { + if (off) /* make it relative to the beginning of buffer */ + size += off - pipe->bufs[idx].offset; + while (1) { + buf = &pipe->bufs[idx]; + if (size <= buf->len) + break; + size -= buf->len; + idx = next_idx(idx, pipe); + } + buf->len = size; + i->idx = idx; + off = i->iov_offset = buf->offset + size; + } + if (off) + idx = next_idx(idx, pipe); + if (pipe->nrbufs) { + int unused = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + /* [curbuf,unused) is in use. Free [idx,unused) */ + while (idx != unused) { + pipe_buf_release(pipe, &pipe->bufs[idx]); + idx = next_idx(idx, pipe); + pipe->nrbufs--; + } + } + i->count -= orig_sz; +} + void iov_iter_advance(struct iov_iter *i, size_t size) { + if (unlikely(i->type & ITER_PIPE)) { + pipe_advance(i, size); + return; + } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -466,6 +733,8 @@ EXPORT_SYMBOL(iov_iter_advance); */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) + return i->count; // it is a silly place, anyway if (i->nr_segs == 1) return i->count; else if (i->type & ITER_BVEC) @@ -501,6 +770,19 @@ void iov_iter_bvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_bvec); +void iov_iter_pipe(struct iov_iter *i, int direction, + struct pipe_inode_info *pipe, + size_t count) +{ + BUG_ON(direction != ITER_PIPE); + i->type = direction; + i->pipe = pipe; + i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_pipe); + unsigned long iov_iter_alignment(const struct iov_iter *i) { unsigned long res = 0; @@ -509,6 +791,11 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) if (!size) return 0; + if (unlikely(i->type & ITER_PIPE)) { + if (i->iov_offset && allocated(&i->pipe->bufs[i->idx])) + return size | i->iov_offset; + return size; + } iterate_all_kinds(i, size, v, (res |= (unsigned long)v.iov_base | v.iov_len, 0), res |= v.bv_offset | v.bv_len, @@ -525,6 +812,11 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) if (!size) return 0; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return ~0U; + } + iterate_all_kinds(i, size, v, (res |= (!res ? 0 : (unsigned long)v.iov_base) | (size != v.iov_len ? size : 0), 0), @@ -537,6 +829,47 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_gap_alignment); +static inline size_t __pipe_get_pages(struct iov_iter *i, + size_t maxsize, + struct page **pages, + int idx, + size_t *start) +{ + struct pipe_inode_info *pipe = i->pipe; + ssize_t n = push_pipe(i, maxsize, &idx, start); + if (!n) + return -EFAULT; + + maxsize = n; + n += *start; + while (n > 0) { + get_page(*pages++ = pipe->bufs[idx].page); + idx = next_idx(idx, pipe); + n -= PAGE_SIZE; + } + + return maxsize; +} + +static ssize_t pipe_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + unsigned npages; + size_t capacity; + int idx; + + if (!sanity(i)) + return -EFAULT; + + data_start(i, &idx, start); + /* some of this one + all after this one */ + npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1; + capacity = min(npages,maxpages) * PAGE_SIZE - *start; + + return __pipe_get_pages(i, min(maxsize, capacity), pages, idx, start); +} + ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) @@ -547,6 +880,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (!maxsize) return 0; + if (unlikely(i->type & ITER_PIPE)) + return pipe_get_pages(i, pages, maxsize, maxpages, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -582,6 +917,37 @@ static struct page **get_pages_array(size_t n) return p; } +static ssize_t pipe_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + struct page **p; + size_t n; + int idx; + int npages; + + if (!sanity(i)) + return -EFAULT; + + data_start(i, &idx, start); + /* some of this one + all after this one */ + npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1; + n = npages * PAGE_SIZE - *start; + if (maxsize > n) + maxsize = n; + else + npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); + p = get_pages_array(npages); + if (!p) + return -ENOMEM; + n = __pipe_get_pages(i, maxsize, p, idx, start); + if (n > 0) + *pages = p; + else + kvfree(p); + return n; +} + ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) @@ -594,6 +960,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (!maxsize) return 0; + if (unlikely(i->type & ITER_PIPE)) + return pipe_get_pages_alloc(i, pages, maxsize, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -635,6 +1003,10 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_from_user(v.iov_base, @@ -673,6 +1045,10 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); /* for now */ + return 0; + } iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, @@ -712,7 +1088,20 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) if (!size) return 0; - iterate_all_kinds(i, size, v, ({ + if (unlikely(i->type & ITER_PIPE)) { + struct pipe_inode_info *pipe = i->pipe; + size_t off; + int idx; + + if (!sanity(i)) + return 0; + + data_start(i, &idx, &off); + /* some of this one + all after this one */ + npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1; + if (npages >= maxpages) + return maxpages; + } else iterate_all_kinds(i, size, v, ({ unsigned long p = (unsigned long)v.iov_base; npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - p / PAGE_SIZE; @@ -737,6 +1126,10 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; + if (unlikely(new->type & ITER_PIPE)) { + WARN_ON(1); + return NULL; + } if (new->type & ITER_BVEC) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), @@ -749,6 +1142,28 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) } EXPORT_SYMBOL(dup_iter); +/** + * import_iovec() - Copy an array of &struct iovec from userspace + * into the kernel, check that it is valid, and initialize a new + * &struct iov_iter iterator to access it. + * + * @type: One of %READ or %WRITE. + * @uvector: Pointer to the userspace array. + * @nr_segs: Number of elements in userspace array. + * @fast_segs: Number of elements in @iov. + * @iov: (input and output parameter) Pointer to pointer to (usually small + * on-stack) kernel array. + * @i: Pointer to iterator that will be initialized on success. + * + * If the array pointed to by *@iov is large enough to hold all @nr_segs, + * then this function places %NULL in *@iov on return. Otherwise, a new + * array will be allocated and the result placed in *@iov. This means that + * the caller may call kfree() on *@iov regardless of whether the small + * on-stack array was used or not (and regardless of whether this function + * returns an error or not). + * + * Return: 0 on success or negative error code on error. + */ int import_iovec(int type, const struct iovec __user * uvector, unsigned nr_segs, unsigned fast_segs, struct iovec **iov, struct iov_iter *i) diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 836f7db4e548..1d6565e81030 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -74,7 +74,7 @@ void irq_poll_complete(struct irq_poll *iop) } EXPORT_SYMBOL(irq_poll_complete); -static void irq_poll_softirq(struct softirq_action *h) +static void __latent_entropy irq_poll_softirq(struct softirq_action *h) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = irq_poll_budget; @@ -184,30 +184,21 @@ void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn) } EXPORT_SYMBOL(irq_poll_init); -static int irq_poll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int irq_poll_cpu_dead(unsigned int cpu) { /* * If a CPU goes away, splice its entries to the current CPU * and trigger a run of the softirq */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); - local_irq_enable(); - } + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), + this_cpu_ptr(&blk_cpu_iopoll)); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_enable(); - return NOTIFY_OK; + return 0; } -static struct notifier_block irq_poll_cpu_notifier = { - .notifier_call = irq_poll_cpu_notify, -}; - static __init int irq_poll_setup(void) { int i; @@ -216,7 +207,8 @@ static __init int irq_poll_setup(void) INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq); - register_hotcpu_notifier(&irq_poll_cpu_notifier); + cpuhp_setup_state_nocalls(CPUHP_IRQ_POLL_DEAD, "irq_poll:dead", NULL, + irq_poll_cpu_dead); return 0; } subsys_initcall(irq_poll_setup); diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index f6c2c1e7779c..9a2b811966eb 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -56,7 +56,7 @@ static const char *kobject_actions[] = { * kobject_action_type - translate action string to numeric type * * @buf: buffer containing the action string, newline is ignored - * @len: length of buffer + * @count: length of buffer * @type: pointer to the location to store the action type * * Returns 0 if the action string was recognized. @@ -154,8 +154,8 @@ static void cleanup_uevent_env(struct subprocess_info *info) /** * kobject_uevent_env - send an uevent with environmental data * - * @action: action that is happening * @kobj: struct kobject that the action is happening to + * @action: action that is happening * @envp_ext: pointer to environmental data * * Returns 0 if kobject_uevent_env() is completed with success or the @@ -363,8 +363,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_env); /** * kobject_uevent - notify userspace by sending an uevent * - * @action: action that is happening * @kobj: struct kobject that the action is happening to + * @action: action that is happening * * Returns 0 if kobject_uevent() is completed with success or the * corresponding error when it fails. diff --git a/lib/kstrtox.c b/lib/kstrtox.c index d8a5cf66c316..b8e2080c1a47 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -48,11 +48,9 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long { unsigned long long res; unsigned int rv; - int overflow; res = 0; rv = 0; - overflow = 0; while (*s) { unsigned int val; @@ -71,15 +69,13 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long */ if (unlikely(res & (~0ull << 60))) { if (res > div_u64(ULLONG_MAX - val, base)) - overflow = 1; + rv |= KSTRTOX_OVERFLOW; } res = res * base + val; rv++; s++; } *p = res; - if (overflow) - rv |= KSTRTOX_OVERFLOW; return rv; } diff --git a/lib/list_debug.c b/lib/list_debug.c index 3859bf63561c..7f7bfa55eb6d 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -2,8 +2,7 @@ * Copyright 2006, Red Hat, Inc., Dave Jones * Released under the General Public License (GPL). * - * This file contains the linked list implementations for - * DEBUG_LIST. + * This file contains the linked list validation for DEBUG_LIST. */ #include <linux/export.h> @@ -13,88 +12,48 @@ #include <linux/rculist.h> /* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! + * Check that the data structures for the list manipulations are reasonably + * valid. Failures here indicate memory corruption (and possibly an exploit + * attempt). */ -void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next) +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) { - WARN(next->prev != prev, - "list_add corruption. next->prev should be " - "prev (%p), but was %p. (next=%p).\n", + CHECK_DATA_CORRUPTION(next->prev != prev, + "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", prev, next->prev, next); - WARN(prev->next != next, - "list_add corruption. prev->next should be " - "next (%p), but was %p. (prev=%p).\n", + CHECK_DATA_CORRUPTION(prev->next != next, + "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", next, prev->next, prev); - WARN(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); - next->prev = new; - new->next = next; - new->prev = prev; - WRITE_ONCE(prev->next, new); + CHECK_DATA_CORRUPTION(new == prev || new == next, + "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next); + + return true; } -EXPORT_SYMBOL(__list_add); +EXPORT_SYMBOL(__list_add_valid); -void __list_del_entry(struct list_head *entry) +bool __list_del_entry_valid(struct list_head *entry) { struct list_head *prev, *next; prev = entry->prev; next = entry->next; - if (WARN(next == LIST_POISON1, + CHECK_DATA_CORRUPTION(next == LIST_POISON1, "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1) || - WARN(prev == LIST_POISON2, + entry, LIST_POISON1); + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2) || - WARN(prev->next != entry, - "list_del corruption. prev->next should be %p, " - "but was %p\n", entry, prev->next) || - WARN(next->prev != entry, - "list_del corruption. next->prev should be %p, " - "but was %p\n", entry, next->prev)) - return; - - __list_del(prev, next); -} -EXPORT_SYMBOL(__list_del_entry); + entry, LIST_POISON2); + CHECK_DATA_CORRUPTION(prev->next != entry, + "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next); + CHECK_DATA_CORRUPTION(next->prev != entry, + "list_del corruption. next->prev should be %p, but was %p\n", + entry, next->prev); + return true; -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is - * in an undefined state. - */ -void list_del(struct list_head *entry) -{ - __list_del_entry(entry); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; -} -EXPORT_SYMBOL(list_del); - -/* - * RCU variants. - */ -void __list_add_rcu(struct list_head *new, - struct list_head *prev, struct list_head *next) -{ - WARN(next->prev != prev, - "list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - WARN(prev->next != next, - "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - new->next = next; - new->prev = prev; - rcu_assign_pointer(list_next_rcu(prev), new); - next->prev = new; } -EXPORT_SYMBOL(__list_add_rcu); +EXPORT_SYMBOL(__list_del_entry_valid); diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 872a15a2a637..f3a217ea0388 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -980,23 +980,23 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) #ifndef CONFIG_PROVE_LOCKING if (expected == FAILURE && debug_locks) { expected_testcase_failures++; - printk("failed|"); + pr_cont("failed|"); } else #endif if (debug_locks != expected) { unexpected_testcase_failures++; - printk("FAILED|"); + pr_cont("FAILED|"); dump_stack(); } else { testcase_successes++; - printk(" ok |"); + pr_cont(" ok |"); } testcase_total++; if (debug_locks_verbose) - printk(" lockclass mask: %x, debug_locks: %d, expected: %d\n", + pr_cont(" lockclass mask: %x, debug_locks: %d, expected: %d\n", lockclass_mask, debug_locks, expected); /* * Some tests (e.g. double-unlock) might corrupt the preemption @@ -1021,26 +1021,26 @@ static inline void print_testname(const char *testname) #define DO_TESTCASE_1(desc, name, nr) \ print_testname(desc"/"#nr); \ dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_1B(desc, name, nr) \ print_testname(desc"/"#nr); \ dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_3(desc, name, nr) \ print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_3RW(desc, name, nr) \ print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\ dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_6(desc, name) \ print_testname(desc); \ @@ -1050,7 +1050,7 @@ static inline void print_testname(const char *testname) dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_6_SUCCESS(desc, name) \ print_testname(desc); \ @@ -1060,7 +1060,7 @@ static inline void print_testname(const char *testname) dotest(name##_mutex, SUCCESS, LOCKTYPE_MUTEX); \ dotest(name##_wsem, SUCCESS, LOCKTYPE_RWSEM); \ dotest(name##_rsem, SUCCESS, LOCKTYPE_RWSEM); \ - printk("\n"); + pr_cont("\n"); /* * 'read' variant: rlocks must not trigger. @@ -1073,7 +1073,7 @@ static inline void print_testname(const char *testname) dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_2I(desc, name, nr) \ DO_TESTCASE_1("hard-"desc, name##_hard, nr); \ @@ -1726,25 +1726,25 @@ static void ww_tests(void) dotest(ww_test_fail_acquire, SUCCESS, LOCKTYPE_WW); dotest(ww_test_normal, SUCCESS, LOCKTYPE_WW); dotest(ww_test_unneeded_slow, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("ww contexts mixing"); dotest(ww_test_two_contexts, FAILURE, LOCKTYPE_WW); dotest(ww_test_diff_class, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("finishing ww context"); dotest(ww_test_context_done_twice, FAILURE, LOCKTYPE_WW); dotest(ww_test_context_unlock_twice, FAILURE, LOCKTYPE_WW); dotest(ww_test_context_fini_early, FAILURE, LOCKTYPE_WW); dotest(ww_test_context_lock_after_done, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("locking mismatches"); dotest(ww_test_object_unlock_twice, FAILURE, LOCKTYPE_WW); dotest(ww_test_object_lock_unbalanced, FAILURE, LOCKTYPE_WW); dotest(ww_test_object_lock_stale_context, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("EDEADLK handling"); dotest(ww_test_edeadlk_normal, SUCCESS, LOCKTYPE_WW); @@ -1757,11 +1757,11 @@ static void ww_tests(void) dotest(ww_test_edeadlk_acquire_more_edeadlk_slow, FAILURE, LOCKTYPE_WW); dotest(ww_test_edeadlk_acquire_wrong, FAILURE, LOCKTYPE_WW); dotest(ww_test_edeadlk_acquire_wrong_slow, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("spinlock nest unlocked"); dotest(ww_test_spin_nest_unlocked, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); printk(" -----------------------------------------------------\n"); printk(" |block | try |context|\n"); @@ -1771,25 +1771,25 @@ static void ww_tests(void) dotest(ww_test_context_block, FAILURE, LOCKTYPE_WW); dotest(ww_test_context_try, SUCCESS, LOCKTYPE_WW); dotest(ww_test_context_context, SUCCESS, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("try"); dotest(ww_test_try_block, FAILURE, LOCKTYPE_WW); dotest(ww_test_try_try, SUCCESS, LOCKTYPE_WW); dotest(ww_test_try_context, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("block"); dotest(ww_test_block_block, FAILURE, LOCKTYPE_WW); dotest(ww_test_block_try, SUCCESS, LOCKTYPE_WW); dotest(ww_test_block_context, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("spinlock"); dotest(ww_test_spin_block, FAILURE, LOCKTYPE_WW); dotest(ww_test_spin_try, SUCCESS, LOCKTYPE_WW); dotest(ww_test_spin_context, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); } void locking_selftest(void) @@ -1829,32 +1829,32 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); print_testname("recursive read-lock"); - printk(" |"); + pr_cont(" |"); dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK); - printk(" |"); + pr_cont(" |"); dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM); - printk("\n"); + pr_cont("\n"); print_testname("recursive read-lock #2"); - printk(" |"); + pr_cont(" |"); dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK); - printk(" |"); + pr_cont(" |"); dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM); - printk("\n"); + pr_cont("\n"); print_testname("mixed read-write-lock"); - printk(" |"); + pr_cont(" |"); dotest(rlock_AA2, FAILURE, LOCKTYPE_RWLOCK); - printk(" |"); + pr_cont(" |"); dotest(rsem_AA2, FAILURE, LOCKTYPE_RWSEM); - printk("\n"); + pr_cont("\n"); print_testname("mixed write-read-lock"); - printk(" |"); + pr_cont(" |"); dotest(rlock_AA3, FAILURE, LOCKTYPE_RWLOCK); - printk(" |"); + pr_cont(" |"); dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM); - printk("\n"); + pr_cont("\n"); printk(" --------------------------------------------------------------------------\n"); diff --git a/lib/lockref.c b/lib/lockref.c index 5a92189ad711..c4bfcb8836cd 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -20,7 +20,7 @@ if (likely(old.lock_count == prev.lock_count)) { \ SUCCESS; \ } \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } \ } while (0) diff --git a/lib/mpi/mpi-pow.c b/lib/mpi/mpi-pow.c index 5464c8744ea9..e24388a863a7 100644 --- a/lib/mpi/mpi-pow.c +++ b/lib/mpi/mpi-pow.c @@ -64,8 +64,13 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) if (!esize) { /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 * depending on if MOD equals 1. */ - rp[0] = 1; res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1; + if (res->nlimbs) { + if (mpi_resize(res, 1) < 0) + goto enomem; + rp = res->d; + rp[0] = 1; + } res->sign = 0; goto leave; } diff --git a/lib/nlattr.c b/lib/nlattr.c index fce1e9afc6d9..b42b8577fc23 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -14,7 +14,7 @@ #include <linux/types.h> #include <net/netlink.h> -static const u16 nla_attr_minlen[NLA_TYPE_MAX+1] = { +static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 26caf51cc238..75554754eadf 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,21 +16,23 @@ #include <linux/delay.h> #include <linux/kprobes.h> #include <linux/nmi.h> +#include <linux/cpu.h> -#ifdef arch_trigger_all_cpu_backtrace +#ifdef arch_trigger_cpumask_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +/* "in progress" flag of arch_trigger_cpumask_backtrace */ static unsigned long backtrace_flag; /* - * When raise() is called it will be is passed a pointer to the + * When raise() is called it will be passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() * directly from their raise() functions may rely on the mask * they are passed being updated as a side effect of this call. */ -void nmi_trigger_all_cpu_backtrace(bool include_self, +void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self, void (*raise)(cpumask_t *mask)) { int i, this_cpu = get_cpu(); @@ -44,13 +46,22 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, return; } - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) + cpumask_copy(to_cpumask(backtrace_mask), mask); + if (exclude_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); + /* + * Don't try to send an NMI to this cpu; it may work on some + * architectures, but on others it may not, and we'll get + * information at least as useful just by doing a dump_stack() here. + * Note that nmi_cpu_backtrace(NULL) will clear the cpu bit. + */ + if (cpumask_test_cpu(this_cpu, to_cpumask(backtrace_mask))) + nmi_cpu_backtrace(NULL); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("Sending NMI to %s CPUs:\n", - (include_self ? "all" : "other")); + pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n", + this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask)); raise(to_cpumask(backtrace_mask)); } @@ -77,11 +88,16 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - pr_warn("NMI backtrace for cpu %d\n", cpu); - if (regs) - show_regs(regs); - else - dump_stack(); + if (regs && cpu_in_idle(instruction_pointer(regs))) { + pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n", + cpu, instruction_pointer(regs)); + } else { + pr_warn("NMI backtrace for cpu %d\n", cpu); + if (regs) + show_regs(regs); + else + dump_stack(); + } cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } diff --git a/lib/parser.c b/lib/parser.c index b6d11631231b..3278958b472a 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -152,6 +152,36 @@ static int match_number(substring_t *s, int *result, int base) } /** + * match_u64int: scan a number in the given base from a substring_t + * @s: substring to be scanned + * @result: resulting u64 on success + * @base: base to use when converting string + * + * Description: Given a &substring_t and a base, attempts to parse the substring + * as a number in that base. On success, sets @result to the integer represented + * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + */ +static int match_u64int(substring_t *s, u64 *result, int base) +{ + char *buf; + int ret; + u64 val; + size_t len = s->to - s->from; + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, s->from, len); + buf[len] = '\0'; + + ret = kstrtoull(buf, base, &val); + if (!ret) + *result = val; + kfree(buf); + return ret; +} + +/** * match_int: - scan a decimal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success @@ -167,6 +197,23 @@ int match_int(substring_t *s, int *result) EXPORT_SYMBOL(match_int); /** + * match_u64: - scan a decimal representation of a u64 from + * a substring_t + * @s: substring_t to be scanned + * @result: resulting unsigned long long on success + * + * Description: Attempts to parse the &substring_t @s as a long decimal + * integer. On success, sets @result to the integer represented by the + * string and returns 0. + * Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + */ +int match_u64(substring_t *s, u64 *result) +{ + return match_u64int(s, result, 0); +} +EXPORT_SYMBOL(match_u64); + +/** * match_octal: - scan an octal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 27fe74948882..9ac959ef4cae 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -33,6 +33,7 @@ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) +static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) @@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, atomic_long_set(&ref->count, start_count); ref->release = release; + ref->confirm_switch = NULL; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); @@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref) unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { + /* non-NULL confirm_switch indicates switching in progress */ + WARN_ON_ONCE(ref->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } @@ -161,66 +165,23 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { - if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { - /* switching from percpu to atomic */ - ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; - - /* - * Non-NULL ->confirm_switch is used to indicate that - * switching is in progress. Use noop one if unspecified. - */ - WARN_ON_ONCE(ref->confirm_switch); - ref->confirm_switch = - confirm_switch ?: percpu_ref_noop_confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); - } else if (confirm_switch) { - /* - * Somebody already set ATOMIC. Switching may still be in - * progress. @confirm_switch must be invoked after the - * switching is complete and a full sched RCU grace period - * has passed. Wait synchronously for the previous - * switching and schedule @confirm_switch invocation. - */ - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - ref->confirm_switch = confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); + if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { + if (confirm_switch) + confirm_switch(ref); + return; } -} -/** - * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode - * @ref: percpu_ref to switch to atomic mode - * @confirm_switch: optional confirmation callback - * - * There's no reason to use this function for the usual reference counting. - * Use percpu_ref_kill[_and_confirm](). - * - * Schedule switching of @ref to atomic mode. All its percpu counts will - * be collected to the main atomic counter. On completion, when all CPUs - * are guaraneed to be in atomic mode, @confirm_switch, which may not - * block, is invoked. This function may be invoked concurrently with all - * the get/put operations and can safely be mixed with kill and reinit - * operations. Note that @ref will stay in atomic mode across kill/reinit - * cycles until percpu_ref_switch_to_percpu() is called. - * - * This function normally doesn't block and can be called from any context - * but it may block if @confirm_kill is specified and @ref is already in - * the process of switching to atomic mode. In such cases, @confirm_switch - * will be invoked after the switching is complete. - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. - */ -void percpu_ref_switch_to_atomic(struct percpu_ref *ref, - percpu_ref_func_t *confirm_switch) -{ - ref->force_atomic = true; - __percpu_ref_switch_to_atomic(ref, confirm_switch); + /* switching from percpu to atomic */ + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + + /* + * Non-NULL ->confirm_switch is used to indicate that switching is + * in progress. Use noop one if unspecified. + */ + ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) @@ -233,8 +194,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* @@ -250,6 +209,58 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } +static void __percpu_ref_switch_mode(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + lockdep_assert_held(&percpu_ref_switch_lock); + + /* + * If the previous ATOMIC switching hasn't finished yet, wait for + * its completion. If the caller ensures that ATOMIC switching + * isn't in progress, this function can be called from any context. + */ + wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch, + percpu_ref_switch_lock); + + if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD)) + __percpu_ref_switch_to_atomic(ref, confirm_switch); + else + __percpu_ref_switch_to_percpu(ref); +} + +/** + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * @confirm_switch: optional confirmation callback + * + * There's no reason to use this function for the usual reference counting. + * Use percpu_ref_kill[_and_confirm](). + * + * Schedule switching of @ref to atomic mode. All its percpu counts will + * be collected to the main atomic counter. On completion, when all CPUs + * are guaraneed to be in atomic mode, @confirm_switch, which may not + * block, is invoked. This function may be invoked concurrently with all + * the get/put operations and can safely be mixed with kill and reinit + * operations. Note that @ref will stay in atomic mode across kill/reinit + * cycles until percpu_ref_switch_to_percpu() is called. + * + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. + */ +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + + ref->force_atomic = true; + __percpu_ref_switch_mode(ref, confirm_switch); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); +} + /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode * @ref: percpu_ref to switch to percpu mode @@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * - * This function normally doesn't block and can be called from any context - * but it may block if @ref is in the process of switching to atomic mode - * by percpu_ref_switch_atomic(). + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + ref->force_atomic = false; + __percpu_ref_switch_mode(ref, NULL); - /* a dying or dead ref can't be switched to percpu mode w/o reinit */ - if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) - __percpu_ref_switch_to_percpu(ref); + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } /** @@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the - * process of switching to atomic mode by percpu_ref_switch_atomic(). - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. + * process of switching to atomic mode by percpu_ref_switch_to_atomic(). */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, "%s called more than once on %pf!", __func__, ref->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; - __percpu_ref_switch_to_atomic(ref, confirm_kill); + __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); @@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); */ void percpu_ref_reinit(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); - if (!ref->force_atomic) - __percpu_ref_switch_to_percpu(ref); + __percpu_ref_switch_mode(ref, NULL); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 72d36113ccaa..c8cebb137076 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -158,25 +158,21 @@ EXPORT_SYMBOL(percpu_counter_destroy); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); -static void compute_batch_value(void) +static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); + return 0; } -static int percpu_counter_hotcpu_callback(struct notifier_block *nb, - unsigned long action, void *hcpu) +static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU - unsigned int cpu; struct percpu_counter *fbc; - compute_batch_value(); - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; + compute_batch_value(cpu); - cpu = (unsigned long)hcpu; spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; @@ -190,7 +186,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, } spin_unlock_irq(&percpu_counters_lock); #endif - return NOTIFY_OK; + return 0; } /* @@ -222,8 +218,15 @@ EXPORT_SYMBOL(__percpu_counter_compare); static int __init percpu_counter_startup(void) { - compute_batch_value(); - hotcpu_notifier(percpu_counter_hotcpu_callback, 0); + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", + compute_batch_value, NULL); + WARN_ON(ret < 0); + ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, + "lib/percpu_cnt:dead", NULL, + percpu_counter_cpu_dead); + WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 91f0727e3cad..2e8c6f7aa56e 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->parent); + node->shift, node->count, node->exceptional, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) tag_clear(node, i, 0); node->slots[0] = NULL; - node->count = 0; kmem_cache_free(radix_tree_node_cachep, node); } @@ -522,8 +521,13 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_internal_node(slot)) + if (radix_tree_is_internal_node(slot)) { entry_to_node(slot)->parent = node; + } else { + /* Moving an exceptional root->rnode to a node */ + if (radix_tree_exceptional_entry(slot)) + node->exceptional = 1; + } node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -534,6 +538,104 @@ out: } /** + * radix_tree_shrink - shrink radix tree to minimum height + * @root radix tree root + */ +static inline void radix_tree_shrink(struct radix_tree_root *root, + radix_tree_update_node_t update_node, + void *private) +{ + for (;;) { + struct radix_tree_node *node = root->rnode; + struct radix_tree_node *child; + + if (!radix_tree_is_internal_node(node)) + break; + node = entry_to_node(node); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. + */ + if (node->count != 1) + break; + child = node->slots[0]; + if (!child) + break; + if (!radix_tree_is_internal_node(child) && node->shift) + break; + + if (radix_tree_is_internal_node(child)) + entry_to_node(child)->parent = NULL; + + /* + * We don't need rcu_assign_pointer(), since we are simply + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it + * (node->slots[0]), it will be safe to dereference the new + * one (root->rnode) as far as dependent read barriers go. + */ + root->rnode = child; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page has 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. + */ + node->count = 0; + if (!radix_tree_is_internal_node(child)) { + node->slots[0] = RADIX_TREE_RETRY; + if (update_node) + update_node(node, private); + } + + radix_tree_node_free(node); + } +} + +static void delete_node(struct radix_tree_root *root, + struct radix_tree_node *node, + radix_tree_update_node_t update_node, void *private) +{ + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == entry_to_node(root->rnode)) + radix_tree_shrink(root, update_node, private); + return; + } + + parent = node->parent; + if (parent) { + parent->slots[node->offset] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->rnode = NULL; + } + + radix_tree_node_free(node); + + node = parent; + } while (node); +} + +/** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key @@ -649,6 +751,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, if (node) { unsigned offset = get_slot_offset(node, slot); node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -746,6 +850,85 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +static void replace_slot(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + bool warn_typeswitch) +{ + void *old = rcu_dereference_raw(*slot); + int count, exceptional; + + WARN_ON_ONCE(radix_tree_is_internal_node(item)); + + count = !!item - !!old; + exceptional = !!radix_tree_exceptional_entry(item) - + !!radix_tree_exceptional_entry(old); + + WARN_ON_ONCE(warn_typeswitch && (count || exceptional)); + + if (node) { + node->count += count; + node->exceptional += exceptional; + } + + rcu_assign_pointer(*slot, item); +} + +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + radix_tree_update_node_t update_node, void *private) +{ + /* + * This function supports replacing exceptional entries and + * deleting entries, but that needs accounting against the + * node unless the slot is root->rnode. + */ + replace_slot(root, node, slot, item, + !node && slot != (void **)&root->rnode); + + if (!node) + return; + + if (update_node) + update_node(node, private); + + delete_node(root, node, update_node, private); +} + +/** + * radix_tree_replace_slot - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked + * across slot lookup and replacement. + * + * NOTE: This cannot be used to switch between non-entries (empty slots), + * regular entries, and exceptional entries, as that requires accounting + * inside the radix tree node. When switching from one type of entry or + * deleting, use __radix_tree_lookup() and __radix_tree_replace(). + */ +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item) +{ + replace_slot(root, NULL, slot, item, true); +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root @@ -1394,75 +1577,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) #endif /* CONFIG_SHMEM && CONFIG_SWAP */ /** - * radix_tree_shrink - shrink radix tree to minimum height - * @root radix tree root - */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) -{ - bool shrunk = false; - - for (;;) { - struct radix_tree_node *node = root->rnode; - struct radix_tree_node *child; - - if (!radix_tree_is_internal_node(node)) - break; - node = entry_to_node(node); - - /* - * The candidate node has more than one child, or its child - * is not at the leftmost slot, or the child is a multiorder - * entry, we cannot shrink. - */ - if (node->count != 1) - break; - child = node->slots[0]; - if (!child) - break; - if (!radix_tree_is_internal_node(child) && node->shift) - break; - - if (radix_tree_is_internal_node(child)) - entry_to_node(child)->parent = NULL; - - /* - * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another: if it - * was safe to dereference the old pointer to it - * (node->slots[0]), it will be safe to dereference the new - * one (root->rnode) as far as dependent read barriers go. - */ - root->rnode = child; - - /* - * We have a dilemma here. The node's slot[0] must not be - * NULLed in case there are concurrent lookups expecting to - * find the item. However if this was a bottom-level node, - * then it may be subject to the slot pointer being visible - * to callers dereferencing it. If item corresponding to - * slot[0] is subsequently deleted, these callers would expect - * their slot to become empty sooner or later. - * - * For example, lockless pagecache will look up a slot, deref - * the page pointer, and if the page has 0 refcount it means it - * was concurrently deleted from pagecache so try the deref - * again. Fortunately there is already a requirement for logic - * to retry the entire slot lookup -- the indirect pointer - * problem (replacing direct root node with an indirect pointer - * also results in a stale slot). So tag the slot as indirect - * to force callers to retry. - */ - if (!radix_tree_is_internal_node(child)) - node->slots[0] = RADIX_TREE_RETRY; - - radix_tree_node_free(node); - shrunk = true; - } - - return shrunk; -} - -/** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root * @node: node containing @index @@ -1470,39 +1584,11 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the * node and shrinking the tree. - * - * Returns %true if @node was freed, %false otherwise. */ -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - bool deleted = false; - - do { - struct radix_tree_node *parent; - - if (node->count) { - if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); - return deleted; - } - - parent = node->parent; - if (parent) { - parent->slots[node->offset] = NULL; - parent->count--; - } else { - root_tag_clear_all(root); - root->rnode = NULL; - } - - radix_tree_node_free(node); - deleted = true; - - node = parent; - } while (node); - - return deleted; + delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, @@ -1559,10 +1645,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); - node->slots[offset] = NULL; - node->count--; - - __radix_tree_delete_node(root, node); + __radix_tree_replace(root, node, slot, NULL, NULL, NULL); return entry; } @@ -1583,15 +1666,10 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); -struct radix_tree_node *radix_tree_replace_clear_tags( - struct radix_tree_root *root, - unsigned long index, void *entry) +void radix_tree_clear_tags(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(root, index, &node, &slot); - if (node) { unsigned int tag, offset = get_slot_offset(node, slot); for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) @@ -1600,9 +1678,6 @@ struct radix_tree_node *radix_tree_replace_clear_tags( /* Clear root node tags */ root->gfp_mask &= __GFP_BITS_MASK; } - - radix_tree_replace_slot(slot, entry); - return node; } /** @@ -1650,32 +1725,31 @@ static __init void radix_tree_init_maxnodes(void) } } -static int radix_tree_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int radix_tree_cpu_dead(unsigned int cpu) { - int cpu = (long)hcpu; struct radix_tree_preload *rtp; struct radix_tree_node *node; /* Free per-cpu pool of preloaded nodes */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - rtp = &per_cpu(radix_tree_preloads, cpu); - while (rtp->nr) { - node = rtp->nodes; - rtp->nodes = node->private_data; - kmem_cache_free(radix_tree_node_cachep, node); - rtp->nr--; - } + rtp = &per_cpu(radix_tree_preloads, cpu); + while (rtp->nr) { + node = rtp->nodes; + rtp->nodes = node->private_data; + kmem_cache_free(radix_tree_node_cachep, node); + rtp->nr--; } - return NOTIFY_OK; + return 0; } void __init radix_tree_init(void) { + int ret; radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); radix_tree_init_maxnodes(); - hotcpu_notifier(radix_tree_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", + NULL, radix_tree_cpu_dead); + WARN_ON(ret < 0); } diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore index 0a7e494b2bcd..f01b1cb04f91 100644 --- a/lib/raid6/.gitignore +++ b/lib/raid6/.gitignore @@ -3,3 +3,4 @@ altivec*.c int*.c tables.c neon?.c +s390vx?.c diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 3b10a48fa040..3057011f5599 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -3,10 +3,11 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ int8.o int16.o int32.o -raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o raid6_pq-$(CONFIG_TILEGX) += tilegx8.o +raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o hostprogs-y += mktables @@ -116,6 +117,11 @@ $(obj)/tilegx8.c: UNROLL := 8 $(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) +targets += s390vx8.c +$(obj)/s390vx8.c: UNROLL := 8 +$(obj)/s390vx8.c: $(src)/s390vx.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + quiet_cmd_mktable = TABLE $@ cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 975c6e0434bd..7857049fd7d3 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -49,6 +49,10 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_avx2x1, &raid6_avx2x2, #endif +#ifdef CONFIG_AS_AVX512 + &raid6_avx512x1, + &raid6_avx512x2, +#endif #endif #if defined(__x86_64__) && !defined(__arch_um__) &raid6_sse2x1, @@ -59,6 +63,11 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_avx2x2, &raid6_avx2x4, #endif +#ifdef CONFIG_AS_AVX512 + &raid6_avx512x1, + &raid6_avx512x2, + &raid6_avx512x4, +#endif #endif #ifdef CONFIG_ALTIVEC &raid6_altivec1, @@ -69,6 +78,9 @@ const struct raid6_calls * const raid6_algos[] = { #if defined(CONFIG_TILEGX) &raid6_tilegx8, #endif +#if defined(CONFIG_S390) + &raid6_s390vx8, +#endif &raid6_intx1, &raid6_intx2, &raid6_intx4, @@ -89,12 +101,18 @@ void (*raid6_datap_recov)(int, size_t, int, void **); EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { +#ifdef CONFIG_AS_AVX512 + &raid6_recov_avx512, +#endif #ifdef CONFIG_AS_AVX2 &raid6_recov_avx2, #endif #ifdef CONFIG_AS_SSSE3 &raid6_recov_ssse3, #endif +#ifdef CONFIG_S390 + &raid6_recov_s390xc, +#endif &raid6_recov_intx1, NULL }; diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c new file mode 100644 index 000000000000..f524a7972006 --- /dev/null +++ b/lib/raid6/avx512.c @@ -0,0 +1,569 @@ +/* -*- linux-c -*- -------------------------------------------------------- + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved + * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- + */ + +/* + * AVX512 implementation of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_AS_AVX512 + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_avx512_constants { + u64 x1d[8]; +} raid6_avx512_constants __aligned(512) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "prefetchnta %1\n\t" + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %1,%%zmm6" + : + : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm6" + : + : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm4,%1\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4" + : + : "m" (p[d]), "m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm2\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2" + : + : "m" (dptr[z0][d]), "m" (p[d])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : "m" (dptr[z][d])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + /* Don't use movntdq for r/w memory area < cache line */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm2,%1" + : + : "m" (q[d]), "m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x1 = { + raid6_avx5121_gen_syndrome, + raid6_avx5121_xor_syndrome, + raid6_have_avx512, + "avx512x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 AVX512 implementation + */ +static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + /* We uniformly assume a single prefetch covers at least 64 bytes */ + for (d = 0; d < bytes; d += 128) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm4,%2\n\t" + "vmovntdq %%zmm6,%3" + : + : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), + "m" (q[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm2\n\t" + "vmovdqa64 %3,%%zmm3\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (p[d]), "m" (p[d+64])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + "vpxorq %1,%%zmm6,%%zmm6\n\t" + /* Don't use movntdq for r/w + * memory area < cache line + */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm6,%1\n\t" + "vmovdqa64 %%zmm2,%2\n\t" + "vmovdqa64 %%zmm3,%3" + : + : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), + "m" (p[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x2 = { + raid6_avx5122_gen_syndrome, + raid6_avx5122_xor_syndrome, + raid6_have_avx512, + "avx512x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ + "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 256) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "prefetchnta %2\n\t" + "prefetchnta %3\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" + "vmovntdq %%zmm14,%7\n\t" + "vpxorq %%zmm14,%%zmm14,%%zmm14" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + :: "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 256) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm12\n\t" + "vmovdqa64 %3,%%zmm14\n\t" + "vmovdqa64 %4,%%zmm2\n\t" + "vmovdqa64 %5,%%zmm3\n\t" + "vmovdqa64 %6,%%zmm10\n\t" + "vmovdqa64 %7,%%zmm11\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm14,%%zmm11,%%zmm11" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), + "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "prefetchnta %0\n\t" + "prefetchnta %2\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), + "m" (dptr[z][d+192])); + } + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + : + : "m" (q[d]), "m" (q[d+128])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : ); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %4,%%zmm4,%%zmm4\n\t" + "vpxorq %5,%%zmm6,%%zmm6\n\t" + "vpxorq %6,%%zmm12,%%zmm12\n\t" + "vpxorq %7,%%zmm14,%%zmm14\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vmovntdq %%zmm14,%7" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} +const struct raid6_calls raid6_avx512x4 = { + raid6_avx5124_gen_syndrome, + raid6_avx5124_xor_syndrome, + raid6_have_avx512, + "avx512x4", + 1 /* Has cache hints */ +}; +#endif + +#endif /* CONFIG_AS_AVX512 */ diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c new file mode 100644 index 000000000000..625aafa33b61 --- /dev/null +++ b/lib/raid6/recov_avx512.c @@ -0,0 +1,388 @@ +/* + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ + +#ifdef CONFIG_AS_AVX512 + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* zmm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm9\n\t" + "vmovdqa64 %2, %%zmm0\n\t" + "vmovdqa64 %3, %%zmm8\n\t" + "vpxorq %4, %%zmm1, %%zmm1\n\t" + "vpxorq %5, %%zmm9, %%zmm9\n\t" + "vpxorq %6, %%zmm0, %%zmm0\n\t" + "vpxorq %7, %%zmm8, %%zmm8" + : + : "m" (q[0]), "m" (q[64]), "m" (p[0]), + "m" (p[64]), "m" (dq[0]), "m" (dq[64]), + "m" (dp[0]), "m" (dp[64])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[64] ^ q[64] + * 0 = dp[0] ^ p[0] + * 8 = dp[64] ^ p[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpsraw $4, %%zmm9, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm9, %%zmm9\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* + * 5 = qx[0] + * 15 = qx[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpsraw $4, %%zmm8, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm14\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm12, %%zmm13, %%zmm13" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[64]] + */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm15, %%zmm13, %%zmm13" + : + : ); + + /* + * 1 = db = DQ + * 13 = db[64] = DQ[64] + */ + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm13,%1\n\t" + "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vpxorq %%zmm13, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64])); + + asm volatile("vmovdqa64 %%zmm0, %0\n\t" + "vmovdqa64 %%zmm8, %1" + : + : "m" (dp[0]), "m" (dp[64])); + + bytes -= 128; + p += 128; + q += 128; + dp += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm0\n\t" + "vpxorq %2, %%zmm1, %%zmm1\n\t" + "vpxorq %3, %%zmm0, %%zmm0" + : + : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* 5 = qx */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1" + : + : ); + + /* 1 = pbmul[px] */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + /* 1 = db = DQ */ + "vmovdqa64 %%zmm1, %0\n\t" + : + : "m" (dq[0])); + + asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovdqa64 %%zmm0, %0" + : + : "m" (dp[0])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vmovdqa64 %1, %%zmm8\n\t" + "vpxorq %2, %%zmm3, %%zmm3\n\t" + "vpxorq %3, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64]), "m" (q[0]), + "m" (q[64])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[64] ^ dq[64] + */ + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vmovapd %%zmm0, %%zmm13\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vmovapd %%zmm1, %%zmm14" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpsraw $4, %%zmm8, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm8\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm13, %%zmm14, %%zmm14" + : + : ); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[64] ^ dq[64]] + */ + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vmovdqa64 %1, %%zmm12\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t" + "vpxorq %%zmm14, %%zmm12, %%zmm12" + : + : "m" (p[0]), "m" (p[64])); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[64] ^ qmul[q[64] ^ dq[64]] + */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm14, %1\n\t" + "vmovdqa64 %%zmm2, %2\n\t" + "vmovdqa64 %%zmm12,%3" + : + : "m" (dq[0]), "m" (dq[64]), "m" (p[0]), + "m" (p[64])); + + bytes -= 128; + p += 128; + q += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vpxorq %1, %%zmm3, %%zmm3" + : + : "m" (dq[0]), "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1" + : + : ); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2" + : + : "m" (p[0])); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm2, %1" + : + : "m" (dq[0]), "m" (p[0])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx512 = { + .data2 = raid6_2data_recov_avx512, + .datap = raid6_datap_recov_avx512, + .valid = raid6_has_avx512, +#ifdef CONFIG_X86_64 + .name = "avx512x2", +#else + .name = "avx512x1", +#endif + .priority = 3, +}; + +#else +#warning "your version of binutils lacks AVX512 support" +#endif diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c new file mode 100644 index 000000000000..b042dac826cc --- /dev/null +++ b/lib/raid6/recov_s390xc.c @@ -0,0 +1,116 @@ +/* + * RAID-6 data recovery in dual failure mode based on the XC instruction. + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/export.h> +#include <linux/raid/pq.h> + +static inline void xor_block(u8 *p1, u8 *p2) +{ + typedef struct { u8 _[256]; } addrtype; + + asm volatile( + " xc 0(256,%[p1]),0(%[p2])\n" + : "+m" (*(addrtype *) p1) : "m" (*(addrtype *) p2), + [p1] "a" (p1), [p2] "a" (p2) : "cc"); +} + +/* Recover two failed data blocks. */ +static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dp, p); + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = pbmul[dp[i]] ^ qmul[dq[i]]; + xor_block(dp, dq); + p += 256; + q += 256; + dp += 256; + dq += 256; + bytes -= 256; + } +} + +/* Recover failure of one data block plus the P block */ +static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = qmul[dq[i]]; + xor_block(p, dq); + p += 256; + q += 256; + dq += 256; + bytes -= 256; + } +} + + +const struct raid6_recov_calls raid6_recov_s390xc = { + .data2 = raid6_2data_recov_s390xc, + .datap = raid6_datap_recov_s390xc, + .valid = NULL, + .name = "s390xc", + .priority = 1, +}; diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc new file mode 100644 index 000000000000..7b45191a655f --- /dev/null +++ b/lib/raid6/s390vx.uc @@ -0,0 +1,168 @@ +/* + * raid6_vx$#.c + * + * $#-way unrolled RAID6 gen/xor functions for s390 + * based on the vector facility + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * + * This file is postprocessed using unroll.awk. + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +asm(".include \"asm/vx-insn.h\"\n"); + +#define NSIZE 16 + +static inline void LOAD_CONST(void) +{ + asm volatile("VREPIB %v24,7"); + asm volatile("VREPIB %v25,0x1d"); +} + +/* + * The SHLBYTE() operation shifts each of the 16 bytes in + * vector register y left by 1 bit and stores the result in + * vector register x. + */ +static inline void SHLBYTE(int x, int y) +{ + asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y)); +} + +/* + * For each of the 16 bytes in the vector register y the MASK() + * operation returns 0xFF if the high bit of the byte is 1, + * or 0x00 if the high bit is 0. The result is stored in vector + * register x. + */ +static inline void MASK(int x, int y) +{ + asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y)); +} + +static inline void AND(int x, int y, int z) +{ + asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); +} + +static inline void XOR(int x, int y, int z) +{ + asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z)); +} + +static inline void LOAD_DATA(int x, int n, u8 *ptr) +{ + typedef struct { u8 _[16*n]; } addrtype; + register addrtype *__ptr asm("1") = (addrtype *) ptr; + + asm volatile ("VLM %2,%3,0,%r1" + : : "m" (*__ptr), "a" (__ptr), "i" (x), "i" (x + n - 1)); +} + +static inline void STORE_DATA(int x, int n, u8 *ptr) +{ + typedef struct { u8 _[16*n]; } addrtype; + register addrtype *__ptr asm("1") = (addrtype *) ptr; + + asm volatile ("VSTM %2,%3,0,1" + : "=m" (*__ptr) : "a" (__ptr), "i" (x), "i" (x + n - 1)); +} + +static inline void COPY_VEC(int x, int y) +{ + asm volatile ("VLR %0,%1" : : "i" (x), "i" (y)); +} + +static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + struct kernel_fpu vxstate; + u8 **dptr, *p, *q; + int d, z, z0; + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + dptr = (u8 **) ptrs; + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + for (d = 0; d < bytes; d += $#*NSIZE) { + LOAD_DATA(0,$#,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,$#,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + STORE_DATA(0,$#,&p[d]); + STORE_DATA(8,$#,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + struct kernel_fpu vxstate; + u8 **dptr, *p, *q; + int d, z, z0; + + dptr = (u8 **) ptrs; + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + for (d = 0; d < bytes; d += $#*NSIZE) { + /* P/Q data pages */ + LOAD_DATA(0,$#,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= start; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,$#,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + } + LOAD_DATA(16,$#,&p[d]); + XOR(16+$$,16+$$,0+$$); + STORE_DATA(16,$#,&p[d]); + LOAD_DATA(16,$#,&q[d]); + XOR(16+$$,16+$$,8+$$); + STORE_DATA(16,$#,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static int raid6_s390vx$#_valid(void) +{ + return MACHINE_HAS_VX; +} + +const struct raid6_calls raid6_s390vx$# = { + raid6_s390vx$#_gen_syndrome, + raid6_s390vx$#_xor_syndrome, + raid6_s390vx$#_valid, + "vx128x$#", + 1 +}; diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 29090f3db677..2c7b60edea04 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -32,10 +32,13 @@ ifeq ($(ARCH),arm64) endif ifeq ($(IS_X86),yes) - OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o + OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ gcc -c -x assembler - >&/dev/null && \ rm ./-.o && echo -DCONFIG_AS_AVX2=1) + CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo -DCONFIG_AS_AVX512=1) else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 3bebbabdb510..b07f4d8e6b03 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -21,12 +21,13 @@ #define NDISKS 16 /* Including P and Q */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); struct raid6_calls raid6_call; char *dataptrs[NDISKS]; -char data[NDISKS][PAGE_SIZE]; -char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; +char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); static void makedata(int start, int stop) { diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index 8fe9d9662abb..834d268a4b05 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h @@ -46,6 +46,16 @@ static inline void kernel_fpu_end(void) #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 DQ (Double/Quad granular) + * Instructions + */ +#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 BW (Byte/Word granular) + * Instructions + */ +#define X86_FEATURE_AVX512VL (9*32+31) /* AVX-512 VL (128/256 Vector Length) + * Extensions + */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ /* Should work well enough on modern CPUs for testing */ diff --git a/lib/random32.c b/lib/random32.c index 69ed593aab07..fa594b1140e6 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -47,7 +47,7 @@ static inline void prandom_state_selftest(void) } #endif -static DEFINE_PER_CPU(struct rnd_state, net_rand_state); +static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; /** * prandom_u32_state - seeded pseudo-random number generator. @@ -81,7 +81,7 @@ u32 prandom_u32(void) u32 res; res = prandom_u32_state(state); - put_cpu_var(state); + put_cpu_var(net_rand_state); return res; } @@ -128,7 +128,7 @@ void prandom_bytes(void *buf, size_t bytes) struct rnd_state *state = &get_cpu_var(net_rand_state); prandom_bytes_state(state, buf, bytes); - put_cpu_var(state); + put_cpu_var(net_rand_state); } EXPORT_SYMBOL(prandom_bytes); diff --git a/lib/rbtree.c b/lib/rbtree.c index eb8a19fee110..1f8b112a7c35 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * * (p) (p) * / \ / \ - * N S --> N Sl + * N S --> N sl * / \ \ - * sl Sr s + * sl Sr S * \ * Sr + * + * Note: p might be red, and then both + * p and sl are red after rotation(which + * breaks property 4). This is fixed in + * Case 4 (in __rb_rotate_set_parents() + * which set sl the color of p + * and set p RB_BLACK) + * + * (p) (sl) + * / \ / \ + * N sl --> P S + * \ / \ + * S N Sr + * \ + * Sr */ tmp1 = tmp2->rb_right; WRITE_ONCE(sibling->rb_left, tmp1); @@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, } break; } - /* Case 3 - right rotate at sibling */ + /* Case 3 - left rotate at sibling */ tmp1 = tmp2->rb_left; WRITE_ONCE(sibling->rb_right, tmp1); WRITE_ONCE(tmp2->rb_left, sibling); @@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, tmp1 = sibling; sibling = tmp2; } - /* Case 4 - left rotate at parent + color flips */ + /* Case 4 - right rotate at parent + color flips */ tmp2 = sibling->rb_right; WRITE_ONCE(parent->rb_left, tmp2); WRITE_ONCE(sibling->rb_right, parent); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 56054e541a0f..32d0ad058380 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -378,22 +378,8 @@ static void rht_deferred_worker(struct work_struct *work) schedule_work(&ht->run_work); } -static bool rhashtable_check_elasticity(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - unsigned int elasticity = ht->elasticity; - struct rhash_head *head; - - rht_for_each(head, tbl, hash) - if (!--elasticity) - return true; - - return false; -} - -int rhashtable_insert_rehash(struct rhashtable *ht, - struct bucket_table *tbl) +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; @@ -439,61 +425,172 @@ fail: return err; } -EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) { + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev; struct rhash_head *head; - unsigned int hash; - int err; + int elasticity; - tbl = rhashtable_last_table(ht, tbl); - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + elasticity = ht->elasticity; + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; - err = -EEXIST; - if (key && rhashtable_lookup_fast(ht, key, ht->p)) - goto exit; + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) + continue; - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto exit; + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash, + struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); - err = -EAGAIN; - if (rhashtable_check_elasticity(ht, tbl, hash) || - rht_grow_above_100(ht, tbl)) - goto exit; + new_tbl = rcu_dereference(tbl->future_tbl); + if (new_tbl) + return new_tbl; - err = 0; + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } rcu_assign_pointer(tbl->buckets[hash], obj); atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); -exit: - spin_unlock(rht_bucket_lock(tbl, hash)); + return NULL; +} - if (err == 0) - return NULL; - else if (err == -EAGAIN) - return tbl; - else - return ERR_PTR(err); +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int hash; + spinlock_t *lock; + void *data; + + tbl = rcu_dereference(ht->tbl); + + /* All insertions must grab the oldest table containing + * the hashed bucket that is yet to be rehashed. + */ + for (;;) { + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); + + if (tbl->rehash <= hash) + break; + + spin_unlock_bh(lock); + tbl = rcu_dereference(tbl->future_tbl); + } + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + while (!IS_ERR_OR_NULL(new_tbl)) { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + spin_lock_nested(rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + spin_unlock(rht_bucket_lock(tbl, hash)); + } + + spin_unlock_bh(lock); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** - * rhashtable_walk_init - Initialise an iterator + * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator - * @gfp: GFP flags for allocations * * This function prepares a hash table walk. * @@ -508,30 +605,22 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow); * This function may sleep so you must not call it from interrupt * context or with spin locks held. * - * You must call rhashtable_walk_exit if this function returns - * successfully. + * You must call rhashtable_walk_exit after this function returns. */ -int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, - gfp_t gfp) +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; - iter->walker = kmalloc(sizeof(*iter->walker), gfp); - if (!iter->walker) - return -ENOMEM; - spin_lock(&ht->lock); - iter->walker->tbl = + iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); - list_add(&iter->walker->list, &iter->walker->tbl->walkers); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); - - return 0; } -EXPORT_SYMBOL_GPL(rhashtable_walk_init); +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator @@ -542,10 +631,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); - kfree(iter->walker); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); @@ -571,12 +659,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter) rcu_read_lock(); spin_lock(&ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&ht->lock); - if (!iter->walker->tbl) { - iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); return -EAGAIN; } @@ -598,12 +686,17 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; if (p) { - p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } goto next; } @@ -611,6 +704,18 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter) int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } if (!skip) break; skip--; @@ -620,7 +725,8 @@ next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; - return rht_obj(ht, p); + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; @@ -631,8 +737,8 @@ next: /* Ensure we see any new tables. */ smp_rmb(); - iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (iter->walker->tbl) { + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); @@ -652,7 +758,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { struct rhashtable *ht; - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; @@ -661,9 +767,9 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) spin_lock(&ht->lock); if (tbl->rehash < tbl->size) - list_add(&iter->walker->list, &tbl->walkers); + list_add(&iter->walker.list, &tbl->walkers); else - iter->walker->tbl = NULL; + iter->walker.tbl = NULL; spin_unlock(&ht->lock); iter->p = NULL; @@ -809,6 +915,48 @@ int rhashtable_init(struct rhashtable *ht, EXPORT_SYMBOL_GPL(rhashtable_init); /** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) +{ + int err; + + /* No rhlist NULLs marking for now. */ + if (params->nulls_base) + return -EINVAL; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + +/** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy * @free_fn: callback to release resources of element @@ -845,7 +993,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) - free_fn(rht_obj(ht, pos), arg); + rhashtable_free_one(ht, pos, free_fn, arg); } } diff --git a/lib/sbitmap.c b/lib/sbitmap.c new file mode 100644 index 000000000000..2cecf05c82fd --- /dev/null +++ b/lib/sbitmap.c @@ -0,0 +1,347 @@ +/* + * Copyright (C) 2016 Facebook + * Copyright (C) 2013-2014 Jens Axboe + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include <linux/random.h> +#include <linux/sbitmap.h> + +int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, + gfp_t flags, int node) +{ + unsigned int bits_per_word; + unsigned int i; + + if (shift < 0) { + shift = ilog2(BITS_PER_LONG); + /* + * If the bitmap is small, shrink the number of bits per word so + * we spread over a few cachelines, at least. If less than 4 + * bits, just forget about it, it's not going to work optimally + * anyway. + */ + if (depth >= 4) { + while ((4U << shift) > depth) + shift--; + } + } + bits_per_word = 1U << shift; + if (bits_per_word > BITS_PER_LONG) + return -EINVAL; + + sb->shift = shift; + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + if (depth == 0) { + sb->map = NULL; + return 0; + } + + sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); + if (!sb->map) + return -ENOMEM; + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_init_node); + +void sbitmap_resize(struct sbitmap *sb, unsigned int depth) +{ + unsigned int bits_per_word = 1U << sb->shift; + unsigned int i; + + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } +} +EXPORT_SYMBOL_GPL(sbitmap_resize); + +static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, + bool wrap) +{ + unsigned int orig_hint = hint; + int nr; + + while (1) { + nr = find_next_zero_bit(&word->word, word->depth, hint); + if (unlikely(nr >= word->depth)) { + /* + * We started with an offset, and we didn't reset the + * offset to 0 in a failure case, so start from 0 to + * exhaust the map. + */ + if (orig_hint && hint && wrap) { + hint = orig_hint = 0; + continue; + } + return -1; + } + + if (!test_and_set_bit(nr, &word->word)) + break; + + hint = nr + 1; + if (hint >= word->depth - 1) + hint = 0; + } + + return nr; +} + +int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) +{ + unsigned int i, index; + int nr = -1; + + index = SB_NR_TO_INDEX(sb, alloc_hint); + + for (i = 0; i < sb->map_nr; i++) { + nr = __sbitmap_get_word(&sb->map[index], + SB_NR_TO_BIT(sb, alloc_hint), + !round_robin); + if (nr != -1) { + nr += index << sb->shift; + break; + } + + /* Jump to next index. */ + index++; + alloc_hint = index << sb->shift; + + if (index >= sb->map_nr) { + index = 0; + alloc_hint = 0; + } + } + + return nr; +} +EXPORT_SYMBOL_GPL(sbitmap_get); + +bool sbitmap_any_bit_set(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + if (sb->map[i].word) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); + +bool sbitmap_any_bit_clear(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + unsigned long ret; + + ret = find_first_zero_bit(&word->word, word->depth); + if (ret < word->depth) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); + +unsigned int sbitmap_weight(const struct sbitmap *sb) +{ + unsigned int i, weight = 0; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + + weight += bitmap_weight(&word->word, word->depth); + } + return weight; +} +EXPORT_SYMBOL_GPL(sbitmap_weight); + +static unsigned int sbq_calc_wake_batch(unsigned int depth) +{ + unsigned int wake_batch; + + /* + * For each batch, we wake up one queue. We need to make sure that our + * batch size is small enough that the full depth of the bitmap is + * enough to wake up all of the queues. + */ + wake_batch = SBQ_WAKE_BATCH; + if (wake_batch > depth / SBQ_WAIT_QUEUES) + wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); + + return wake_batch; +} + +int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, + int shift, bool round_robin, gfp_t flags, int node) +{ + int ret; + int i; + + ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node); + if (ret) + return ret; + + sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags); + if (!sbq->alloc_hint) { + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + if (depth && !round_robin) { + for_each_possible_cpu(i) + *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth; + } + + sbq->wake_batch = sbq_calc_wake_batch(depth); + atomic_set(&sbq->wake_index, 0); + + sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); + if (!sbq->ws) { + free_percpu(sbq->alloc_hint); + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + init_waitqueue_head(&sbq->ws[i].wait); + atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); + } + + sbq->round_robin = round_robin; + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); + +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) +{ + sbq->wake_batch = sbq_calc_wake_batch(depth); + sbitmap_resize(&sbq->sb, depth); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_resize); + +int __sbitmap_queue_get(struct sbitmap_queue *sbq) +{ + unsigned int hint, depth; + int nr; + + hint = this_cpu_read(*sbq->alloc_hint); + depth = READ_ONCE(sbq->sb.depth); + if (unlikely(hint >= depth)) { + hint = depth ? prandom_u32() % depth : 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin); + + if (nr == -1) { + /* If the map is full, a hint won't do us much good. */ + this_cpu_write(*sbq->alloc_hint, 0); + } else if (nr == hint || unlikely(sbq->round_robin)) { + /* Only update the hint if we used it. */ + hint = nr + 1; + if (hint >= depth - 1) + hint = 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + + return nr; +} +EXPORT_SYMBOL_GPL(__sbitmap_queue_get); + +static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) { + int o = atomic_read(&sbq->wake_index); + + if (wake_index != o) + atomic_cmpxchg(&sbq->wake_index, o, wake_index); + return ws; + } + + wake_index = sbq_index_inc(wake_index); + } + + return NULL; +} + +static void sbq_wake_up(struct sbitmap_queue *sbq) +{ + struct sbq_wait_state *ws; + int wait_cnt; + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); + + ws = sbq_wake_ptr(sbq); + if (!ws) + return; + + wait_cnt = atomic_dec_return(&ws->wait_cnt); + if (unlikely(wait_cnt < 0)) + wait_cnt = atomic_inc_return(&ws->wait_cnt); + if (wait_cnt == 0) { + atomic_add(sbq->wake_batch, &ws->wait_cnt); + sbq_index_atomic_inc(&sbq->wake_index); + wake_up(&ws->wait); + } +} + +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, + unsigned int cpu) +{ + sbitmap_clear_bit(&sbq->sb, nr); + sbq_wake_up(sbq); + if (likely(!sbq->round_robin && nr < sbq->sb.depth)) + *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_clear); + +void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + /* + * Make sure all changes prior to this are visible from other CPUs. + */ + smp_mb(); + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) + wake_up(&ws->wait); + + wake_index = sbq_index_inc(wake_index); + } +} +EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 60f77f1d470a..f87d138e9672 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -50,7 +50,7 @@ STACK_ALLOC_ALIGN) #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS) -#define STACK_ALLOC_SLABS_CAP 1024 +#define STACK_ALLOC_SLABS_CAP 8192 #define STACK_ALLOC_MAX_SLABS \ (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) @@ -192,6 +192,7 @@ void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace) trace->entries = stack->entries; trace->skip = 0; } +EXPORT_SYMBOL_GPL(depot_fetch_stack); /** * depot_save_stack - save stack in a stack depot. @@ -283,3 +284,4 @@ exit: fast_exit: return retval; } +EXPORT_SYMBOL_GPL(depot_save_stack); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 9c5fe8110413..7e35fc450c5b 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,6 +1,7 @@ #include <linux/compiler.h> #include <linux/export.h> #include <linux/kasan-checks.h> +#include <linux/thread_info.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -111,6 +112,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) long retval; kasan_check_write(dst, count); + check_object_size(dst, count, false); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); diff --git a/lib/syscall.c b/lib/syscall.c index e30e03932480..63239e097b13 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -7,9 +7,19 @@ static int collect_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc) { - struct pt_regs *regs = task_pt_regs(target); - if (unlikely(!regs)) + struct pt_regs *regs; + + if (!try_get_task_stack(target)) { + /* Task has no stack, so the task isn't in a syscall. */ + *callno = -1; + return 0; + } + + regs = task_pt_regs(target); + if (unlikely(!regs)) { + put_task_stack(target); return -EAGAIN; + } *sp = user_stack_pointer(regs); *pc = instruction_pointer(regs); @@ -18,6 +28,7 @@ static int collect_syscall(struct task_struct *target, long *callno, if (*callno != -1L && maxargs > 0) syscall_get_arguments(target, regs, 0, maxargs, args); + put_task_stack(target); return 0; } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 93f45011a59d..0362da0b66c3 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4831,7 +4831,7 @@ static struct bpf_test tests[] = { { }, INTERNAL, { 0x34 }, - { { 1, 0xbef } }, + { { ETH_HLEN, 0xbef } }, .fill_helper = bpf_fill_ld_abs_vlan_push_pop, }, /* @@ -5485,6 +5485,7 @@ static struct sk_buff *populate_skb(char *buf, int size) skb->hash = SKB_HASH; skb->queue_mapping = SKB_QUEUE_MAP; skb->vlan_tci = SKB_VLAN_TCI; + skb->vlan_proto = htons(ETH_P_IP); skb->dev = &dev; skb->dev->ifindex = SKB_DEV_IFINDEX; skb->dev->type = SKB_DEV_TYPE; diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 5e51872b3fc1..fbdf87920093 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -20,6 +20,11 @@ #include <linux/uaccess.h> #include <linux/module.h> +/* + * Note: test functions are marked noinline so that their names appear in + * reports. + */ + static noinline void __init kmalloc_oob_right(void) { char *ptr; @@ -411,6 +416,29 @@ static noinline void __init copy_user_test(void) kfree(kmem); } +static noinline void __init use_after_scope_test(void) +{ + volatile char *volatile p; + + pr_info("use-after-scope on int\n"); + { + int local = 0; + + p = (char *)&local; + } + p[0] = 1; + p[3] = 1; + + pr_info("use-after-scope on array\n"); + { + char local[1024] = {0}; + + p = local; + } + p[0] = 1; + p[1023] = 1; +} + static int __init kmalloc_tests_init(void) { kmalloc_oob_right(); @@ -436,6 +464,7 @@ static int __init kmalloc_tests_init(void) kasan_global_oob(); ksize_unpoisons_memory(); copy_user_test(); + use_after_scope_test(); return -EAGAIN; } diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c index f0b323abb4c6..ae8d2491133c 100644 --- a/lib/ucs2_string.c +++ b/lib/ucs2_string.c @@ -56,7 +56,7 @@ ucs2_utf8size(const ucs2_char_t *src) unsigned long i; unsigned long j = 0; - for (i = 0; i < ucs2_strlen(src); i++) { + for (i = 0; src[i]; i++) { u16 c = src[i]; if (c >= 0x800) diff --git a/lib/win_minmax.c b/lib/win_minmax.c new file mode 100644 index 000000000000..c8420d404926 --- /dev/null +++ b/lib/win_minmax.c @@ -0,0 +1,98 @@ +/** + * lib/minmax.c: windowed min/max tracker + * + * Kathleen Nichols' algorithm for tracking the minimum (or maximum) + * value of a data stream over some fixed time interval. (E.g., + * the minimum RTT over the past five minutes.) It uses constant + * space and constant time per update yet almost always delivers + * the same minimum as an implementation that has to keep all the + * data in the window. + * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of + * the n'th best >= n-1'th best. It also makes sure that the three + * values are widely separated in the time window since that bounds + * the worse case error when that data is monotonically increasing + * over the window. + * + * Upon getting a new min, we can forget everything earlier because + * it has no value - the new min is <= everything else in the window + * by definition and it's the most recent. So we restart fresh on + * every new min and overwrites 2nd & 3rd choices. The same property + * holds for 2nd & 3rd best. + */ +#include <linux/module.h> +#include <linux/win_minmax.h> + +/* As time advances, update the 1st, 2nd, and 3rd choices. */ +static u32 minmax_subwin_update(struct minmax *m, u32 win, + const struct minmax_sample *val) +{ + u32 dt = val->t - m->s[0].t; + + if (unlikely(dt > win)) { + /* + * Passed entire window without a new val so make 2nd + * choice the new val & 3rd choice the new 2nd choice. + * we may have to iterate this since our 2nd choice + * may also be outside the window (we checked on entry + * that the third choice was in the window). + */ + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + if (unlikely(val->t - m->s[0].t > win)) { + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + } + } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { + /* + * We've passed a quarter of the window without a new val + * so take a 2nd choice from the 2nd quarter of the window. + */ + m->s[2] = m->s[1] = *val; + } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { + /* + * We've passed half the window without finding a new val + * so take a 3rd choice from the last half of the window + */ + m->s[2] = *val; + } + return m->s[0].v; +} + +/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v >= m->s[0].v) || /* found new max? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v >= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v >= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} +EXPORT_SYMBOL(minmax_running_max); + +/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v <= m->s[0].v) || /* found new min? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v <= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v <= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} |