Diffstat (limited to 'lib')
36 files changed, 2897 insertions, 493 deletions
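A note for orientation before the diff body: the first hunks below wire the bcache closure library into lib/ (CONFIG_CLOSURES, DEBUG_CLOSURES and the new lib/closure.c). As a minimal sketch of the usage pattern that code supports — assuming the helpers closure_init_stack(), closure_get() and closure_sync() from include/linux/closure.h, a header this diff does not show:

#include <linux/closure.h>

static void submit_and_wait(void)
{
	struct closure cl;

	closure_init_stack(&cl);	/* refcount starts at 1 */

	/*
	 * Take one ref per outstanding async operation; each completion
	 * path eventually drops it with closure_put(&cl).
	 */
	closure_get(&cl);
	/* ... kick off the asynchronous work here ... */

	/* Sleep until every closure_put() has happened. */
	closure_sync(&cl);
}

This is the pattern the DEBUG_CLOSURES option below makes observable: a closure whose final closure_put() never arrives stays visible on the debugfs list of active closures.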
diff --git a/lib/Kconfig b/lib/Kconfig index c686f4adc124..2d90935d5a21 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -506,6 +506,9 @@ config ASSOCIATIVE_ARRAY for more information. +config CLOSURES + bool + config HAS_IOMEM bool depends on !NO_IOMEM @@ -729,6 +732,11 @@ config PARMAN config OBJAGG tristate "objagg" if COMPILE_TEST +config LWQ_TEST + bool "Boot-time test for lwq queuing" + help + Run boot-time test of light-weight queuing. + endmenu config GENERIC_IOREMAP diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index fa307f93fa2e..cc7d53d9dc01 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -759,7 +759,7 @@ config SHRINKER_DEBUG config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" - depends on DEBUG_KERNEL && !IA64 + depends on DEBUG_KERNEL help Enables the display of the minimum amount of free stack which each task has ever had available in the sysrq-T and sysrq-P debug output. @@ -1720,6 +1720,15 @@ config DEBUG_NOTIFIERS This is a relatively cheap check but if you care about maximum performance, say N. +config DEBUG_CLOSURES + bool "Debug closures (bcache async widgets)" + depends on CLOSURES + select DEBUG_FS + help + Keeps all active closures in a linked list and provides a debugfs + interface to list them, which makes it possible to see asynchronous + operations that get stuck. + config DEBUG_MAPLE_TREE bool "Debug maple trees" depends on DEBUG_KERNEL @@ -2945,6 +2954,17 @@ config TEST_CLOCKSOURCE_WATCHDOG If unsure, say N. +config TEST_OBJPOOL + tristate "Test module for correctness and stress of objpool" + default n + depends on m && DEBUG_KERNEL + help + This builds the "test_objpool" module that should be used for + correctness verification and concurrent testing of object + allocation and reclamation. + + If unsure, say N. + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST diff --git a/lib/Makefile b/lib/Makefile index 740109b6e2c8..9e0972f7b764 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -34,7 +34,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ - buildid.o + buildid.o objpool.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_SMP) += cpumask.o @@ -45,7 +45,7 @@ obj-y += lockref.o obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ list_sort.o uuid.o iov_iter.o clz_ctz.o \ - bsearch.o find_bit.o llist.o memweight.o kfifo.o \ + bsearch.o find_bit.o llist.o lwq.o memweight.o kfifo.o \ percpu-refcount.o rhashtable.o base64.o \ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ generic-radix-tree.o @@ -107,6 +107,8 @@ obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE) obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o +obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o + # # CFLAGS for compiling floating point code inside the kernel.
x86/Makefile turns # off the generation of FPU/SSE* instructions for kernel proper but FPU_FLAGS @@ -255,6 +257,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o +obj-$(CONFIG_CLOSURES) += closure.o + obj-$(CONFIG_DQL) += dynamic_queue_limits.o obj-$(CONFIG_GLOB) += glob.o diff --git a/lib/closure.c b/lib/closure.c new file mode 100644 index 000000000000..0855e698ced1 --- /dev/null +++ b/lib/closure.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Asynchronous refcounty things + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include <linux/closure.h> +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/sched/debug.h> + +static inline void closure_put_after_sub(struct closure *cl, int flags) +{ + int r = flags & CLOSURE_REMAINING_MASK; + + BUG_ON(flags & CLOSURE_GUARD_MASK); + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); + + if (!r) { + if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { + atomic_set(&cl->remaining, + CLOSURE_REMAINING_INITIALIZER); + closure_queue(cl); + } else { + struct closure *parent = cl->parent; + closure_fn *destructor = cl->fn; + + closure_debug_destroy(cl); + + if (destructor) + destructor(cl); + + if (parent) + closure_put(parent); + } + } +} + +/* For clearing flags with the same atomic op as a put */ +void closure_sub(struct closure *cl, int v) +{ + closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); +} +EXPORT_SYMBOL(closure_sub); + +/* + * closure_put - decrement a closure's refcount + */ +void closure_put(struct closure *cl) +{ + closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); +} +EXPORT_SYMBOL(closure_put); + +/* + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ +void __closure_wake_up(struct closure_waitlist *wait_list) +{ + struct llist_node *list; + struct closure *cl, *t; + struct llist_node *reverse = NULL; + + list = llist_del_all(&wait_list->list); + + /* We first reverse the list to preserve FIFO ordering and fairness */ + reverse = llist_reverse_order(list); + + /* Then do the wakeups */ + llist_for_each_entry_safe(cl, t, reverse, list) { + closure_set_waiting(cl, 0); + closure_sub(cl, CLOSURE_WAITING + 1); + } +} +EXPORT_SYMBOL(__closure_wake_up); + +/** + * closure_wait - add a closure to a waitlist + * @waitlist: will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * @cl: closure pointer. 
+ * + */ +bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +{ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + return false; + + closure_set_waiting(cl, _RET_IP_); + atomic_add(CLOSURE_WAITING + 1, &cl->remaining); + llist_add(&cl->list, &waitlist->list); + + return true; +} +EXPORT_SYMBOL(closure_wait); + +struct closure_syncer { + struct task_struct *task; + int done; +}; + +static void closure_sync_fn(struct closure *cl) +{ + struct closure_syncer *s = cl->s; + struct task_struct *p; + + rcu_read_lock(); + p = READ_ONCE(s->task); + s->done = 1; + wake_up_process(p); + rcu_read_unlock(); +} + +void __sched __closure_sync(struct closure *cl) +{ + struct closure_syncer s = { .task = current }; + + cl->s = &s; + continue_at(cl, closure_sync_fn, NULL); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (s.done) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL(__closure_sync); + +#ifdef CONFIG_DEBUG_CLOSURES + +static LIST_HEAD(closure_list); +static DEFINE_SPINLOCK(closure_list_lock); + +void closure_debug_create(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_ALIVE; + + spin_lock_irqsave(&closure_list_lock, flags); + list_add(&cl->all, &closure_list); + spin_unlock_irqrestore(&closure_list_lock, flags); +} +EXPORT_SYMBOL(closure_debug_create); + +void closure_debug_destroy(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_DEAD; + + spin_lock_irqsave(&closure_list_lock, flags); + list_del(&cl->all); + spin_unlock_irqrestore(&closure_list_lock, flags); +} +EXPORT_SYMBOL(closure_debug_destroy); + +static int debug_show(struct seq_file *f, void *data) +{ + struct closure *cl; + + spin_lock_irq(&closure_list_lock); + + list_for_each_entry(cl, &closure_list, all) { + int r = atomic_read(&cl->remaining); + + seq_printf(f, "%p: %pS -> %pS p %p r %i ", + cl, (void *) cl->ip, cl->fn, cl->parent, + r & CLOSURE_REMAINING_MASK); + + seq_printf(f, "%s%s\n", + test_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(&cl->work)) ? "Q" : "", + r & CLOSURE_RUNNING ? "R" : ""); + + if (r & CLOSURE_WAITING) + seq_printf(f, " W %pS\n", + (void *) cl->waiting_on); + + seq_puts(f, "\n"); + } + + spin_unlock_irq(&closure_list_lock); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(debug); + +static int __init closure_debug_init(void) +{ + debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops); + return 0; +} +late_initcall(closure_debug_init) + +#endif diff --git a/lib/cpumask.c b/lib/cpumask.c index a7fd02b5ae26..34335c1e7265 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -146,9 +146,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node) /* Wrap: we always want a cpu. */ i %= num_online_cpus(); - cpu = (node == NUMA_NO_NODE) ? 
- cpumask_nth(i, cpu_online_mask) : - sched_numa_find_nth_cpu(cpu_online_mask, i, node); + cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node); WARN_ON(cpu >= nr_cpu_ids); return cpu; diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index 353268b9f129..842894158944 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -133,9 +133,6 @@ #ifdef CONFIG_ARM # define XZ_DEC_ARM #endif -#ifdef CONFIG_IA64 -# define XZ_DEC_IA64 -#endif #ifdef CONFIG_SPARC # define XZ_DEC_SPARC #endif diff --git a/lib/errname.c b/lib/errname.c index 67739b174a8c..dd1b998552cd 100644 --- a/lib/errname.c +++ b/lib/errname.c @@ -228,3 +228,4 @@ const char *errname(int err) return err > 0 ? name + 1 : name; } +EXPORT_SYMBOL(errname); diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index f25eb111c051..41f1bcdc4488 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -1,4 +1,5 @@ +#include <linux/atomic.h> #include <linux/export.h> #include <linux/generic-radix-tree.h> #include <linux/gfp.h> @@ -166,6 +167,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter, struct genradix_root *r; struct genradix_node *n; unsigned level, i; + + if (iter->offset == SIZE_MAX) + return NULL; + restart: r = READ_ONCE(radix->root); if (!r) @@ -184,10 +189,17 @@ restart: (GENRADIX_ARY - 1); while (!n->children[i]) { + size_t objs_per_ptr = genradix_depth_size(level); + + if (iter->offset + objs_per_ptr < iter->offset) { + iter->offset = SIZE_MAX; + iter->pos = SIZE_MAX; + return NULL; + } + i++; - iter->offset = round_down(iter->offset + - genradix_depth_size(level), - genradix_depth_size(level)); + iter->offset = round_down(iter->offset + objs_per_ptr, + objs_per_ptr); iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) @@ -201,6 +213,64 @@ restart: } EXPORT_SYMBOL(__genradix_iter_peek); +void *__genradix_iter_peek_prev(struct genradix_iter *iter, + struct __genradix *radix, + size_t objs_per_page, + size_t obj_size_plus_page_remainder) +{ + struct genradix_root *r; + struct genradix_node *n; + unsigned level, i; + + if (iter->offset == SIZE_MAX) + return NULL; + +restart: + r = READ_ONCE(radix->root); + if (!r) + return NULL; + + n = genradix_root_to_node(r); + level = genradix_root_to_depth(r); + + if (ilog2(iter->offset) >= genradix_depth_shift(level)) { + iter->offset = genradix_depth_size(level); + iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + + iter->offset -= obj_size_plus_page_remainder; + iter->pos--; + } + + while (level) { + level--; + + i = (iter->offset >> genradix_depth_shift(level)) & + (GENRADIX_ARY - 1); + + while (!n->children[i]) { + size_t objs_per_ptr = genradix_depth_size(level); + + iter->offset = round_down(iter->offset, objs_per_ptr); + iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + + if (!iter->offset) + return NULL; + + iter->offset -= obj_size_plus_page_remainder; + iter->pos--; + + if (!i) + goto restart; + --i; + } + + n = n->children[i]; + } + + return &n->data[iter->offset & (PAGE_SIZE - 1)]; +} +EXPORT_SYMBOL(__genradix_iter_peek_prev); + static void genradix_free_recurse(struct genradix_node *n, unsigned level) { if (level) { diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 27234a820eeb..de7d11cf4c63 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <crypto/hash.h> #include <linux/export.h> #include <linux/bvec.h> #include <linux/fault-inject-usercopy.h> @@ -10,192 +9,71 @@ #include <linux/vmalloc.h> #include 
<linux/splice.h> #include <linux/compat.h> -#include <net/checksum.h> #include <linux/scatterlist.h> #include <linux/instrumented.h> +#include <linux/iov_iter.h> -/* covers ubuf and kbuf alike */ -#define iterate_buf(i, n, base, len, off, __p, STEP) { \ - size_t __maybe_unused off = 0; \ - len = n; \ - base = __p + i->iov_offset; \ - len -= (STEP); \ - i->iov_offset += len; \ - n = len; \ -} - -/* covers iovec and kvec alike */ -#define iterate_iovec(i, n, base, len, off, __p, STEP) { \ - size_t off = 0; \ - size_t skip = i->iov_offset; \ - do { \ - len = min(n, __p->iov_len - skip); \ - if (likely(len)) { \ - base = __p->iov_base + skip; \ - len -= (STEP); \ - off += len; \ - skip += len; \ - n -= len; \ - if (skip < __p->iov_len) \ - break; \ - } \ - __p++; \ - skip = 0; \ - } while (n); \ - i->iov_offset = skip; \ - n = off; \ -} - -#define iterate_bvec(i, n, base, len, off, p, STEP) { \ - size_t off = 0; \ - unsigned skip = i->iov_offset; \ - while (n) { \ - unsigned offset = p->bv_offset + skip; \ - unsigned left; \ - void *kaddr = kmap_local_page(p->bv_page + \ - offset / PAGE_SIZE); \ - base = kaddr + offset % PAGE_SIZE; \ - len = min(min(n, (size_t)(p->bv_len - skip)), \ - (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \ - left = (STEP); \ - kunmap_local(kaddr); \ - len -= left; \ - off += len; \ - skip += len; \ - if (skip == p->bv_len) { \ - skip = 0; \ - p++; \ - } \ - n -= len; \ - if (left) \ - break; \ - } \ - i->iov_offset = skip; \ - n = off; \ -} - -#define iterate_xarray(i, n, base, len, __off, STEP) { \ - __label__ __out; \ - size_t __off = 0; \ - struct folio *folio; \ - loff_t start = i->xarray_start + i->iov_offset; \ - pgoff_t index = start / PAGE_SIZE; \ - XA_STATE(xas, i->xarray, index); \ - \ - len = PAGE_SIZE - offset_in_page(start); \ - rcu_read_lock(); \ - xas_for_each(&xas, folio, ULONG_MAX) { \ - unsigned left; \ - size_t offset; \ - if (xas_retry(&xas, folio)) \ - continue; \ - if (WARN_ON(xa_is_value(folio))) \ - break; \ - if (WARN_ON(folio_test_hugetlb(folio))) \ - break; \ - offset = offset_in_folio(folio, start + __off); \ - while (offset < folio_size(folio)) { \ - base = kmap_local_folio(folio, offset); \ - len = min(n, len); \ - left = (STEP); \ - kunmap_local(base); \ - len -= left; \ - __off += len; \ - n -= len; \ - if (left || n == 0) \ - goto __out; \ - offset += len; \ - len = PAGE_SIZE; \ - } \ - } \ -__out: \ - rcu_read_unlock(); \ - i->iov_offset += __off; \ - n = __off; \ -} - -#define __iterate_and_advance(i, n, base, len, off, I, K) { \ - if (unlikely(i->count < n)) \ - n = i->count; \ - if (likely(n)) { \ - if (likely(iter_is_ubuf(i))) { \ - void __user *base; \ - size_t len; \ - iterate_buf(i, n, base, len, off, \ - i->ubuf, (I)) \ - } else if (likely(iter_is_iovec(i))) { \ - const struct iovec *iov = iter_iov(i); \ - void __user *base; \ - size_t len; \ - iterate_iovec(i, n, base, len, off, \ - iov, (I)) \ - i->nr_segs -= iov - iter_iov(i); \ - i->__iov = iov; \ - } else if (iov_iter_is_bvec(i)) { \ - const struct bio_vec *bvec = i->bvec; \ - void *base; \ - size_t len; \ - iterate_bvec(i, n, base, len, off, \ - bvec, (K)) \ - i->nr_segs -= bvec - i->bvec; \ - i->bvec = bvec; \ - } else if (iov_iter_is_kvec(i)) { \ - const struct kvec *kvec = i->kvec; \ - void *base; \ - size_t len; \ - iterate_iovec(i, n, base, len, off, \ - kvec, (K)) \ - i->nr_segs -= kvec - i->kvec; \ - i->kvec = kvec; \ - } else if (iov_iter_is_xarray(i)) { \ - void *base; \ - size_t len; \ - iterate_xarray(i, n, base, len, off, \ - (K)) \ - } \ - i->count -= n; \ - } 
\ -} -#define iterate_and_advance(i, n, base, len, off, I, K) \ - __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0)) - -static int copyout(void __user *to, const void *from, size_t n) +static __always_inline +size_t copy_to_user_iter(void __user *iter_to, size_t progress, + size_t len, void *from, void *priv2) { if (should_fail_usercopy()) - return n; - if (access_ok(to, n)) { - instrument_copy_to_user(to, from, n); - n = raw_copy_to_user(to, from, n); + return len; + if (access_ok(iter_to, len)) { + from += progress; + instrument_copy_to_user(iter_to, from, len); + len = raw_copy_to_user(iter_to, from, len); } - return n; + return len; } -static int copyout_nofault(void __user *to, const void *from, size_t n) +static __always_inline +size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress, + size_t len, void *from, void *priv2) { - long res; + ssize_t res; if (should_fail_usercopy()) - return n; - - res = copy_to_user_nofault(to, from, n); + return len; - return res < 0 ? n : res; + from += progress; + res = copy_to_user_nofault(iter_to, from, len); + return res < 0 ? len : res; } -static int copyin(void *to, const void __user *from, size_t n) +static __always_inline +size_t copy_from_user_iter(void __user *iter_from, size_t progress, + size_t len, void *to, void *priv2) { - size_t res = n; + size_t res = len; if (should_fail_usercopy()) - return n; - if (access_ok(from, n)) { - instrument_copy_from_user_before(to, from, n); - res = raw_copy_from_user(to, from, n); - instrument_copy_from_user_after(to, from, n, res); + return len; + if (access_ok(iter_from, len)) { + to += progress; + instrument_copy_from_user_before(to, iter_from, len); + res = raw_copy_from_user(to, iter_from, len); + instrument_copy_from_user_after(to, iter_from, len, res); } return res; } +static __always_inline +size_t memcpy_to_iter(void *iter_to, size_t progress, + size_t len, void *from, void *priv2) +{ + memcpy(iter_to, from + progress, len); + return 0; +} + +static __always_inline +size_t memcpy_from_iter(void *iter_from, size_t progress, + size_t len, void *to, void *priv2) +{ + memcpy(to + progress, iter_from, len); + return 0; +} + /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator @@ -290,7 +168,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, .iter_type = ITER_IOVEC, .copy_mc = false, .nofault = false, - .user_backed = true, .data_source = direction, .__iov = iov, .nr_segs = nr_segs, @@ -300,36 +177,35 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_init); -static __wsum csum_and_memcpy(void *to, const void *from, size_t len, - __wsum sum, size_t off) -{ - __wsum next = csum_partial_copy_nocheck(from, to, len); - return csum_block_add(sum, next, off); -} - size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); - iterate_and_advance(i, bytes, base, len, off, - copyout(base, addr + off, len), - memcpy(base, addr + off, len) - ) - - return bytes; + return iterate_and_advance(i, bytes, (void *)addr, + copy_to_user_iter, memcpy_to_iter); } EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_COPY_MC -static int copyout_mc(void __user *to, const void *from, size_t n) -{ - if (access_ok(to, n)) { - instrument_copy_to_user(to, from, n); - n = copy_mc_to_user((__force void *) to, from, n); +static __always_inline +size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress, + size_t len, 
void *from, void *priv2) +{ + if (access_ok(iter_to, len)) { + from += progress; + instrument_copy_to_user(iter_to, from, len); + len = copy_mc_to_user(iter_to, from, len); } - return n; + return len; +} + +static __always_inline +size_t memcpy_to_iter_mc(void *iter_to, size_t progress, + size_t len, void *from, void *priv2) +{ + return copy_mc_to_kernel(iter_to, from + progress, len); } /** @@ -362,22 +238,35 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) return 0; if (user_backed_iter(i)) might_fault(); - __iterate_and_advance(i, bytes, base, len, off, - copyout_mc(base, addr + off, len), - copy_mc_to_kernel(base, addr + off, len) - ) - - return bytes; + return iterate_and_advance(i, bytes, (void *)addr, + copy_to_user_iter_mc, memcpy_to_iter_mc); } EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ -static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from, - size_t size) +static __always_inline +size_t memcpy_from_iter_mc(void *iter_from, size_t progress, + size_t len, void *to, void *priv2) +{ + return copy_mc_to_kernel(to + progress, iter_from, len); +} + +static size_t __copy_from_iter_mc(void *addr, size_t bytes, struct iov_iter *i) +{ + if (unlikely(i->count < bytes)) + bytes = i->count; + if (unlikely(!bytes)) + return 0; + return iterate_bvec(i, bytes, addr, NULL, memcpy_from_iter_mc); +} + +static __always_inline +size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { - if (iov_iter_is_copy_mc(i)) - return (void *)copy_mc_to_kernel(to, from, size); - return memcpy(to, from, size); + if (unlikely(iov_iter_is_copy_mc(i))) + return __copy_from_iter_mc(addr, bytes, i); + return iterate_and_advance(i, bytes, addr, + copy_from_user_iter, memcpy_from_iter); } size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) @@ -387,30 +276,44 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) if (user_backed_iter(i)) might_fault(); - iterate_and_advance(i, bytes, base, len, off, - copyin(addr + off, base, len), - memcpy_from_iter(i, addr + off, base, len) - ) - - return bytes; + return __copy_from_iter(addr, bytes, i); } EXPORT_SYMBOL(_copy_from_iter); +static __always_inline +size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress, + size_t len, void *to, void *priv2) +{ + return __copy_from_user_inatomic_nocache(to + progress, iter_from, len); +} + size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; - iterate_and_advance(i, bytes, base, len, off, - __copy_from_user_inatomic_nocache(addr + off, base, len), - memcpy(addr + off, base, len) - ) - - return bytes; + return iterate_and_advance(i, bytes, addr, + copy_from_user_iter_nocache, + memcpy_from_iter); } EXPORT_SYMBOL(_copy_from_iter_nocache); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +static __always_inline +size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress, + size_t len, void *to, void *priv2) +{ + return __copy_from_user_flushcache(to + progress, iter_from, len); +} + +static __always_inline +size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress, + size_t len, void *to, void *priv2) +{ + memcpy_flushcache(to + progress, iter_from, len); + return 0; +} + /** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address @@ -432,12 +335,9 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) if 
(WARN_ON_ONCE(!i->data_source)) return 0; - iterate_and_advance(i, bytes, base, len, off, - __copy_from_user_flushcache(addr + off, base, len), - memcpy_flushcache(addr + off, base, len) - ) - - return bytes; + return iterate_and_advance(i, bytes, addr, + copy_from_user_iter_flushcache, + memcpy_from_iter_flushcache); } EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); #endif @@ -509,10 +409,9 @@ size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t byte void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); - iterate_and_advance(i, n, base, len, off, - copyout_nofault(base, kaddr + offset + off, len), - memcpy(base, kaddr + offset + off, len) - ) + n = iterate_and_advance(i, bytes, kaddr, + copy_to_user_iter_nofault, + memcpy_to_iter); kunmap_local(kaddr); res += n; bytes -= n; @@ -555,14 +454,25 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); -size_t iov_iter_zero(size_t bytes, struct iov_iter *i) +static __always_inline +size_t zero_to_user_iter(void __user *iter_to, size_t progress, + size_t len, void *priv, void *priv2) { - iterate_and_advance(i, bytes, base, len, count, - clear_user(base, len), - memset(base, 0, len) - ) + return clear_user(iter_to, len); +} + +static __always_inline +size_t zero_to_iter(void *iter_to, size_t progress, + size_t len, void *priv, void *priv2) +{ + memset(iter_to, 0, len); + return 0; +} - return bytes; +size_t iov_iter_zero(size_t bytes, struct iov_iter *i) +{ + return iterate_and_advance(i, bytes, NULL, + zero_to_user_iter, zero_to_iter); } EXPORT_SYMBOL(iov_iter_zero); @@ -587,10 +497,7 @@ size_t copy_page_from_iter_atomic(struct page *page, size_t offset, } p = kmap_atomic(page) + offset; - iterate_and_advance(i, n, base, len, off, - copyin(p + off, base, len), - memcpy_from_iter(i, p + off, base, len) - ) + n = __copy_from_iter(p, n, i); kunmap_atomic(p); copied += n; offset += n; @@ -1181,78 +1088,6 @@ ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); -size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, - struct iov_iter *i) -{ - __wsum sum, next; - sum = *csum; - if (WARN_ON_ONCE(!i->data_source)) - return 0; - - iterate_and_advance(i, bytes, base, len, off, ({ - next = csum_and_copy_from_user(base, addr + off, len); - sum = csum_block_add(sum, next, off); - next ? 0 : len; - }), ({ - sum = csum_and_memcpy(addr + off, base, len, sum, off); - }) - ) - *csum = sum; - return bytes; -} -EXPORT_SYMBOL(csum_and_copy_from_iter); - -size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, - struct iov_iter *i) -{ - struct csum_state *csstate = _csstate; - __wsum sum, next; - - if (WARN_ON_ONCE(i->data_source)) - return 0; - if (unlikely(iov_iter_is_discard(i))) { - // can't use csum_memcpy() for that one - data is not copied - csstate->csum = csum_block_add(csstate->csum, - csum_partial(addr, bytes, 0), - csstate->off); - csstate->off += bytes; - return bytes; - } - - sum = csum_shift(csstate->csum, csstate->off); - iterate_and_advance(i, bytes, base, len, off, ({ - next = csum_and_copy_to_user(addr + off, base, len); - sum = csum_block_add(sum, next, off); - next ? 
0 : len; - }), ({ - sum = csum_and_memcpy(base, addr + off, len, sum, off); - }) - ) - csstate->csum = csum_shift(sum, csstate->off); - csstate->off += bytes; - return bytes; -} -EXPORT_SYMBOL(csum_and_copy_to_iter); - -size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, - struct iov_iter *i) -{ -#ifdef CONFIG_CRYPTO_HASH - struct ahash_request *hash = hashp; - struct scatterlist sg; - size_t copied; - - copied = copy_to_iter(addr, bytes, i); - sg_init_one(&sg, addr, copied); - ahash_request_set_crypt(hash, &sg, NULL, copied); - crypto_ahash_update(hash); - return copied; -#else - return 0; -#endif -} -EXPORT_SYMBOL(hash_and_copy_to_iter); - static int iov_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 7c44b7ae4c5c..fb9a2f06dd1e 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -254,10 +254,10 @@ static int init_uevent_argv(struct kobj_uevent_env *env, const char *subsystem) int buffer_size = sizeof(env->buf) - env->buflen; int len; - len = strlcpy(&env->buf[env->buflen], subsystem, buffer_size); - if (len >= buffer_size) { - pr_warn("init_uevent_argv: buffer size of %d too small, needed %d\n", - buffer_size, len); + len = strscpy(&env->buf[env->buflen], subsystem, buffer_size); + if (len < 0) { + pr_warn("%s: insufficient buffer space (%u left) for %s\n", + __func__, buffer_size, subsystem); return -ENOMEM; } diff --git a/lib/kunit/assert.c b/lib/kunit/assert.c index 05a09652f5a1..dd1d633d0fe2 100644 --- a/lib/kunit/assert.c +++ b/lib/kunit/assert.c @@ -89,8 +89,7 @@ void kunit_ptr_not_err_assert_format(const struct kunit_assert *assert, EXPORT_SYMBOL_GPL(kunit_ptr_not_err_assert_format); /* Checks if `text` is a literal representing `value`, e.g. 
"5" and 5 */ -static bool is_literal(struct kunit *test, const char *text, long long value, - gfp_t gfp) +static bool is_literal(const char *text, long long value) { char *buffer; int len; @@ -100,14 +99,15 @@ static bool is_literal(struct kunit *test, const char *text, long long value, if (strlen(text) != len) return false; - buffer = kunit_kmalloc(test, len+1, gfp); + buffer = kmalloc(len+1, GFP_KERNEL); if (!buffer) return false; snprintf(buffer, len+1, "%lld", value); ret = strncmp(buffer, text, len) == 0; - kunit_kfree(test, buffer); + kfree(buffer); + return ret; } @@ -125,14 +125,12 @@ void kunit_binary_assert_format(const struct kunit_assert *assert, binary_assert->text->left_text, binary_assert->text->operation, binary_assert->text->right_text); - if (!is_literal(stream->test, binary_assert->text->left_text, - binary_assert->left_value, stream->gfp)) + if (!is_literal(binary_assert->text->left_text, binary_assert->left_value)) string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s == %lld (0x%llx)\n", binary_assert->text->left_text, binary_assert->left_value, binary_assert->left_value); - if (!is_literal(stream->test, binary_assert->text->right_text, - binary_assert->right_value, stream->gfp)) + if (!is_literal(binary_assert->text->right_text, binary_assert->right_value)) string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s == %lld (0x%llx)", binary_assert->text->right_text, binary_assert->right_value, diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c index 22c5c496a68f..270d185737e6 100644 --- a/lib/kunit/debugfs.c +++ b/lib/kunit/debugfs.c @@ -37,14 +37,21 @@ void kunit_debugfs_init(void) debugfs_rootdir = debugfs_create_dir(KUNIT_DEBUGFS_ROOT, NULL); } -static void debugfs_print_result(struct seq_file *seq, - struct kunit_suite *suite, - struct kunit_case *test_case) +static void debugfs_print_result(struct seq_file *seq, struct string_stream *log) { - if (!test_case || !test_case->log) + struct string_stream_fragment *frag_container; + + if (!log) return; - seq_printf(seq, "%s", test_case->log); + /* + * Walk the fragments so we don't need to allocate a temporary + * buffer to hold the entire string. + */ + spin_lock(&log->lock); + list_for_each_entry(frag_container, &log->fragments, node) + seq_printf(seq, "%s", frag_container->fragment); + spin_unlock(&log->lock); } /* @@ -69,10 +76,9 @@ static int debugfs_print_results(struct seq_file *seq, void *v) seq_printf(seq, KUNIT_SUBTEST_INDENT "1..%zd\n", kunit_suite_num_test_cases(suite)); kunit_suite_for_each_test_case(suite, test_case) - debugfs_print_result(seq, suite, test_case); + debugfs_print_result(seq, test_case->log); - if (suite->log) - seq_printf(seq, "%s", suite->log); + debugfs_print_result(seq, suite->log); seq_printf(seq, "%s %d %s\n", kunit_status_to_ok_not_ok(success), 1, suite->name); @@ -105,9 +111,13 @@ void kunit_debugfs_create_suite(struct kunit_suite *suite) struct kunit_case *test_case; /* Allocate logs before creating debugfs representation. 
*/ - suite->log = kzalloc(KUNIT_LOG_SIZE, GFP_KERNEL); - kunit_suite_for_each_test_case(suite, test_case) - test_case->log = kzalloc(KUNIT_LOG_SIZE, GFP_KERNEL); + suite->log = alloc_string_stream(GFP_KERNEL); + string_stream_set_append_newlines(suite->log, true); + + kunit_suite_for_each_test_case(suite, test_case) { + test_case->log = alloc_string_stream(GFP_KERNEL); + string_stream_set_append_newlines(test_case->log, true); + } suite->debugfs = debugfs_create_dir(suite->name, debugfs_rootdir); @@ -121,7 +131,7 @@ void kunit_debugfs_destroy_suite(struct kunit_suite *suite) struct kunit_case *test_case; debugfs_remove_recursive(suite->debugfs); - kfree(suite->log); + string_stream_destroy(suite->log); kunit_suite_for_each_test_case(suite, test_case) - kfree(test_case->log); + string_stream_destroy(test_case->log); } diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c index a6348489d45f..1236b3cd2fbb 100644 --- a/lib/kunit/executor.c +++ b/lib/kunit/executor.c @@ -137,8 +137,10 @@ void kunit_free_suite_set(struct kunit_suite_set suite_set) { struct kunit_suite * const *suites; - for (suites = suite_set.start; suites < suite_set.end; suites++) + for (suites = suite_set.start; suites < suite_set.end; suites++) { + kfree((*suites)->test_cases); kfree(*suites); + } kfree(suite_set.start); } @@ -155,10 +157,11 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, struct kunit_suite_set filtered = {NULL, NULL}; struct kunit_glob_filter parsed_glob; struct kunit_attr_filter *parsed_filters = NULL; + struct kunit_suite * const *suites; const size_t max = suite_set->end - suite_set->start; - copy = kmalloc_array(max, sizeof(*filtered.start), GFP_KERNEL); + copy = kcalloc(max, sizeof(*filtered.start), GFP_KERNEL); if (!copy) { /* won't be able to run anything, return an empty set */ return filtered; } @@ -193,7 +196,7 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, parsed_glob.test_glob); if (IS_ERR(filtered_suite)) { *err = PTR_ERR(filtered_suite); - goto free_parsed_filters; + goto free_filtered_suite; } } if (filter_count > 0 && parsed_filters != NULL) { @@ -210,11 +213,11 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, filtered_suite = new_filtered_suite; if (*err) - goto free_parsed_filters; + goto free_filtered_suite; if (IS_ERR(filtered_suite)) { *err = PTR_ERR(filtered_suite); - goto free_parsed_filters; + goto free_filtered_suite; } if (!filtered_suite) break; @@ -229,6 +232,14 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, filtered.start = copy_start; filtered.end = copy; +free_filtered_suite: + if (*err) { + for (suites = copy_start; suites < copy; suites++) { + kfree((*suites)->test_cases); + kfree(*suites); + } + } + free_parsed_filters: if (filter_count) kfree(parsed_filters); @@ -241,7 +252,7 @@ free_parsed_glob: free_copy: if (*err) - kfree(copy); + kfree(copy_start); return filtered; } diff --git a/lib/kunit/executor_test.c b/lib/kunit/executor_test.c index b4f6f96b2844..22d4ee86dbed 100644 --- a/lib/kunit/executor_test.c +++ b/lib/kunit/executor_test.c @@ -9,7 +9,7 @@ #include <kunit/test.h> #include <kunit/attributes.h> -static void kfree_at_end(struct kunit *test, const void *to_free); +static void free_suite_set_at_end(struct kunit *test, const void *to_free); static struct kunit_suite *alloc_fake_suite(struct kunit *test, const char *suite_name, struct kunit_case *test_cases); @@ -56,7 +56,7 @@ static void filter_suites_test(struct kunit *test) got = kunit_filter_suites(&suite_set, "suite2", NULL, NULL, &err); 
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start); KUNIT_ASSERT_EQ(test, err, 0); - kfree_at_end(test, got.start); + free_suite_set_at_end(test, &got); /* Validate we just have suite2 */ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start[0]); @@ -82,7 +82,7 @@ static void filter_suites_test_glob_test(struct kunit *test) got = kunit_filter_suites(&suite_set, "suite2.test2", NULL, NULL, &err); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start); KUNIT_ASSERT_EQ(test, err, 0); - kfree_at_end(test, got.start); + free_suite_set_at_end(test, &got); /* Validate we just have suite2 */ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start[0]); @@ -109,7 +109,7 @@ static void filter_suites_to_empty_test(struct kunit *test) got = kunit_filter_suites(&suite_set, "not_found", NULL, NULL, &err); KUNIT_ASSERT_EQ(test, err, 0); - kfree_at_end(test, got.start); /* just in case */ + free_suite_set_at_end(test, &got); /* just in case */ KUNIT_EXPECT_PTR_EQ_MSG(test, got.start, got.end, "should be empty to indicate no match"); @@ -172,7 +172,7 @@ static void filter_attr_test(struct kunit *test) got = kunit_filter_suites(&suite_set, NULL, filter, NULL, &err); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start); KUNIT_ASSERT_EQ(test, err, 0); - kfree_at_end(test, got.start); + free_suite_set_at_end(test, &got); /* Validate we just have normal_suite */ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start[0]); @@ -200,7 +200,7 @@ static void filter_attr_empty_test(struct kunit *test) got = kunit_filter_suites(&suite_set, NULL, filter, NULL, &err); KUNIT_ASSERT_EQ(test, err, 0); - kfree_at_end(test, got.start); /* just in case */ + free_suite_set_at_end(test, &got); /* just in case */ KUNIT_EXPECT_PTR_EQ_MSG(test, got.start, got.end, "should be empty to indicate no match"); @@ -222,7 +222,7 @@ static void filter_attr_skip_test(struct kunit *test) got = kunit_filter_suites(&suite_set, NULL, filter, "skip", &err); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start); KUNIT_ASSERT_EQ(test, err, 0); - kfree_at_end(test, got.start); + free_suite_set_at_end(test, &got); /* Validate we have both the slow and normal test */ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, got.start[0]->test_cases); @@ -256,18 +256,26 @@ kunit_test_suites(&executor_test_suite); /* Test helpers */ -/* Use the resource API to register a call to kfree(to_free). +static void free_suite_set(void *suite_set) +{ + kunit_free_suite_set(*(struct kunit_suite_set *)suite_set); + kfree(suite_set); +} + +/* Use the resource API to register a call to free_suite_set. * Since we never actually use the resource, it's safe to use on const data. */ -static void kfree_at_end(struct kunit *test, const void *to_free) +static void free_suite_set_at_end(struct kunit *test, const void *to_free) { - /* kfree() handles NULL already, but avoid allocating a no-op cleanup. 
*/ - if (IS_ERR_OR_NULL(to_free)) + struct kunit_suite_set *free; + + if (!((struct kunit_suite_set *)to_free)->start) return; - kunit_add_action(test, - (kunit_action_t *)kfree, - (void *)to_free); + free = kzalloc(sizeof(struct kunit_suite_set), GFP_KERNEL); + *free = *(struct kunit_suite_set *)to_free; + + kunit_add_action(test, free_suite_set, (void *)free); } static struct kunit_suite *alloc_fake_suite(struct kunit *test, diff --git a/lib/kunit/kunit-example-test.c b/lib/kunit/kunit-example-test.c index 01a769f35e1d..6bb5c2ef6696 100644 --- a/lib/kunit/kunit-example-test.c +++ b/lib/kunit/kunit-example-test.c @@ -190,6 +190,7 @@ static void example_static_stub_test(struct kunit *test) static const struct example_param { int value; } example_params_array[] = { + { .value = 3, }, { .value = 2, }, { .value = 1, }, { .value = 0, }, @@ -213,8 +214,8 @@ static void example_params_test(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, param); /* Test can be skipped on unsupported param values */ - if (!param->value) - kunit_skip(test, "unsupported param value"); + if (!is_power_of_2(param->value)) + kunit_skip(test, "unsupported param value %d", param->value); /* You can use param values for parameterized testing */ KUNIT_EXPECT_EQ(test, param->value % param->value, 0); diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index 83d8e90ca7a2..99d2a3a528e1 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -8,6 +8,7 @@ #include <kunit/test.h> #include <kunit/test-bug.h> +#include "string-stream.h" #include "try-catch-impl.h" struct kunit_try_catch_test_context { @@ -530,12 +531,27 @@ static struct kunit_suite kunit_resource_test_suite = { .test_cases = kunit_resource_test_cases, }; +/* + * Log tests call string_stream functions, which aren't exported. So only + * build this code if this test is built-in. + */ +#if IS_BUILTIN(CONFIG_KUNIT_TEST) + +/* This avoids a cast warning if kfree() is passed direct to kunit_add_action(). 
*/ +static void kfree_wrapper(void *p) +{ + kfree(p); +} + static void kunit_log_test(struct kunit *test) { struct kunit_suite suite; - - suite.log = kunit_kzalloc(test, KUNIT_LOG_SIZE, GFP_KERNEL); +#ifdef CONFIG_KUNIT_DEBUGFS + char *full_log; +#endif + suite.log = kunit_alloc_string_stream(test, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, suite.log); + string_stream_set_append_newlines(suite.log, true); kunit_log(KERN_INFO, test, "put this in log."); kunit_log(KERN_INFO, test, "this too."); @@ -543,14 +559,21 @@ static void kunit_log_test(struct kunit *test) kunit_log(KERN_INFO, &suite, "along with this."); #ifdef CONFIG_KUNIT_DEBUGFS + KUNIT_EXPECT_TRUE(test, test->log->append_newlines); + + full_log = string_stream_get_string(test->log); + kunit_add_action(test, (kunit_action_t *)kfree, full_log); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, - strstr(test->log, "put this in log.")); + strstr(full_log, "put this in log.")); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, - strstr(test->log, "this too.")); + strstr(full_log, "this too.")); + + full_log = string_stream_get_string(suite.log); + kunit_add_action(test, kfree_wrapper, full_log); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, - strstr(suite.log, "add to suite log.")); + strstr(full_log, "add to suite log.")); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, - strstr(suite.log, "along with this.")); + strstr(full_log, "along with this.")); #else KUNIT_EXPECT_NULL(test, test->log); #endif @@ -558,15 +581,30 @@ static void kunit_log_test(struct kunit *test) static void kunit_log_newline_test(struct kunit *test) { + char *full_log; + kunit_info(test, "Add newline\n"); if (test->log) { - KUNIT_ASSERT_NOT_NULL_MSG(test, strstr(test->log, "Add newline\n"), - "Missing log line, full log:\n%s", test->log); - KUNIT_EXPECT_NULL(test, strstr(test->log, "Add newline\n\n")); + full_log = string_stream_get_string(test->log); + kunit_add_action(test, kfree_wrapper, full_log); + KUNIT_ASSERT_NOT_NULL_MSG(test, strstr(full_log, "Add newline\n"), + "Missing log line, full log:\n%s", full_log); + KUNIT_EXPECT_NULL(test, strstr(full_log, "Add newline\n\n")); } else { kunit_skip(test, "only useful when debugfs is enabled"); } } +#else +static void kunit_log_test(struct kunit *test) +{ + kunit_skip(test, "Log tests only run when built-in"); +} + +static void kunit_log_newline_test(struct kunit *test) +{ + kunit_skip(test, "Log tests only run when built-in"); +} +#endif /* IS_BUILTIN(CONFIG_KUNIT_TEST) */ static struct kunit_case kunit_log_test_cases[] = { KUNIT_CASE(kunit_log_test), diff --git a/lib/kunit/string-stream-test.c b/lib/kunit/string-stream-test.c index 110f3a993250..06822766f29a 100644 --- a/lib/kunit/string-stream-test.c +++ b/lib/kunit/string-stream-test.c @@ -6,48 +6,539 @@ * Author: Brendan Higgins <brendanhiggins@google.com> */ +#include <kunit/static_stub.h> #include <kunit/test.h> +#include <linux/ktime.h> #include <linux/slab.h> +#include <linux/timekeeping.h> #include "string-stream.h" -static void string_stream_test_empty_on_creation(struct kunit *test) +struct string_stream_test_priv { + /* For testing resource-managed free. */ + struct string_stream *expected_free_stream; + bool stream_was_freed; + bool stream_free_again; +}; + +/* Avoids a cast warning if kfree() is passed direct to kunit_add_action(). */ +static void kfree_wrapper(void *p) +{ + kfree(p); +} + +/* Avoids a cast warning if string_stream_destroy() is passed direct to kunit_add_action(). 
*/ +static void cleanup_raw_stream(void *p) +{ + struct string_stream *stream = p; + + string_stream_destroy(stream); +} + +static char *get_concatenated_string(struct kunit *test, struct string_stream *stream) +{ + char *str = string_stream_get_string(stream); + + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, str); + kunit_add_action(test, kfree_wrapper, (void *)str); + + return str; +} + +/* Managed string_stream object is initialized correctly. */ +static void string_stream_managed_init_test(struct kunit *test) +{ + struct string_stream *stream; + + /* Resource-managed initialization. */ + stream = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream); + + KUNIT_EXPECT_EQ(test, stream->length, 0); + KUNIT_EXPECT_TRUE(test, list_empty(&stream->fragments)); + KUNIT_EXPECT_TRUE(test, (stream->gfp == GFP_KERNEL)); + KUNIT_EXPECT_FALSE(test, stream->append_newlines); + KUNIT_EXPECT_TRUE(test, string_stream_is_empty(stream)); +} + +/* Unmanaged string_stream object is initialized correctly. */ +static void string_stream_unmanaged_init_test(struct kunit *test) +{ + struct string_stream *stream; + + stream = alloc_string_stream(GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream); + kunit_add_action(test, cleanup_raw_stream, stream); + + KUNIT_EXPECT_EQ(test, stream->length, 0); + KUNIT_EXPECT_TRUE(test, list_empty(&stream->fragments)); + KUNIT_EXPECT_EQ(test, stream->gfp, GFP_KERNEL); + KUNIT_EXPECT_FALSE(test, stream->append_newlines); + + KUNIT_EXPECT_TRUE(test, string_stream_is_empty(stream)); +} + +static void string_stream_destroy_stub(struct string_stream *stream) +{ + struct kunit *fake_test = kunit_get_current_test(); + struct string_stream_test_priv *priv = fake_test->priv; + + /* The kunit could own string_streams other than the one we are testing. */ + if (stream == priv->expected_free_stream) { + if (priv->stream_was_freed) + priv->stream_free_again = true; + else + priv->stream_was_freed = true; + } + + /* + * Calling string_stream_destroy() will only call this function again + * because the redirection stub is still active. + * Avoid calling deactivate_static_stub() or changing current->kunit_test + * during cleanup. + */ + string_stream_clear(stream); + kfree(stream); +} + +/* kunit_free_string_stream() calls string_stream_destroy() */ +static void string_stream_managed_free_test(struct kunit *test) +{ + struct string_stream_test_priv *priv = test->priv; + + priv->expected_free_stream = NULL; + priv->stream_was_freed = false; + priv->stream_free_again = false; + + kunit_activate_static_stub(test, + string_stream_destroy, + string_stream_destroy_stub); + + priv->expected_free_stream = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, priv->expected_free_stream); + + /* This should call the stub function. */ + kunit_free_string_stream(test, priv->expected_free_stream); + + KUNIT_EXPECT_TRUE(test, priv->stream_was_freed); + KUNIT_EXPECT_FALSE(test, priv->stream_free_again); +} + +/* string_stream object is freed when test is cleaned up. */ +static void string_stream_resource_free_test(struct kunit *test) +{ + struct string_stream_test_priv *priv = test->priv; + struct kunit *fake_test; + + fake_test = kunit_kzalloc(test, sizeof(*fake_test), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, fake_test); + + kunit_init_test(fake_test, "string_stream_fake_test", NULL); + fake_test->priv = priv; + + /* + * Activate stub before creating string_stream so the + * string_stream will be cleaned up first.
+ */ + priv->expected_free_stream = NULL; + priv->stream_was_freed = false; + priv->stream_free_again = false; + + kunit_activate_static_stub(fake_test, + string_stream_destroy, + string_stream_destroy_stub); + + priv->expected_free_stream = kunit_alloc_string_stream(fake_test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, priv->expected_free_stream); + + /* Set current->kunit_test to fake_test so the static stub will be called. */ + current->kunit_test = fake_test; + + /* Cleanup test - the stub function should be called */ + kunit_cleanup(fake_test); + + /* Set current->kunit_test back to current test. */ + current->kunit_test = test; + + KUNIT_EXPECT_TRUE(test, priv->stream_was_freed); + KUNIT_EXPECT_FALSE(test, priv->stream_free_again); +} + +/* + * Add a series of lines to a string_stream. Check that all lines + * appear in the correct order and no characters are dropped. + */ +static void string_stream_line_add_test(struct kunit *test) +{ + struct string_stream *stream; + char line[60]; + char *concat_string, *pos, *string_end; + size_t len, total_len; + int num_lines, i; + + stream = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream); + + /* Add series of sequence numbered lines */ + total_len = 0; + for (i = 0; i < 100; ++i) { + len = snprintf(line, sizeof(line), + "The quick brown fox jumps over the lazy penguin %d\n", i); + + /* Sanity-check that our test string isn't truncated */ + KUNIT_ASSERT_LT(test, len, sizeof(line)); + + string_stream_add(stream, line); + total_len += len; + } + num_lines = i; + + concat_string = get_concatenated_string(test, stream); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, concat_string); + KUNIT_EXPECT_EQ(test, strlen(concat_string), total_len); + + /* + * Split the concatenated string at the newlines and check that + * all the original added strings are present. + */ + pos = concat_string; + for (i = 0; i < num_lines; ++i) { + string_end = strchr(pos, '\n'); + KUNIT_EXPECT_NOT_NULL(test, string_end); + + /* Convert to NULL-terminated string */ + *string_end = '\0'; + + snprintf(line, sizeof(line), + "The quick brown fox jumps over the lazy penguin %d", i); + KUNIT_EXPECT_STREQ(test, pos, line); + + pos = string_end + 1; + } + + /* There shouldn't be any more data after this */ + KUNIT_EXPECT_EQ(test, strlen(pos), 0); +} + +/* Add a series of lines of variable length to a string_stream. */ +static void string_stream_variable_length_line_test(struct kunit *test) +{ + static const char line[] = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + " 0123456789!$%^&*()_-+={}[]:;@'~#<>,.?/|"; + struct string_stream *stream; + struct rnd_state rnd; + char *concat_string, *pos, *string_end; + size_t offset, total_len; + int num_lines, i; + + stream = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream); + + /* + * Log many lines of varying lengths until we have created + * many fragments. + * The "randomness" must be repeatable. 
+ */ + prandom_seed_state(&rnd, 3141592653589793238ULL); + total_len = 0; + for (i = 0; i < 100; ++i) { + offset = prandom_u32_state(&rnd) % (sizeof(line) - 1); + string_stream_add(stream, "%s\n", &line[offset]); + total_len += sizeof(line) - offset; + } + num_lines = i; + + concat_string = get_concatenated_string(test, stream); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, concat_string); + KUNIT_EXPECT_EQ(test, strlen(concat_string), total_len); + + /* + * Split the concatenated string at the newlines and check that + * all the original added strings are present. + */ + prandom_seed_state(&rnd, 3141592653589793238ULL); + pos = concat_string; + for (i = 0; i < num_lines; ++i) { + string_end = strchr(pos, '\n'); + KUNIT_EXPECT_NOT_NULL(test, string_end); + + /* Convert to NULL-terminated string */ + *string_end = '\0'; + + offset = prandom_u32_state(&rnd) % (sizeof(line) - 1); + KUNIT_EXPECT_STREQ(test, pos, &line[offset]); + + pos = string_end + 1; + } + + /* There shouldn't be any more data after this */ + KUNIT_EXPECT_EQ(test, strlen(pos), 0); +} + +/* Appending the content of one string stream to another. */ +static void string_stream_append_test(struct kunit *test) +{ + static const char * const strings_1[] = { + "one", "two", "three", "four", "five", "six", + "seven", "eight", "nine", "ten", + }; + static const char * const strings_2[] = { + "Apple", "Pear", "Orange", "Banana", "Grape", "Apricot", + }; + struct string_stream *stream_1, *stream_2; + const char *stream1_content_before_append, *stream_2_content; + char *combined_content; + size_t combined_length; + int i; + + stream_1 = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream_1); + + stream_2 = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream_2); + + /* Append content of empty stream to empty stream */ + string_stream_append(stream_1, stream_2); + KUNIT_EXPECT_EQ(test, strlen(get_concatenated_string(test, stream_1)), 0); + + /* Add some data to stream_1 */ + for (i = 0; i < ARRAY_SIZE(strings_1); ++i) + string_stream_add(stream_1, "%s\n", strings_1[i]); + + stream1_content_before_append = get_concatenated_string(test, stream_1); + + /* Append content of empty stream to non-empty stream */ + string_stream_append(stream_1, stream_2); + KUNIT_EXPECT_STREQ(test, get_concatenated_string(test, stream_1), + stream1_content_before_append); + + /* Add some data to stream_2 */ + for (i = 0; i < ARRAY_SIZE(strings_2); ++i) + string_stream_add(stream_2, "%s\n", strings_2[i]); + + /* Append content of non-empty stream to non-empty stream */ + string_stream_append(stream_1, stream_2); + + /* + * End result should be the original content of stream_1 plus + * the content of stream_2. 
+ */ + stream_2_content = get_concatenated_string(test, stream_2); + combined_length = strlen(stream1_content_before_append) + strlen(stream_2_content); + combined_length++; /* for terminating \0 */ + combined_content = kunit_kmalloc(test, combined_length, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, combined_content); + snprintf(combined_content, combined_length, "%s%s", + stream1_content_before_append, stream_2_content); + + KUNIT_EXPECT_STREQ(test, get_concatenated_string(test, stream_1), combined_content); + + /* Append content of non-empty stream to empty stream */ + kunit_free_string_stream(test, stream_1); + + stream_1 = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream_1); + + string_stream_append(stream_1, stream_2); + KUNIT_EXPECT_STREQ(test, get_concatenated_string(test, stream_1), stream_2_content); +} + +/* Appending the content of one string stream to one with auto-newlining. */ +static void string_stream_append_auto_newline_test(struct kunit *test) +{ + struct string_stream *stream_1, *stream_2; + + /* Stream 1 has newline appending enabled */ + stream_1 = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream_1); + string_stream_set_append_newlines(stream_1, true); + KUNIT_EXPECT_TRUE(test, stream_1->append_newlines); + + /* Stream 2 does not append newlines */ + stream_2 = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream_2); + + /* Appending a stream with a newline should not add another newline */ + string_stream_add(stream_1, "Original string\n"); + string_stream_add(stream_2, "Appended content\n"); + string_stream_add(stream_2, "More stuff\n"); + string_stream_append(stream_1, stream_2); + KUNIT_EXPECT_STREQ(test, get_concatenated_string(test, stream_1), + "Original string\nAppended content\nMore stuff\n"); + + kunit_free_string_stream(test, stream_2); + stream_2 = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream_2); + + /* + * Appending a stream without newline should add a final newline. + * The appended string_stream is treated as a single string so newlines + * should not be inserted between fragments. + */ + string_stream_add(stream_2, "Another"); + string_stream_add(stream_2, "And again"); + string_stream_append(stream_1, stream_2); + KUNIT_EXPECT_STREQ(test, get_concatenated_string(test, stream_1), + "Original string\nAppended content\nMore stuff\nAnotherAnd again\n"); +} + +/* Adding an empty string should not create a fragment. 
*/ +static void string_stream_append_empty_string_test(struct kunit *test) { - struct string_stream *stream = alloc_string_stream(test, GFP_KERNEL); + struct string_stream *stream; + int original_frag_count; + + stream = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream); + /* Formatted empty string */ + string_stream_add(stream, "%s", ""); KUNIT_EXPECT_TRUE(test, string_stream_is_empty(stream)); + KUNIT_EXPECT_TRUE(test, list_empty(&stream->fragments)); + + /* Adding an empty string to a non-empty stream */ + string_stream_add(stream, "Add this line"); + original_frag_count = list_count_nodes(&stream->fragments); + + string_stream_add(stream, "%s", ""); + KUNIT_EXPECT_EQ(test, list_count_nodes(&stream->fragments), original_frag_count); + KUNIT_EXPECT_STREQ(test, get_concatenated_string(test, stream), "Add this line"); } -static void string_stream_test_not_empty_after_add(struct kunit *test) +/* Adding strings without automatic newline appending */ +static void string_stream_no_auto_newline_test(struct kunit *test) { - struct string_stream *stream = alloc_string_stream(test, GFP_KERNEL); + struct string_stream *stream; - string_stream_add(stream, "Foo"); + stream = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream); - KUNIT_EXPECT_FALSE(test, string_stream_is_empty(stream)); + /* + * Add some strings with and without newlines. All formatted newlines + * should be preserved. It should not add any extra newlines. + */ + string_stream_add(stream, "One"); + string_stream_add(stream, "Two\n"); + string_stream_add(stream, "%s\n", "Three"); + string_stream_add(stream, "%s", "Four\n"); + string_stream_add(stream, "Five\n%s", "Six"); + string_stream_add(stream, "Seven\n\n"); + string_stream_add(stream, "Eight"); + KUNIT_EXPECT_STREQ(test, get_concatenated_string(test, stream), + "OneTwo\nThree\nFour\nFive\nSixSeven\n\nEight"); } -static void string_stream_test_get_string(struct kunit *test) +/* Adding strings with automatic newline appending */ +static void string_stream_auto_newline_test(struct kunit *test) { - struct string_stream *stream = alloc_string_stream(test, GFP_KERNEL); - char *output; + struct string_stream *stream; + + stream = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream); + + string_stream_set_append_newlines(stream, true); + KUNIT_EXPECT_TRUE(test, stream->append_newlines); + + /* + * Add some strings with and without newlines. Newlines should + * be appended to lines that do not end with \n, but newlines + * resulting from the formatting should not be changed. + */ + string_stream_add(stream, "One"); + string_stream_add(stream, "Two\n"); + string_stream_add(stream, "%s\n", "Three"); + string_stream_add(stream, "%s", "Four\n"); + string_stream_add(stream, "Five\n%s", "Six"); + string_stream_add(stream, "Seven\n\n"); + string_stream_add(stream, "Eight"); + KUNIT_EXPECT_STREQ(test, get_concatenated_string(test, stream), + "One\nTwo\nThree\nFour\nFive\nSix\nSeven\n\nEight\n"); +} + +/* + * This doesn't actually "test" anything. It reports time taken + * and memory used for logging a large number of lines. 
+ */ +static void string_stream_performance_test(struct kunit *test) +{ + struct string_stream_fragment *frag_container; + struct string_stream *stream; + char test_line[101]; + ktime_t start_time, end_time; + size_t len, bytes_requested, actual_bytes_used, total_string_length; + int offset, i; + + stream = kunit_alloc_string_stream(test, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, stream); + + memset(test_line, 'x', sizeof(test_line) - 1); + test_line[sizeof(test_line) - 1] = '\0'; + + start_time = ktime_get(); + for (i = 0; i < 10000; i++) { + offset = i % (sizeof(test_line) - 1); + string_stream_add(stream, "%s: %d\n", &test_line[offset], i); + } + end_time = ktime_get(); + + /* + * Calculate memory used. This doesn't include invisible + * overhead due to kernel allocator fragment size rounding. + */ + bytes_requested = sizeof(*stream); + actual_bytes_used = ksize(stream); + total_string_length = 0; + + list_for_each_entry(frag_container, &stream->fragments, node) { + bytes_requested += sizeof(*frag_container); + actual_bytes_used += ksize(frag_container); + + len = strlen(frag_container->fragment); + total_string_length += len; + bytes_requested += len + 1; /* +1 for '\0' */ + actual_bytes_used += ksize(frag_container->fragment); + } + + kunit_info(test, "Time elapsed: %lld us\n", + ktime_us_delta(end_time, start_time)); + kunit_info(test, "Total string length: %zu\n", total_string_length); + kunit_info(test, "Bytes requested: %zu\n", bytes_requested); + kunit_info(test, "Actual bytes allocated: %zu\n", actual_bytes_used); +} + +static int string_stream_test_init(struct kunit *test) +{ + struct string_stream_test_priv *priv; + + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; - string_stream_add(stream, "Foo"); - string_stream_add(stream, " %s", "bar"); + test->priv = priv; - output = string_stream_get_string(stream); - KUNIT_ASSERT_STREQ(test, output, "Foo bar"); + return 0; } static struct kunit_case string_stream_test_cases[] = { - KUNIT_CASE(string_stream_test_empty_on_creation), - KUNIT_CASE(string_stream_test_not_empty_after_add), - KUNIT_CASE(string_stream_test_get_string), + KUNIT_CASE(string_stream_managed_init_test), + KUNIT_CASE(string_stream_unmanaged_init_test), + KUNIT_CASE(string_stream_managed_free_test), + KUNIT_CASE(string_stream_resource_free_test), + KUNIT_CASE(string_stream_line_add_test), + KUNIT_CASE(string_stream_variable_length_line_test), + KUNIT_CASE(string_stream_append_test), + KUNIT_CASE(string_stream_append_auto_newline_test), + KUNIT_CASE(string_stream_append_empty_string_test), + KUNIT_CASE(string_stream_no_auto_newline_test), + KUNIT_CASE(string_stream_auto_newline_test), + KUNIT_CASE(string_stream_performance_test), {} }; static struct kunit_suite string_stream_test_suite = { .name = "string-stream-test", - .test_cases = string_stream_test_cases + .test_cases = string_stream_test_cases, + .init = string_stream_test_init, }; kunit_test_suites(&string_stream_test_suite); diff --git a/lib/kunit/string-stream.c b/lib/kunit/string-stream.c index cc32743c1171..a6f3616c2048 100644 --- a/lib/kunit/string-stream.c +++ b/lib/kunit/string-stream.c @@ -6,6 +6,7 @@ * Author: Brendan Higgins <brendanhiggins@google.com> */ +#include <kunit/static_stub.h> #include <kunit/test.h> #include <linux/list.h> #include <linux/slab.h> @@ -13,30 +14,28 @@ #include "string-stream.h" -static struct string_stream_fragment *alloc_string_stream_fragment( - struct kunit *test, int len, gfp_t gfp) +static struct string_stream_fragment 
*alloc_string_stream_fragment(int len, gfp_t gfp) { struct string_stream_fragment *frag; - frag = kunit_kzalloc(test, sizeof(*frag), gfp); + frag = kzalloc(sizeof(*frag), gfp); if (!frag) return ERR_PTR(-ENOMEM); - frag->fragment = kunit_kmalloc(test, len, gfp); + frag->fragment = kmalloc(len, gfp); if (!frag->fragment) { - kunit_kfree(test, frag); + kfree(frag); return ERR_PTR(-ENOMEM); } return frag; } -static void string_stream_fragment_destroy(struct kunit *test, - struct string_stream_fragment *frag) +static void string_stream_fragment_destroy(struct string_stream_fragment *frag) { list_del(&frag->node); - kunit_kfree(test, frag->fragment); - kunit_kfree(test, frag); + kfree(frag->fragment); + kfree(frag); } int string_stream_vadd(struct string_stream *stream, @@ -44,26 +43,44 @@ int string_stream_vadd(struct string_stream *stream, va_list args) { struct string_stream_fragment *frag_container; - int len; + int buf_len, result_len; va_list args_for_counting; /* Make a copy because `vsnprintf` could change it */ va_copy(args_for_counting, args); - /* Need space for null byte. */ - len = vsnprintf(NULL, 0, fmt, args_for_counting) + 1; + /* Evaluate length of formatted string */ + buf_len = vsnprintf(NULL, 0, fmt, args_for_counting); va_end(args_for_counting); - frag_container = alloc_string_stream_fragment(stream->test, - len, - stream->gfp); + if (buf_len == 0) + return 0; + + /* Reserve one extra for possible appended newline. */ + if (stream->append_newlines) + buf_len++; + + /* Need space for null byte. */ + buf_len++; + + frag_container = alloc_string_stream_fragment(buf_len, stream->gfp); if (IS_ERR(frag_container)) return PTR_ERR(frag_container); - len = vsnprintf(frag_container->fragment, len, fmt, args); + if (stream->append_newlines) { + /* Don't include reserved newline byte in writeable length. */ + result_len = vsnprintf(frag_container->fragment, buf_len - 1, fmt, args); + + /* Append newline if necessary. */ + if (frag_container->fragment[result_len - 1] != '\n') + result_len = strlcat(frag_container->fragment, "\n", buf_len); + } else { + result_len = vsnprintf(frag_container->fragment, buf_len, fmt, args); + } + spin_lock(&stream->lock); - stream->length += len; + stream->length += result_len; list_add_tail(&frag_container->node, &stream->fragments); spin_unlock(&stream->lock); @@ -82,7 +99,7 @@ int string_stream_add(struct string_stream *stream, const char *fmt, ...) return result; } -static void string_stream_clear(struct string_stream *stream) +void string_stream_clear(struct string_stream *stream) { struct string_stream_fragment *frag_container, *frag_container_safe; @@ -91,7 +108,7 @@ static void string_stream_clear(struct string_stream *stream) frag_container_safe, &stream->fragments, node) { - string_stream_fragment_destroy(stream->test, frag_container); + string_stream_fragment_destroy(frag_container); } stream->length = 0; spin_unlock(&stream->lock); @@ -103,7 +120,7 @@ char *string_stream_get_string(struct string_stream *stream) size_t buf_len = stream->length + 1; /* +1 for null byte. 
*/ char *buf; - buf = kunit_kzalloc(stream->test, buf_len, stream->gfp); + buf = kzalloc(buf_len, stream->gfp); if (!buf) return NULL; @@ -119,13 +136,17 @@ int string_stream_append(struct string_stream *stream, struct string_stream *other) { const char *other_content; + int ret; other_content = string_stream_get_string(other); if (!other_content) return -ENOMEM; - return string_stream_add(stream, other_content); + ret = string_stream_add(stream, other_content); + kfree(other_content); + + return ret; } bool string_stream_is_empty(struct string_stream *stream) @@ -133,16 +154,15 @@ bool string_stream_is_empty(struct string_stream *stream) return list_empty(&stream->fragments); } -struct string_stream *alloc_string_stream(struct kunit *test, gfp_t gfp) +struct string_stream *alloc_string_stream(gfp_t gfp) { struct string_stream *stream; - stream = kunit_kzalloc(test, sizeof(*stream), gfp); + stream = kzalloc(sizeof(*stream), gfp); if (!stream) return ERR_PTR(-ENOMEM); stream->gfp = gfp; - stream->test = test; INIT_LIST_HEAD(&stream->fragments); spin_lock_init(&stream->lock); @@ -151,5 +171,37 @@ struct string_stream *alloc_string_stream(struct kunit *test, gfp_t gfp) void string_stream_destroy(struct string_stream *stream) { + KUNIT_STATIC_STUB_REDIRECT(string_stream_destroy, stream); + + if (!stream) + return; + string_stream_clear(stream); + kfree(stream); +} + +static void resource_free_string_stream(void *p) +{ + struct string_stream *stream = p; + + string_stream_destroy(stream); +} + +struct string_stream *kunit_alloc_string_stream(struct kunit *test, gfp_t gfp) +{ + struct string_stream *stream; + + stream = alloc_string_stream(gfp); + if (IS_ERR(stream)) + return stream; + + if (kunit_add_action_or_reset(test, resource_free_string_stream, stream) != 0) + return ERR_PTR(-ENOMEM); + + return stream; +} + +void kunit_free_string_stream(struct kunit *test, struct string_stream *stream) +{ + kunit_release_action(test, resource_free_string_stream, (void *)stream); } diff --git a/lib/kunit/string-stream.h b/lib/kunit/string-stream.h index b669f9a75a94..7be2450c7079 100644 --- a/lib/kunit/string-stream.h +++ b/lib/kunit/string-stream.h @@ -23,13 +23,17 @@ struct string_stream { struct list_head fragments; /* length and fragments are protected by this lock */ spinlock_t lock; - struct kunit *test; gfp_t gfp; + bool append_newlines; }; struct kunit; -struct string_stream *alloc_string_stream(struct kunit *test, gfp_t gfp); +struct string_stream *kunit_alloc_string_stream(struct kunit *test, gfp_t gfp); +void kunit_free_string_stream(struct kunit *test, struct string_stream *stream); + +struct string_stream *alloc_string_stream(gfp_t gfp); +void free_string_stream(struct string_stream *stream); int __printf(2, 3) string_stream_add(struct string_stream *stream, const char *fmt, ...); @@ -38,6 +42,8 @@ int __printf(2, 0) string_stream_vadd(struct string_stream *stream, const char *fmt, va_list args); +void string_stream_clear(struct string_stream *stream); + char *string_stream_get_string(struct string_stream *stream); int string_stream_append(struct string_stream *stream, @@ -47,4 +53,10 @@ bool string_stream_is_empty(struct string_stream *stream); void string_stream_destroy(struct string_stream *stream); +static inline void string_stream_set_append_newlines(struct string_stream *stream, + bool append_newlines) +{ + stream->append_newlines = append_newlines; +} + #endif /* _KUNIT_STRING_STREAM_H */ diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 421f13981412..f2eb71f1a66c 100644 --- 
a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -109,51 +109,17 @@ static void kunit_print_test_stats(struct kunit *test, stats.total); } -/** - * kunit_log_newline() - Add newline to the end of log if one is not - * already present. - * @log: The log to add the newline to. - */ -static void kunit_log_newline(char *log) -{ - int log_len, len_left; - - log_len = strlen(log); - len_left = KUNIT_LOG_SIZE - log_len - 1; - - if (log_len > 0 && log[log_len - 1] != '\n') - strncat(log, "\n", len_left); -} - -/* - * Append formatted message to log, size of which is limited to - * KUNIT_LOG_SIZE bytes (including null terminating byte). - */ -void kunit_log_append(char *log, const char *fmt, ...) +/* Append formatted message to log. */ +void kunit_log_append(struct string_stream *log, const char *fmt, ...) { va_list args; - int len, log_len, len_left; if (!log) return; - log_len = strlen(log); - len_left = KUNIT_LOG_SIZE - log_len - 1; - if (len_left <= 0) - return; - - /* Evaluate length of line to add to log */ va_start(args, fmt); - len = vsnprintf(NULL, 0, fmt, args) + 1; + string_stream_vadd(log, fmt, args); va_end(args); - - /* Print formatted line to the log */ - va_start(args, fmt); - vsnprintf(log + log_len, min(len, len_left), fmt, args); - va_end(args); - - /* Add newline to end of log if not already present. */ - kunit_log_newline(log); } EXPORT_SYMBOL_GPL(kunit_log_append); @@ -296,7 +262,7 @@ static void kunit_print_string_stream(struct kunit *test, kunit_err(test, "\n"); } else { kunit_err(test, "%s", buf); - kunit_kfree(test, buf); + kfree(buf); } } @@ -308,7 +274,7 @@ static void kunit_fail(struct kunit *test, const struct kunit_loc *loc, kunit_set_failure(test); - stream = alloc_string_stream(test, GFP_KERNEL); + stream = kunit_alloc_string_stream(test, GFP_KERNEL); if (IS_ERR(stream)) { WARN(true, "Could not allocate stream to print failed assertion in %s:%d\n", @@ -322,7 +288,7 @@ static void kunit_fail(struct kunit *test, const struct kunit_loc *loc, kunit_print_string_stream(test, stream); - string_stream_destroy(stream); + kunit_free_string_stream(test, stream); } void __noreturn __kunit_abort(struct kunit *test) @@ -359,14 +325,14 @@ void __kunit_do_failed_assertion(struct kunit *test, } EXPORT_SYMBOL_GPL(__kunit_do_failed_assertion); -void kunit_init_test(struct kunit *test, const char *name, char *log) +void kunit_init_test(struct kunit *test, const char *name, struct string_stream *log) { spin_lock_init(&test->lock); INIT_LIST_HEAD(&test->resources); test->name = name; test->log = log; if (test->log) - test->log[0] = '\0'; + string_stream_clear(log); test->status = KUNIT_SUCCESS; test->status_comment[0] = '\0'; } @@ -648,12 +614,14 @@ int kunit_run_tests(struct kunit_suite *suite) param_desc, test.status_comment); + kunit_update_stats(¶m_stats, test.status); + /* Get next param. */ param_desc[0] = '\0'; test.param_value = test_case->generate_params(test.param_value, param_desc); test.param_index++; - - kunit_update_stats(¶m_stats, test.status); + test.status = KUNIT_SUCCESS; + test.status_comment[0] = '\0'; } } diff --git a/lib/llist.c b/lib/llist.c index 6e668fa5a2c6..f21d0cfbbaaa 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -66,6 +66,34 @@ struct llist_node *llist_del_first(struct llist_head *head) EXPORT_SYMBOL_GPL(llist_del_first); /** + * llist_del_first_this - delete given entry of lock-less list if it is first + * @head: the head for your lock-less list + * @this: a list entry. + * + * If head of the list is given entry, delete and return %true else + * return %false. 
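+ *
+ * Illustrative use only (the waiter struct and handle_first() are not
+ * part of this patch): a thread that queued "me" can try to take its
+ * own entry back off the list, falling back to other work when some
+ * other entry is currently first:
+ *
+ *	if (llist_del_first_this(&head, &me->node))
+ *		handle_first(me);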
+ *
+ * Multiple callers can safely call this concurrently with multiple
+ * llist_add() callers, provided all the callers offer a different @this.
+ */
+bool llist_del_first_this(struct llist_head *head,
+			  struct llist_node *this)
+{
+	struct llist_node *entry, *next;
+
+	/* acquire ensures ordering wrt try_cmpxchg() in llist_del_first() */
+	entry = smp_load_acquire(&head->first);
+	do {
+		if (entry != this)
+			return false;
+		next = READ_ONCE(entry->next);
+	} while (!try_cmpxchg(&head->first, &entry, next));
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(llist_del_first_this);
+
+/**
 * llist_reverse_order - reverse order of a llist chain
 * @head:	first item of the list to be reversed
 *
diff --git a/lib/lwq.c b/lib/lwq.c
new file mode 100644
index 000000000000..57d080a4d53d
--- /dev/null
+++ b/lib/lwq.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Light-weight singly-linked queue.
+ *
+ * Entries are enqueued to the head of an llist, with no blocking.
+ * This can happen in any context.
+ *
+ * Entries are dequeued using a spinlock to protect against multiple
+ * access.  The llist is staged in reverse order, and refreshed
+ * from the llist when it is exhausted.
+ *
+ * This is particularly suitable when work items are queued in BH or
+ * IRQ context, and where work items are handled one at a time by
+ * dedicated threads.
+ */
+#include <linux/rcupdate.h>
+#include <linux/lwq.h>
+
+struct llist_node *__lwq_dequeue(struct lwq *q)
+{
+	struct llist_node *this;
+
+	if (lwq_empty(q))
+		return NULL;
+	spin_lock(&q->lock);
+	this = q->ready;
+	if (!this && !llist_empty(&q->new)) {
+		/* ensure queue doesn't appear transiently lwq_empty */
+		smp_store_release(&q->ready, (void *)1);
+		this = llist_reverse_order(llist_del_all(&q->new));
+		if (!this)
+			q->ready = NULL;
+	}
+	if (this)
+		q->ready = llist_next(this);
+	spin_unlock(&q->lock);
+	return this;
+}
+EXPORT_SYMBOL_GPL(__lwq_dequeue);
+
+/**
+ * lwq_dequeue_all - dequeue all currently enqueued objects
+ * @q:	the queue to dequeue from
+ *
+ * Remove and return a linked list of llist_nodes of all the objects that were
+ * in the queue. The first on the list will be the object that was least
+ * recently enqueued.
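+ *
+ * A draining sketch (assuming, as in the self-test below, that entries
+ * embed a struct lwq_node called "n"; process() is a placeholder):
+ *
+ *	struct llist_node *list = lwq_dequeue_all(q);
+ *	struct llist_node **np, *next;
+ *	struct tnode *t;
+ *
+ *	lwq_for_each_safe(t, np, next, &list, n)
+ *		process(t);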
+ */ +struct llist_node *lwq_dequeue_all(struct lwq *q) +{ + struct llist_node *r, *t, **ep; + + if (lwq_empty(q)) + return NULL; + + spin_lock(&q->lock); + r = q->ready; + q->ready = NULL; + t = llist_del_all(&q->new); + spin_unlock(&q->lock); + ep = &r; + while (*ep) + ep = &(*ep)->next; + *ep = llist_reverse_order(t); + return r; +} +EXPORT_SYMBOL_GPL(lwq_dequeue_all); + +#if IS_ENABLED(CONFIG_LWQ_TEST) + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/wait_bit.h> +#include <linux/kthread.h> +#include <linux/delay.h> +struct tnode { + struct lwq_node n; + int i; + int c; +}; + +static int lwq_exercise(void *qv) +{ + struct lwq *q = qv; + int cnt; + struct tnode *t; + + for (cnt = 0; cnt < 10000; cnt++) { + wait_var_event(q, (t = lwq_dequeue(q, struct tnode, n)) != NULL); + t->c++; + if (lwq_enqueue(&t->n, q)) + wake_up_var(q); + } + while (!kthread_should_stop()) + schedule_timeout_idle(1); + return 0; +} + +static int lwq_test(void) +{ + int i; + struct lwq q; + struct llist_node *l, **t1, *t2; + struct tnode *t; + struct task_struct *threads[8]; + + printk(KERN_INFO "testing lwq....\n"); + lwq_init(&q); + printk(KERN_INFO " lwq: run some threads\n"); + for (i = 0; i < ARRAY_SIZE(threads); i++) + threads[i] = kthread_run(lwq_exercise, &q, "lwq-test-%d", i); + for (i = 0; i < 100; i++) { + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + break; + t->i = i; + t->c = 0; + if (lwq_enqueue(&t->n, &q)) + wake_up_var(&q); + } + /* wait for threads to exit */ + for (i = 0; i < ARRAY_SIZE(threads); i++) + if (!IS_ERR_OR_NULL(threads[i])) + kthread_stop(threads[i]); + printk(KERN_INFO " lwq: dequeue first 50:"); + for (i = 0; i < 50 ; i++) { + if (i && (i % 10) == 0) { + printk(KERN_CONT "\n"); + printk(KERN_INFO " lwq: ... "); + } + t = lwq_dequeue(&q, struct tnode, n); + if (t) + printk(KERN_CONT " %d(%d)", t->i, t->c); + kfree(t); + } + printk(KERN_CONT "\n"); + l = lwq_dequeue_all(&q); + printk(KERN_INFO " lwq: delete the multiples of 3 (test lwq_for_each_safe())\n"); + lwq_for_each_safe(t, t1, t2, &l, n) { + if ((t->i % 3) == 0) { + t->i = -1; + kfree(t); + t = NULL; + } + } + if (l) + lwq_enqueue_batch(l, &q); + printk(KERN_INFO " lwq: dequeue remaining:"); + while ((t = lwq_dequeue(&q, struct tnode, n)) != NULL) { + printk(KERN_CONT " %d", t->i); + kfree(t); + } + printk(KERN_CONT "\n"); + return 0; +} + +module_init(lwq_test); +#endif /* CONFIG_LWQ_TEST*/ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0e00a84e8e8f..bb24d84a4922 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5627,7 +5627,7 @@ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) /* Internal nodes */ nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); /* Add working room for split (2 nodes) + new parents */ - mas_node_count(mas, nr_nodes + 3); + mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); /* Detect if allocations run out */ mas->mas_flags |= MA_STATE_PREALLOC; diff --git a/lib/nlattr.c b/lib/nlattr.c index 7a2b6c38fd59..dc15e7888fc1 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -134,6 +134,7 @@ void nla_get_range_unsigned(const struct nla_policy *pt, range->max = U32_MAX; break; case NLA_U64: + case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; @@ -183,6 +184,9 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, case NLA_U64: value = nla_get_u64(nla); break; + case NLA_UINT: + value = nla_get_uint(nla); + break; case NLA_MSECS: value = nla_get_u64(nla); break; @@ -248,6 +252,7 @@ void nla_get_range_signed(const struct nla_policy 
*pt, range->max = S32_MAX; break; case NLA_S64: + case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; @@ -295,6 +300,9 @@ static int nla_validate_int_range_signed(const struct nla_policy *pt, case NLA_S64: value = nla_get_s64(nla); break; + case NLA_SINT: + value = nla_get_sint(nla); + break; default: return -EINVAL; } @@ -320,6 +328,7 @@ static int nla_validate_int_range(const struct nla_policy *pt, case NLA_U16: case NLA_U32: case NLA_U64: + case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: @@ -329,6 +338,7 @@ static int nla_validate_int_range(const struct nla_policy *pt, case NLA_S16: case NLA_S32: case NLA_S64: + case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); @@ -355,6 +365,9 @@ static int nla_validate_mask(const struct nla_policy *pt, case NLA_U64: value = nla_get_u64(nla); break; + case NLA_UINT: + value = nla_get_uint(nla); + break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; @@ -433,6 +446,15 @@ static int validate_nla(const struct nlattr *nla, int maxtype, goto out_err; break; + case NLA_SINT: + case NLA_UINT: + if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "invalid attribute length"); + return -EINVAL; + } + break; + case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; diff --git a/lib/objpool.c b/lib/objpool.c new file mode 100644 index 000000000000..ce0087f64400 --- /dev/null +++ b/lib/objpool.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/objpool.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/atomic.h> +#include <linux/irqflags.h> +#include <linux/cpumask.h> +#include <linux/log2.h> + +/* + * objpool: ring-array based lockless MPMC/FIFO queues + * + * Copyright: wuqiang.matt@bytedance.com,mhiramat@kernel.org + */ + +/* initialize percpu objpool_slot */ +static int +objpool_init_percpu_slot(struct objpool_head *pool, + struct objpool_slot *slot, + int nodes, void *context, + objpool_init_obj_cb objinit) +{ + void *obj = (void *)&slot->entries[pool->capacity]; + int i; + + /* initialize elements of percpu objpool_slot */ + slot->mask = pool->capacity - 1; + + for (i = 0; i < nodes; i++) { + if (objinit) { + int rc = objinit(obj, context); + if (rc) + return rc; + } + slot->entries[slot->tail & slot->mask] = obj; + obj = obj + pool->obj_size; + slot->tail++; + slot->last = slot->tail; + pool->nr_objs++; + } + + return 0; +} + +/* allocate and initialize percpu slots */ +static int +objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, + void *context, objpool_init_obj_cb objinit) +{ + int i, cpu_count = 0; + + for (i = 0; i < pool->nr_cpus; i++) { + + struct objpool_slot *slot; + int nodes, size, rc; + + /* skip the cpu node which could never be present */ + if (!cpu_possible(i)) + continue; + + /* compute how many objects to be allocated with this slot */ + nodes = nr_objs / num_possible_cpus(); + if (cpu_count < (nr_objs % num_possible_cpus())) + nodes++; + cpu_count++; + + size = struct_size(slot, entries, pool->capacity) + + pool->obj_size * nodes; + + /* + * here we allocate percpu-slot & objs together in a single + * allocation to make it more compact, taking advantage of + * warm caches and TLB hits. in default vmalloc is used to + * reduce the pressure of kernel slab system. 
note that the
+		 * minimal size of a vmalloc allocation is one page, since
+		 * vmalloc always aligns the requested size to page size.
+		 */
+		if (pool->gfp & GFP_ATOMIC)
+			slot = kmalloc_node(size, pool->gfp, cpu_to_node(i));
+		else
+			slot = __vmalloc_node(size, sizeof(void *), pool->gfp,
+				cpu_to_node(i), __builtin_return_address(0));
+		if (!slot)
+			return -ENOMEM;
+		memset(slot, 0, size);
+		pool->cpu_slots[i] = slot;
+
+		/* initialize the objpool_slot of cpu node i */
+		rc = objpool_init_percpu_slot(pool, slot, nodes, context, objinit);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+/* cleanup all percpu slots of the object pool */
+static void objpool_fini_percpu_slots(struct objpool_head *pool)
+{
+	int i;
+
+	if (!pool->cpu_slots)
+		return;
+
+	for (i = 0; i < pool->nr_cpus; i++)
+		kvfree(pool->cpu_slots[i]);
+	kfree(pool->cpu_slots);
+}
+
+/* initialize object pool and pre-allocate objects */
+int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
+		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
+		 objpool_fini_cb release)
+{
+	int rc, capacity, slot_size;
+
+	/* check input parameters */
+	if (nr_objs <= 0 || nr_objs > OBJPOOL_NR_OBJECT_MAX ||
+	    object_size <= 0 || object_size > OBJPOOL_OBJECT_SIZE_MAX)
+		return -EINVAL;
+
+	/* align up to unsigned long size */
+	object_size = ALIGN(object_size, sizeof(long));
+
+	/* calculate capacity of percpu objpool_slot */
+	capacity = roundup_pow_of_two(nr_objs);
+	if (!capacity)
+		return -EINVAL;
+
+	/* initialize objpool pool */
+	memset(pool, 0, sizeof(struct objpool_head));
+	pool->nr_cpus = nr_cpu_ids;
+	pool->obj_size = object_size;
+	pool->capacity = capacity;
+	pool->gfp = gfp & ~__GFP_ZERO;
+	pool->context = context;
+	pool->release = release;
+	slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
+	pool->cpu_slots = kzalloc(slot_size, pool->gfp);
+	if (!pool->cpu_slots)
+		return -ENOMEM;
+
+	/* initialize per-cpu slots */
+	rc = objpool_init_percpu_slots(pool, nr_objs, context, objinit);
+	if (rc)
+		objpool_fini_percpu_slots(pool);
+	else
+		refcount_set(&pool->ref, pool->nr_objs + 1);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(objpool_init);
+
+/* add object to a slot; slots are sized to hold all objects, so they can never be full */
+static inline int
+objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	uint32_t head, tail;
+
+	/* loading tail and head as a local snapshot, tail first */
+	tail = READ_ONCE(slot->tail);
+
+	do {
+		head = READ_ONCE(slot->head);
+		/* fault caught: something must be wrong */
+		WARN_ON_ONCE(tail - head > pool->nr_objs);
+	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
+
+	/* now the tail position is reserved for the given obj */
+	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
+	/* update sequence to make this obj available for pop() */
+	smp_store_release(&slot->last, tail + 1);
+
+	return 0;
+}
+
+/* reclaim an object to the object pool */
+int objpool_push(void *obj, struct objpool_head *pool)
+{
+	unsigned long flags;
+	int rc;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+	rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
+	raw_local_irq_restore(flags);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(objpool_push);
+
+/* try to retrieve object from slot */
+static inline void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	/* load head snapshot, other cpus may change it */
+	uint32_t head =
smp_load_acquire(&slot->head);
+
+	while (head != READ_ONCE(slot->last)) {
+		void *obj;
+
+		/* obj must be retrieved before moving head forward */
+		obj = READ_ONCE(slot->entries[head & slot->mask]);
+
+		/* move head forward to mark the entry as consumed */
+		if (try_cmpxchg_release(&slot->head, &head, head + 1))
+			return obj;
+	}
+
+	return NULL;
+}
+
+/* allocate an object from object pool */
+void *objpool_pop(struct objpool_head *pool)
+{
+	void *obj = NULL;
+	unsigned long flags;
+	int i, cpu;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+
+	cpu = raw_smp_processor_id();
+	for (i = 0; i < num_possible_cpus(); i++) {
+		obj = objpool_try_get_slot(pool, cpu);
+		if (obj)
+			break;
+		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
+	}
+	raw_local_irq_restore(flags);
+
+	return obj;
+}
+EXPORT_SYMBOL_GPL(objpool_pop);
+
+/* forcibly release the whole objpool */
+void objpool_free(struct objpool_head *pool)
+{
+	if (!pool->cpu_slots)
+		return;
+
+	/* release percpu slots */
+	objpool_fini_percpu_slots(pool);
+
+	/* call user's cleanup callback if provided */
+	if (pool->release)
+		pool->release(pool, pool->context);
+}
+EXPORT_SYMBOL_GPL(objpool_free);
+
+/* drop the allocated object rather than reclaiming it to the objpool */
+int objpool_drop(void *obj, struct objpool_head *pool)
+{
+	if (!obj || !pool)
+		return -EINVAL;
+
+	if (refcount_dec_and_test(&pool->ref)) {
+		objpool_free(pool);
+		return 0;
+	}
+
+	return -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(objpool_drop);
+
+/* drop unused objects and deref the objpool so it can be released */
+void objpool_fini(struct objpool_head *pool)
+{
+	int count = 1; /* extra ref for objpool itself */
+
+	/* drop all remaining objects from the objpool */
+	while (objpool_pop(pool))
+		count++;
+
+	if (refcount_sub_and_test(count, &pool->ref))
+		objpool_free(pool);
+}
+EXPORT_SYMBOL_GPL(objpool_fini);
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 9073430dc865..44dd133594d4
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -278,6 +278,85 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
 }
 EXPORT_SYMBOL(__percpu_counter_compare);
 
+/*
+ * Compare counter, and add amount if total is: less than or equal to limit if
+ * amount is positive, or greater than or equal to limit if amount is negative.
+ * Return true if amount is added, or false if total would be beyond the limit.
+ *
+ * Negative limit is allowed, but unusual.
+ * When negative amounts (subs) are given to percpu_counter_limited_add(),
+ * the limit would most naturally be 0 - but other limits are also allowed.
+ *
+ * Overflow beyond S64_MAX is not allowed: counter, limit and amount
+ * are all assumed to be sane (far from S64_MIN and S64_MAX).
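+ *
+ * Sketch of the intended use, via the percpu_counter_limited_add()
+ * wrapper named above, which supplies the default batch (the field
+ * names are illustrative, not part of this patch):
+ *
+ *	if (!percpu_counter_limited_add(&sbinfo->used_blocks,
+ *					sbinfo->max_blocks, pages))
+ *		return -ENOSPC;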
+ */ +bool __percpu_counter_limited_add(struct percpu_counter *fbc, + s64 limit, s64 amount, s32 batch) +{ + s64 count; + s64 unknown; + unsigned long flags; + bool good = false; + + if (amount == 0) + return true; + + local_irq_save(flags); + unknown = batch * num_online_cpus(); + count = __this_cpu_read(*fbc->counters); + + /* Skip taking the lock when safe */ + if (abs(count + amount) <= batch && + ((amount > 0 && fbc->count + unknown <= limit) || + (amount < 0 && fbc->count - unknown >= limit))) { + this_cpu_add(*fbc->counters, amount); + local_irq_restore(flags); + return true; + } + + raw_spin_lock(&fbc->lock); + count = fbc->count + amount; + + /* Skip percpu_counter_sum() when safe */ + if (amount > 0) { + if (count - unknown > limit) + goto out; + if (count + unknown <= limit) + good = true; + } else { + if (count + unknown < limit) + goto out; + if (count - unknown >= limit) + good = true; + } + + if (!good) { + s32 *pcount; + int cpu; + + for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { + pcount = per_cpu_ptr(fbc->counters, cpu); + count += *pcount; + } + if (amount > 0) { + if (count > limit) + goto out; + } else { + if (count < limit) + goto out; + } + good = true; + } + + count = __this_cpu_read(*fbc->counters); + fbc->count += count + amount; + __this_cpu_sub(*fbc->counters, count); +out: + raw_spin_unlock(&fbc->lock); + local_irq_restore(flags); + return good; +} + static int __init percpu_counter_startup(void) { int ret; diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 035b0a4db476..1c5420ff254e 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ - int8.o int16.o int32.o + int8.o raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \ @@ -55,7 +55,7 @@ endif quiet_cmd_unroll = UNROLL $@ cmd_unroll = $(AWK) -v N=$* -f $(srctree)/$(src)/unroll.awk < $< > $@ -targets += int1.c int2.c int4.c int8.c int16.c int32.c +targets += int1.c int2.c int4.c int8.c $(obj)/int%.c: $(src)/int.uc $(src)/unroll.awk FORCE $(call if_changed,unroll) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 0ec534faf019..cd2e88ee1f14 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -81,10 +81,6 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_lsx, #endif #endif -#if defined(__ia64__) - &raid6_intx32, - &raid6_intx16, -#endif &raid6_intx8, &raid6_intx4, &raid6_intx2, diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc index 558aeac9342a..1ba56c3fa482 100644 --- a/lib/raid6/int.uc +++ b/lib/raid6/int.uc @@ -42,13 +42,6 @@ typedef u32 unative_t; /* - * IA-64 wants insane amounts of unrolling. On other architectures that - * is just a waste of space. - */ -#if ($# <= 8) || defined(__ia64__) - - -/* * These sub-operations are separate inlines since they can sometimes be * specially optimized using architecture-specific hacks. */ @@ -152,5 +145,3 @@ const struct raid6_calls raid6_intx$# = { "int" NSTRING "x$#", 0 }; - -#endif diff --git a/lib/rcuref.c b/lib/rcuref.c index 5ec00a4a64d1..97f300eca927 100644 --- a/lib/rcuref.c +++ b/lib/rcuref.c @@ -248,7 +248,7 @@ bool rcuref_put_slowpath(rcuref_t *ref) * require a retry. If this fails the caller is not * allowed to deconstruct the object. 
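+	 * (atomic_try_cmpxchg_release() takes the expected value in "cnt"
+	 * and updates it in place on failure, which typically generates
+	 * better code than re-checking the return value of a plain cmpxchg.)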
*/ - if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) + if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD)) return false; /* diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 9982344cca34..7713f73e66b0 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -31,9 +31,11 @@ * giving the size in the required units. @buf should have room for * at least 9 bytes and will always be zero terminated. * + * Return value: number of characters of output that would have been written + * (which may be greater than len, if output was truncated). */ -void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, - char *buf, int len) +int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, + char *buf, int len) { static const char *const units_10[] = { "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" @@ -126,8 +128,8 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, else unit = units_str[units][i]; - snprintf(buf, len, "%u%s %s", (u32)size, - tmp, unit); + return snprintf(buf, len, "%u%s %s", (u32)size, + tmp, unit); } EXPORT_SYMBOL(string_get_size); diff --git a/lib/test_bpf.c b/lib/test_bpf.c index ecde4216201e..7916503e6a6a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -5111,6 +5111,104 @@ static struct bpf_test tests[] = { { }, { { 0, 0xffffffff } } }, + /* MOVSX32 */ + { + "ALU_MOVSX | BPF_B", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x00000000ffffffefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX32_REG(R1, R3, 8), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + { + "ALU_MOVSX | BPF_H", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x00000000ffffbeefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX32_REG(R1, R3, 16), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + { + "ALU_MOVSX | BPF_W", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x00000000deadbeefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX32_REG(R1, R3, 32), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + /* MOVSX64 REG */ + { + "ALU64_MOVSX | BPF_B", + .u.insns_int = { + BPF_LD_IMM64(R2, 0xffffffffffffffefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX64_REG(R1, R3, 8), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + { + "ALU64_MOVSX | BPF_H", + .u.insns_int = { + BPF_LD_IMM64(R2, 0xffffffffffffbeefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX64_REG(R1, R3, 16), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, + { + "ALU64_MOVSX | BPF_W", + .u.insns_int = { + BPF_LD_IMM64(R2, 0xffffffffdeadbeefLL), + BPF_LD_IMM64(R3, 0xdeadbeefdeadbeefLL), + BPF_MOVSX64_REG(R1, R3, 32), + BPF_JMP_REG(BPF_JEQ, R2, R1, 2), + BPF_MOV32_IMM(R0, 2), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1 } }, + }, /* BPF_ALU | BPF_ADD | BPF_X */ { "ALU_ADD_X: 1 + 2 = 3", @@ -6105,6 +6203,106 @@ static struct bpf_test tests[] = { { }, 
{ { 0, 2 } }, }, + /* BPF_ALU | BPF_DIV | BPF_X off=1 (SDIV) */ + { + "ALU_SDIV_X: -6 / 2 = -3", + .u.insns_int = { + BPF_LD_IMM64(R0, -6), + BPF_ALU32_IMM(BPF_MOV, R1, 2), + BPF_ALU32_REG_OFF(BPF_DIV, R0, R1, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -3 } }, + }, + /* BPF_ALU | BPF_DIV | BPF_K off=1 (SDIV) */ + { + "ALU_SDIV_K: -6 / 2 = -3", + .u.insns_int = { + BPF_LD_IMM64(R0, -6), + BPF_ALU32_IMM_OFF(BPF_DIV, R0, 2, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -3 } }, + }, + /* BPF_ALU64 | BPF_DIV | BPF_X off=1 (SDIV64) */ + { + "ALU64_SDIV_X: -6 / 2 = -3", + .u.insns_int = { + BPF_LD_IMM64(R0, -6), + BPF_ALU32_IMM(BPF_MOV, R1, 2), + BPF_ALU64_REG_OFF(BPF_DIV, R0, R1, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -3 } }, + }, + /* BPF_ALU64 | BPF_DIV | BPF_K off=1 (SDIV64) */ + { + "ALU64_SDIV_K: -6 / 2 = -3", + .u.insns_int = { + BPF_LD_IMM64(R0, -6), + BPF_ALU64_IMM_OFF(BPF_DIV, R0, 2, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -3 } }, + }, + /* BPF_ALU | BPF_MOD | BPF_X off=1 (SMOD) */ + { + "ALU_SMOD_X: -7 % 2 = -1", + .u.insns_int = { + BPF_LD_IMM64(R0, -7), + BPF_ALU32_IMM(BPF_MOV, R1, 2), + BPF_ALU32_REG_OFF(BPF_MOD, R0, R1, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -1 } }, + }, + /* BPF_ALU | BPF_MOD | BPF_K off=1 (SMOD) */ + { + "ALU_SMOD_K: -7 % 2 = -1", + .u.insns_int = { + BPF_LD_IMM64(R0, -7), + BPF_ALU32_IMM_OFF(BPF_MOD, R0, 2, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -1 } }, + }, + /* BPF_ALU64 | BPF_MOD | BPF_X off=1 (SMOD64) */ + { + "ALU64_SMOD_X: -7 % 2 = -1", + .u.insns_int = { + BPF_LD_IMM64(R0, -7), + BPF_ALU32_IMM(BPF_MOV, R1, 2), + BPF_ALU64_REG_OFF(BPF_MOD, R0, R1, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -1 } }, + }, + /* BPF_ALU64 | BPF_MOD | BPF_K off=1 (SMOD64) */ + { + "ALU64_SMOD_X: -7 % 2 = -1", + .u.insns_int = { + BPF_LD_IMM64(R0, -7), + BPF_ALU64_IMM_OFF(BPF_MOD, R0, 2, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, -1 } }, + }, /* BPF_ALU | BPF_AND | BPF_X */ { "ALU_AND_X: 3 & 2 = 2", @@ -7837,6 +8035,104 @@ static struct bpf_test tests[] = { { }, { { 0, (u32) (cpu_to_le64(0xfedcba9876543210ULL) >> 32) } }, }, + /* BSWAP */ + { + "BSWAP 16: 0x0123456789abcdef -> 0xefcd", + .u.insns_int = { + BPF_LD_IMM64(R0, 0x0123456789abcdefLL), + BPF_BSWAP(R0, 16), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xefcd } }, + }, + { + "BSWAP 32: 0x0123456789abcdef -> 0xefcdab89", + .u.insns_int = { + BPF_LD_IMM64(R0, 0x0123456789abcdefLL), + BPF_BSWAP(R0, 32), + BPF_ALU64_REG(BPF_MOV, R1, R0), + BPF_ALU64_IMM(BPF_RSH, R1, 32), + BPF_ALU32_REG(BPF_ADD, R0, R1), /* R1 = 0 */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xefcdab89 } }, + }, + { + "BSWAP 64: 0x0123456789abcdef -> 0x67452301", + .u.insns_int = { + BPF_LD_IMM64(R0, 0x0123456789abcdefLL), + BPF_BSWAP(R0, 64), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x67452301 } }, + }, + { + "BSWAP 64: 0x0123456789abcdef >> 32 -> 0xefcdab89", + .u.insns_int = { + BPF_LD_IMM64(R0, 0x0123456789abcdefLL), + BPF_BSWAP(R0, 64), + BPF_ALU64_IMM(BPF_RSH, R0, 32), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xefcdab89 } }, + }, + /* BSWAP, reversed */ + { + "BSWAP 16: 0xfedcba9876543210 -> 0x1032", + .u.insns_int = { + BPF_LD_IMM64(R0, 0xfedcba9876543210ULL), + BPF_BSWAP(R0, 16), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x1032 } }, + }, + { + "BSWAP 32: 0xfedcba9876543210 -> 0x10325476", + .u.insns_int = { + BPF_LD_IMM64(R0, 0xfedcba9876543210ULL), + BPF_BSWAP(R0, 32), + 
BPF_ALU64_REG(BPF_MOV, R1, R0), + BPF_ALU64_IMM(BPF_RSH, R1, 32), + BPF_ALU32_REG(BPF_ADD, R0, R1), /* R1 = 0 */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x10325476 } }, + }, + { + "BSWAP 64: 0xfedcba9876543210 -> 0x98badcfe", + .u.insns_int = { + BPF_LD_IMM64(R0, 0xfedcba9876543210ULL), + BPF_BSWAP(R0, 64), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x98badcfe } }, + }, + { + "BSWAP 64: 0xfedcba9876543210 >> 32 -> 0x10325476", + .u.insns_int = { + BPF_LD_IMM64(R0, 0xfedcba9876543210ULL), + BPF_BSWAP(R0, 64), + BPF_ALU64_IMM(BPF_RSH, R0, 32), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x10325476 } }, + }, /* BPF_LDX_MEM B/H/W/DW */ { "BPF_LDX_MEM | BPF_B, base", @@ -8228,6 +8524,67 @@ static struct bpf_test tests[] = { { { 32, 0 } }, .stack_depth = 0, }, + /* BPF_LDX_MEMSX B/H/W */ + { + "BPF_LDX_MEMSX | BPF_B", + .u.insns_int = { + BPF_LD_IMM64(R1, 0xdead0000000000f0ULL), + BPF_LD_IMM64(R2, 0xfffffffffffffff0ULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), +#ifdef __BIG_ENDIAN + BPF_LDX_MEMSX(BPF_B, R0, R10, -1), +#else + BPF_LDX_MEMSX(BPF_B, R0, R10, -8), +#endif + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, + { + "BPF_LDX_MEMSX | BPF_H", + .u.insns_int = { + BPF_LD_IMM64(R1, 0xdead00000000f123ULL), + BPF_LD_IMM64(R2, 0xfffffffffffff123ULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), +#ifdef __BIG_ENDIAN + BPF_LDX_MEMSX(BPF_H, R0, R10, -2), +#else + BPF_LDX_MEMSX(BPF_H, R0, R10, -8), +#endif + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, + { + "BPF_LDX_MEMSX | BPF_W", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x00000000deadbeefULL), + BPF_LD_IMM64(R2, 0xffffffffdeadbeefULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), +#ifdef __BIG_ENDIAN + BPF_LDX_MEMSX(BPF_W, R0, R10, -4), +#else + BPF_LDX_MEMSX(BPF_W, R0, R10, -8), +#endif + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, /* BPF_STX_MEM B/H/W/DW */ { "BPF_STX_MEM | BPF_B", @@ -9474,6 +9831,20 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP32 | BPF_JA */ + { + "JMP32_JA: Unconditional jump: if (true) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_JMP32_IMM(BPF_JA, 0, 1, 0), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSLT | BPF_K */ { "JMP_JSLT_K: Signed jump: if (-2 < -1) return 1", diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 06959165e2f9..464eeb90d5ad 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -9,6 +9,7 @@ #include <linux/maple_tree.h> #include <linux/module.h> +#include <linux/rwsem.h> #define MTREE_ALLOC_MAX 0x2000000000000Ul #define CONFIG_MAPLE_SEARCH @@ -1841,17 +1842,21 @@ static noinline void __init check_forking(struct maple_tree *mt) void *val; MA_STATE(mas, mt, 0, 0); MA_STATE(newmas, mt, 0, 0); + struct rw_semaphore newmt_lock; + + init_rwsem(&newmt_lock); for (i = 0; i <= nr_entries; i++) mtree_store_range(mt, i*10, i*10 + 5, xa_mk_value(i), GFP_KERNEL); mt_set_non_kernel(99999); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&newmt, &newmt_lock); newmas.tree = &newmt; mas_reset(&newmas); mas_reset(&mas); - mas_lock(&newmas); + 
down_write(&newmt_lock); mas.index = 0; mas.last = 0; if (mas_expected_entries(&newmas, nr_entries)) { @@ -1866,10 +1871,10 @@ static noinline void __init check_forking(struct maple_tree *mt) } rcu_read_unlock(); mas_destroy(&newmas); - mas_unlock(&newmas); mt_validate(&newmt); mt_set_non_kernel(0); - mtree_destroy(&newmt); + __mt_destroy(&newmt); + up_write(&newmt_lock); } static noinline void __init check_iteration(struct maple_tree *mt) @@ -1980,6 +1985,10 @@ static noinline void __init bench_forking(struct maple_tree *mt) void *val; MA_STATE(mas, mt, 0, 0); MA_STATE(newmas, mt, 0, 0); + struct rw_semaphore newmt_lock; + + init_rwsem(&newmt_lock); + mt_set_external_lock(&newmt, &newmt_lock); for (i = 0; i <= nr_entries; i++) mtree_store_range(mt, i*10, i*10 + 5, @@ -1994,7 +2003,7 @@ static noinline void __init bench_forking(struct maple_tree *mt) mas.index = 0; mas.last = 0; rcu_read_lock(); - mas_lock(&newmas); + down_write(&newmt_lock); if (mas_expected_entries(&newmas, nr_entries)) { printk("OOM!"); BUG_ON(1); @@ -2005,11 +2014,11 @@ static noinline void __init bench_forking(struct maple_tree *mt) mas_store(&newmas, val); } mas_destroy(&newmas); - mas_unlock(&newmas); rcu_read_unlock(); mt_validate(&newmt); mt_set_non_kernel(0); - mtree_destroy(&newmt); + __mt_destroy(&newmt); + up_write(&newmt_lock); } } #endif @@ -2616,6 +2625,10 @@ static noinline void __init check_dup_gaps(struct maple_tree *mt, void *tmp; MA_STATE(mas, mt, 0, 0); MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore newmt_lock; + + init_rwsem(&newmt_lock); + mt_set_external_lock(&newmt, &newmt_lock); if (!zero_start) i = 1; @@ -2625,9 +2638,9 @@ static noinline void __init check_dup_gaps(struct maple_tree *mt, mtree_store_range(mt, i*10, (i+1)*10 - gap, xa_mk_value(i), GFP_KERNEL); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); mt_set_non_kernel(99999); - mas_lock(&newmas); + down_write(&newmt_lock); ret = mas_expected_entries(&newmas, nr_entries); mt_set_non_kernel(0); MT_BUG_ON(mt, ret != 0); @@ -2640,9 +2653,9 @@ static noinline void __init check_dup_gaps(struct maple_tree *mt, } rcu_read_unlock(); mas_destroy(&newmas); - mas_unlock(&newmas); - mtree_destroy(&newmt); + __mt_destroy(&newmt); + up_write(&newmt_lock); } /* Duplicate many sizes of trees. 
Mainly to test expected entry values */ diff --git a/lib/test_objpool.c b/lib/test_objpool.c new file mode 100644 index 000000000000..a94078402138 --- /dev/null +++ b/lib/test_objpool.c @@ -0,0 +1,690 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test module for lockless object pool + * + * Copyright: wuqiang.matt@bytedance.com + */ + +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/completion.h> +#include <linux/kthread.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/delay.h> +#include <linux/hrtimer.h> +#include <linux/objpool.h> + +#define OT_NR_MAX_BULK (16) + +/* memory usage */ +struct ot_mem_stat { + atomic_long_t alloc; + atomic_long_t free; +}; + +/* object allocation results */ +struct ot_obj_stat { + unsigned long nhits; + unsigned long nmiss; +}; + +/* control & results per testcase */ +struct ot_data { + struct rw_semaphore start; + struct completion wait; + struct completion rcu; + atomic_t nthreads ____cacheline_aligned_in_smp; + atomic_t stop ____cacheline_aligned_in_smp; + struct ot_mem_stat kmalloc; + struct ot_mem_stat vmalloc; + struct ot_obj_stat objects; + u64 duration; +}; + +/* testcase */ +struct ot_test { + int async; /* synchronous or asynchronous */ + int mode; /* only mode 0 supported */ + int objsz; /* object size */ + int duration; /* ms */ + int delay; /* ms */ + int bulk_normal; + int bulk_irq; + unsigned long hrtimer; /* ms */ + const char *name; + struct ot_data data; +}; + +/* per-cpu worker */ +struct ot_item { + struct objpool_head *pool; /* pool head */ + struct ot_test *test; /* test parameters */ + + void (*worker)(struct ot_item *item, int irq); + + /* hrtimer control */ + ktime_t hrtcycle; + struct hrtimer hrtimer; + + int bulk[2]; /* for thread and irq */ + int delay; + u32 niters; + + /* summary per thread */ + struct ot_obj_stat stat[2]; /* thread and irq */ + u64 duration; +}; + +/* + * memory leakage checking + */ + +static void *ot_kzalloc(struct ot_test *test, long size) +{ + void *ptr = kzalloc(size, GFP_KERNEL); + + if (ptr) + atomic_long_add(size, &test->data.kmalloc.alloc); + return ptr; +} + +static void ot_kfree(struct ot_test *test, void *ptr, long size) +{ + if (!ptr) + return; + atomic_long_add(size, &test->data.kmalloc.free); + kfree(ptr); +} + +static void ot_mem_report(struct ot_test *test) +{ + long alloc, free; + + pr_info("memory allocation summary for %s\n", test->name); + + alloc = atomic_long_read(&test->data.kmalloc.alloc); + free = atomic_long_read(&test->data.kmalloc.free); + pr_info(" kmalloc: %lu - %lu = %lu\n", alloc, free, alloc - free); + + alloc = atomic_long_read(&test->data.vmalloc.alloc); + free = atomic_long_read(&test->data.vmalloc.free); + pr_info(" vmalloc: %lu - %lu = %lu\n", alloc, free, alloc - free); +} + +/* user object instance */ +struct ot_node { + void *owner; + unsigned long data; + unsigned long refs; + unsigned long payload[32]; +}; + +/* user objpool manager */ +struct ot_context { + struct objpool_head pool; /* objpool head */ + struct ot_test *test; /* test parameters */ + void *ptr; /* user pool buffer */ + unsigned long size; /* buffer size */ + struct rcu_head rcu; +}; + +static DEFINE_PER_CPU(struct ot_item, ot_pcup_items); + +static int ot_init_data(struct ot_data *data) +{ + memset(data, 0, sizeof(*data)); + init_rwsem(&data->start); + init_completion(&data->wait); + init_completion(&data->rcu); + atomic_set(&data->nthreads, 1); + + return 0; +} + +static int ot_init_node(void *nod, void *context) +{ + 
struct ot_context *sop = context; + struct ot_node *on = nod; + + on->owner = &sop->pool; + return 0; +} + +static enum hrtimer_restart ot_hrtimer_handler(struct hrtimer *hrt) +{ + struct ot_item *item = container_of(hrt, struct ot_item, hrtimer); + struct ot_test *test = item->test; + + if (atomic_read_acquire(&test->data.stop)) + return HRTIMER_NORESTART; + + /* do bulk-testings for objects pop/push */ + item->worker(item, 1); + + hrtimer_forward(hrt, hrt->base->get_time(), item->hrtcycle); + return HRTIMER_RESTART; +} + +static void ot_start_hrtimer(struct ot_item *item) +{ + if (!item->test->hrtimer) + return; + hrtimer_start(&item->hrtimer, item->hrtcycle, HRTIMER_MODE_REL); +} + +static void ot_stop_hrtimer(struct ot_item *item) +{ + if (!item->test->hrtimer) + return; + hrtimer_cancel(&item->hrtimer); +} + +static int ot_init_hrtimer(struct ot_item *item, unsigned long hrtimer) +{ + struct hrtimer *hrt = &item->hrtimer; + + if (!hrtimer) + return -ENOENT; + + item->hrtcycle = ktime_set(0, hrtimer * 1000000UL); + hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrt->function = ot_hrtimer_handler; + return 0; +} + +static int ot_init_cpu_item(struct ot_item *item, + struct ot_test *test, + struct objpool_head *pool, + void (*worker)(struct ot_item *, int)) +{ + memset(item, 0, sizeof(*item)); + item->pool = pool; + item->test = test; + item->worker = worker; + + item->bulk[0] = test->bulk_normal; + item->bulk[1] = test->bulk_irq; + item->delay = test->delay; + + /* initialize hrtimer */ + ot_init_hrtimer(item, item->test->hrtimer); + return 0; +} + +static int ot_thread_worker(void *arg) +{ + struct ot_item *item = arg; + struct ot_test *test = item->test; + ktime_t start; + + atomic_inc(&test->data.nthreads); + down_read(&test->data.start); + up_read(&test->data.start); + start = ktime_get(); + ot_start_hrtimer(item); + do { + if (atomic_read_acquire(&test->data.stop)) + break; + /* do bulk-testings for objects pop/push */ + item->worker(item, 0); + } while (!kthread_should_stop()); + ot_stop_hrtimer(item); + item->duration = (u64) ktime_us_delta(ktime_get(), start); + if (atomic_dec_and_test(&test->data.nthreads)) + complete(&test->data.wait); + + return 0; +} + +static void ot_perf_report(struct ot_test *test, u64 duration) +{ + struct ot_obj_stat total, normal = {0}, irq = {0}; + int cpu, nthreads = 0; + + pr_info("\n"); + pr_info("Testing summary for %s\n", test->name); + + for_each_possible_cpu(cpu) { + struct ot_item *item = per_cpu_ptr(&ot_pcup_items, cpu); + if (!item->duration) + continue; + normal.nhits += item->stat[0].nhits; + normal.nmiss += item->stat[0].nmiss; + irq.nhits += item->stat[1].nhits; + irq.nmiss += item->stat[1].nmiss; + pr_info("CPU: %d duration: %lluus\n", cpu, item->duration); + pr_info("\tthread:\t%16lu hits \t%16lu miss\n", + item->stat[0].nhits, item->stat[0].nmiss); + pr_info("\tirq: \t%16lu hits \t%16lu miss\n", + item->stat[1].nhits, item->stat[1].nmiss); + pr_info("\ttotal: \t%16lu hits \t%16lu miss\n", + item->stat[0].nhits + item->stat[1].nhits, + item->stat[0].nmiss + item->stat[1].nmiss); + nthreads++; + } + + total.nhits = normal.nhits + irq.nhits; + total.nmiss = normal.nmiss + irq.nmiss; + + pr_info("ALL: \tnthreads: %d duration: %lluus\n", nthreads, duration); + pr_info("SUM: \t%16lu hits \t%16lu miss\n", + total.nhits, total.nmiss); + + test->data.objects = total; + test->data.duration = duration; +} + +/* + * synchronous test cases for objpool manipulation + */ + +/* objpool manipulation for synchronous mode (percpu objpool) */ 
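+/*
+ * (Lifecycle exercised by these testcases, for orientation: objpool_init()
+ * preallocates every object and runs the objinit callback on each one;
+ * the workers then objpool_pop()/objpool_push() concurrently; finally
+ * objpool_fini() pops whatever is left and drops the references, which
+ * lets the user's release callback free the context.)
+ */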
+static struct ot_context *ot_init_sync_m0(struct ot_test *test)
+{
+	struct ot_context *sop = NULL;
+	int max = num_possible_cpus() << 3;
+	gfp_t gfp = GFP_KERNEL;
+
+	sop = (struct ot_context *)ot_kzalloc(test, sizeof(*sop));
+	if (!sop)
+		return NULL;
+	sop->test = test;
+	if (test->objsz < 512)
+		gfp = GFP_ATOMIC;
+
+	if (objpool_init(&sop->pool, max, test->objsz,
+			 gfp, sop, ot_init_node, NULL)) {
+		ot_kfree(test, sop, sizeof(*sop));
+		return NULL;
+	}
+	WARN_ON(max != sop->pool.nr_objs);
+
+	return sop;
+}
+
+static void ot_fini_sync(struct ot_context *sop)
+{
+	objpool_fini(&sop->pool);
+	ot_kfree(sop->test, sop, sizeof(*sop));
+}
+
+struct {
+	struct ot_context * (*init)(struct ot_test *oc);
+	void (*fini)(struct ot_context *sop);
+} g_ot_sync_ops[] = {
+	{.init = ot_init_sync_m0, .fini = ot_fini_sync},
+};
+
+/*
+ * synchronous test cases: performance mode
+ */
+
+static void ot_bulk_sync(struct ot_item *item, int irq)
+{
+	struct ot_node *nods[OT_NR_MAX_BULK];
+	int i;
+
+	for (i = 0; i < item->bulk[irq]; i++)
+		nods[i] = objpool_pop(item->pool);
+
+	if (!irq && (item->delay || !(++(item->niters) & 0x7FFF)))
+		msleep(item->delay);
+
+	while (i-- > 0) {
+		struct ot_node *on = nods[i];
+		if (on) {
+			on->refs++;
+			objpool_push(on, item->pool);
+			item->stat[irq].nhits++;
+		} else {
+			item->stat[irq].nmiss++;
+		}
+	}
+}
+
+static int ot_start_sync(struct ot_test *test)
+{
+	struct ot_context *sop;
+	ktime_t start;
+	u64 duration;
+	unsigned long timeout;
+	int cpu;
+
+	/* initialize objpool for the synchronous testcase */
+	sop = g_ot_sync_ops[test->mode].init(test);
+	if (!sop)
+		return -ENOMEM;
+
+	/* grab rwsem to block testing threads */
+	down_write(&test->data.start);
+
+	for_each_possible_cpu(cpu) {
+		struct ot_item *item = per_cpu_ptr(&ot_pcup_items, cpu);
+		struct task_struct *work;
+
+		ot_init_cpu_item(item, test, &sop->pool, ot_bulk_sync);
+
+		/* skip offline cpus */
+		if (!cpu_online(cpu))
+			continue;
+
+		work = kthread_create_on_node(ot_thread_worker, item,
+				cpu_to_node(cpu), "ot_worker_%d", cpu);
+		if (IS_ERR(work)) {
+			pr_err("failed to create thread for cpu %d\n", cpu);
+		} else {
+			kthread_bind(work, cpu);
+			wake_up_process(work);
+		}
+	}
+
+	/* wait a while to make sure all threads are waiting at the start line */
+	msleep(20);
+
+	/* in case no threads were created: memory insufficient?
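+	 * (data.nthreads starts at 1, so if no worker thread was ever
+	 * created, the dec_and_test below completes the wait on our behalf.)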
	 */
+	if (atomic_dec_and_test(&test->data.nthreads))
+		complete(&test->data.wait);
+
+	// sched_set_fifo_low(current);
+
+	/* start objpool testing threads */
+	start = ktime_get();
+	up_write(&test->data.start);
+
+	/* yield the cpu to worker threads for "duration" ms */
+	timeout = msecs_to_jiffies(test->duration);
+	schedule_timeout_interruptible(timeout);
+
+	/* tell worker threads to quit */
+	atomic_set_release(&test->data.stop, 1);
+
+	/* wait for all worker threads to finish and quit */
+	wait_for_completion(&test->data.wait);
+	duration = (u64) ktime_us_delta(ktime_get(), start);
+
+	/* cleanup objpool */
+	g_ot_sync_ops[test->mode].fini(sop);
+
+	/* report testing summary and performance results */
+	ot_perf_report(test, duration);
+
+	/* report memory allocation summary */
+	ot_mem_report(test);
+
+	return 0;
+}
+
+/*
+ * asynchronous test cases: pool lifecycle controlled by refcount
+ */
+
+static void ot_fini_async_rcu(struct rcu_head *rcu)
+{
+	struct ot_context *sop = container_of(rcu, struct ot_context, rcu);
+	struct ot_test *test = sop->test;
+
+	/* here all cpus are aware of the stop event: test->data.stop = 1 */
+	WARN_ON(!atomic_read_acquire(&test->data.stop));
+
+	objpool_fini(&sop->pool);
+	complete(&test->data.rcu);
+}
+
+static void ot_fini_async(struct ot_context *sop)
+{
+	/* make sure the stop event is acknowledged by all cores */
+	call_rcu(&sop->rcu, ot_fini_async_rcu);
+}
+
+static int ot_objpool_release(struct objpool_head *head, void *context)
+{
+	struct ot_context *sop = context;
+
+	WARN_ON(!head || !sop || head != &sop->pool);
+
+	/* do context cleaning if needed */
+	if (sop)
+		ot_kfree(sop->test, sop, sizeof(*sop));
+
+	return 0;
+}
+
+static struct ot_context *ot_init_async_m0(struct ot_test *test)
+{
+	struct ot_context *sop = NULL;
+	int max = num_possible_cpus() << 3;
+	gfp_t gfp = GFP_KERNEL;
+
+	sop = (struct ot_context *)ot_kzalloc(test, sizeof(*sop));
+	if (!sop)
+		return NULL;
+	sop->test = test;
+	if (test->objsz < 512)
+		gfp = GFP_ATOMIC;
+
+	if (objpool_init(&sop->pool, max, test->objsz, gfp, sop,
+			 ot_init_node, ot_objpool_release)) {
+		ot_kfree(test, sop, sizeof(*sop));
+		return NULL;
+	}
+	WARN_ON(max != sop->pool.nr_objs);
+
+	return sop;
+}
+
+struct {
+	struct ot_context * (*init)(struct ot_test *oc);
+	void (*fini)(struct ot_context *sop);
+} g_ot_async_ops[] = {
+	{.init = ot_init_async_m0, .fini = ot_fini_async},
+};
+
+static void ot_nod_recycle(struct ot_node *on, struct objpool_head *pool,
+			int release)
+{
+	struct ot_context *sop;
+
+	on->refs++;
+
+	if (!release) {
+		/* push object back to the objpool for reuse */
+		objpool_push(on, pool);
+		return;
+	}
+
+	sop = container_of(pool, struct ot_context, pool);
+	WARN_ON(sop != pool->context);
+
+	/* unref the objpool, with the node removed for good */
+	objpool_drop(on, pool);
+}
+
+static void ot_bulk_async(struct ot_item *item, int irq)
+{
+	struct ot_test *test = item->test;
+	struct ot_node *nods[OT_NR_MAX_BULK];
+	int i, stop;
+
+	for (i = 0; i < item->bulk[irq]; i++)
+		nods[i] = objpool_pop(item->pool);
+
+	if (!irq) {
+		if (item->delay || !(++(item->niters) & 0x7FFF))
+			msleep(item->delay);
+		get_cpu();
+	}
+
+	stop = atomic_read_acquire(&test->data.stop);
+
+	/* drop all objects and deref objpool */
+	while (i-- > 0) {
+		struct ot_node *on = nods[i];
+
+		if (on) {
+			on->refs++;
+			ot_nod_recycle(on, item->pool, stop);
+			item->stat[irq].nhits++;
+		} else {
+			item->stat[irq].nmiss++;
+		}
+	}
+
+	if (!irq)
+		put_cpu();
+}
+
+static int ot_start_async(struct ot_test *test)
+{
+	struct ot_context *sop;
+	ktime_t start;
+	u64 duration;
+	unsigned long timeout;
+	int cpu;
+
+	/* initialize objpool for asynchronous testcase */
+	sop = g_ot_async_ops[test->mode].init(test);
+	if (!sop)
+		return -ENOMEM;
+
+	/* grab rwsem to block testing threads */
+	down_write(&test->data.start);
+
+	for_each_possible_cpu(cpu) {
+		struct ot_item *item = per_cpu_ptr(&ot_pcup_items, cpu);
+		struct task_struct *work;
+
+		ot_init_cpu_item(item, test, &sop->pool, ot_bulk_async);
+
+		/* skip offline cpus */
+		if (!cpu_online(cpu))
+			continue;
+
+		work = kthread_create_on_node(ot_thread_worker, item,
+				cpu_to_node(cpu), "ot_worker_%d", cpu);
+		if (IS_ERR(work)) {
+			pr_err("failed to create thread for cpu %d\n", cpu);
+		} else {
+			kthread_bind(work, cpu);
+			wake_up_process(work);
+		}
+	}
+
+	/* wait a while to make sure all threads are waiting at the start line */
+	msleep(20);
+
+	/* in case no threads were created (insufficient memory?) */
+	if (atomic_dec_and_test(&test->data.nthreads))
+		complete(&test->data.wait);
+
+	/* start objpool testing threads */
+	start = ktime_get();
+	up_write(&test->data.start);
+
+	/* yield cpu to worker threads for the duration (ms) */
+	timeout = msecs_to_jiffies(test->duration);
+	schedule_timeout_interruptible(timeout);
+
+	/* tell worker threads to quit */
+	atomic_set_release(&test->data.stop, 1);
+
+	/* do async-finalization */
+	g_ot_async_ops[test->mode].fini(sop);
+
+	/* wait for all worker threads to finish and quit */
+	wait_for_completion(&test->data.wait);
+	duration = (u64) ktime_us_delta(ktime_get(), start);
+
+	/* make sure the rcu callback has been triggered */
+	wait_for_completion(&test->data.rcu);
+
+	/*
+	 * now we are sure that the objpool is finalized, either
+	 * by the rcu callback or by the worker threads
+	 */
+
+	/* report testing summary and performance results */
+	ot_perf_report(test, duration);
+
+	/* report memory allocation summary */
+	ot_mem_report(test);
+
+	return 0;
+}
+
+/*
+ * predefined testing cases:
+ * synchronous case / overrun case / async case
+ *
+ * async: synchronous or asynchronous testing
+ * mode: only mode 0 supported
+ * objsz: object size
+ * duration: int, total test time in ms
+ * delay: int, delay (in ms) between each iteration
+ * bulk_normal: int, repeat times for thread worker
+ * bulk_irq: int, repeat times for irq consumer
+ * hrtimer: unsigned long, hrtimer interval in ms
+ * name: char *, tag for current test ot_item
+ */
+
+#define NODE_COMPACT sizeof(struct ot_node)
+#define NODE_VMALLOC (512)
+
+struct ot_test g_testcases[] = {
+
+	/* sync & normal */
+	{0, 0, NODE_COMPACT, 1000, 0, 1, 0, 0, "sync: percpu objpool"},
+	{0, 0, NODE_VMALLOC, 1000, 0, 1, 0, 0, "sync: percpu objpool from vmalloc"},
+
+	/* sync & hrtimer */
+	{0, 0, NODE_COMPACT, 1000, 0, 1, 1, 4, "sync & hrtimer: percpu objpool"},
+	{0, 0, NODE_VMALLOC, 1000, 0, 1, 1, 4, "sync & hrtimer: percpu objpool from vmalloc"},
+
+	/* sync & overrun */
+	{0, 0, NODE_COMPACT, 1000, 0, 16, 0, 0, "sync overrun: percpu objpool"},
+	{0, 0, NODE_VMALLOC, 1000, 0, 16, 0, 0, "sync overrun: percpu objpool from vmalloc"},
+
+	/* async mode */
+	{1, 0, NODE_COMPACT, 1000, 100, 1, 0, 0, "async: percpu objpool"},
+	{1, 0, NODE_VMALLOC, 1000, 100, 1, 0, 0, "async: percpu objpool from vmalloc"},
+
+	/* async + hrtimer mode */
+	{1, 0, NODE_COMPACT, 1000, 0, 4, 4, 4, "async & hrtimer: percpu objpool"},
+	{1, 0, NODE_VMALLOC, 1000, 0, 4, 4, 4, "async & hrtimer: percpu objpool from vmalloc"},
+};
+
+static int __init ot_mod_init(void)
+{
+	int i;
+
+	/* run all predefined testcases */
+	for (i = 0; i < ARRAY_SIZE(g_testcases); i++) {
+		ot_init_data(&g_testcases[i].data);
+		if (g_testcases[i].async)
+			ot_start_async(&g_testcases[i]);
+		else
+			ot_start_sync(&g_testcases[i]);
+	}
+
+	/* show test summary */
+	pr_info("\n");
+	pr_info("Summary of testcases:\n");
+	for (i = 0; i < ARRAY_SIZE(g_testcases); i++) {
+		pr_info("    duration: %lluus \thits: %10lu \tmiss: %10lu \t%s\n",
+			g_testcases[i].data.duration, g_testcases[i].data.objects.nhits,
+			g_testcases[i].data.objects.nmiss, g_testcases[i].name);
+	}
+
+	/* fail the load on purpose so the test module never stays resident */
+	return -EAGAIN;
+}
+
+static void __exit ot_mod_exit(void)
+{
+}
+
+module_init(ot_mod_init);
+module_exit(ot_mod_exit);
+
+MODULE_LICENSE("GPL");
\ No newline at end of file
diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c
index 0a559a42359b..9308bcfb2ad5 100644
--- a/lib/ucs2_string.c
+++ b/lib/ucs2_string.c
@@ -32,6 +32,58 @@ ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength)
 }
 EXPORT_SYMBOL(ucs2_strsize);
 
+/**
+ * ucs2_strscpy() - Copy a UCS2 string into a sized buffer.
+ *
+ * @dst: Pointer to the destination buffer to copy the string to.
+ * @src: Pointer to the source buffer to copy the string from.
+ * @count: Size of the destination buffer, in UCS2 (16-bit) characters.
+ *
+ * Like strscpy(), only for UCS2 strings.
+ *
+ * Copy the source string @src, or as much of it as fits, into the destination
+ * buffer @dst. The behavior is undefined if the string buffers overlap. The
+ * destination buffer @dst is always NUL-terminated, unless it's zero-sized.
+ *
+ * Return: The number of characters copied into @dst (excluding the trailing
+ * %NUL terminator) or -E2BIG if @count is 0 or @src was truncated due to the
+ * destination buffer being too small.
+ */
+ssize_t ucs2_strscpy(ucs2_char_t *dst, const ucs2_char_t *src, size_t count)
+{
+	long res;
+
+	/*
+	 * Ensure that we have a valid amount of space. We need to store at
+	 * least one NUL-character.
+	 */
+	if (count == 0 || WARN_ON_ONCE(count > INT_MAX / sizeof(*dst)))
+		return -E2BIG;
+
+	/*
+	 * Copy at most 'count' characters, return early if we find a
+	 * NUL-terminator.
+	 */
+	for (res = 0; res < count; res++) {
+		ucs2_char_t c;
+
+		c = src[res];
+		dst[res] = c;
+
+		if (!c)
+			return res;
+	}
+
+	/*
+	 * The loop above terminated without finding a NUL-terminator,
+	 * exceeding the 'count': Enforce proper NUL-termination and return
+	 * an error.
+	 */
+	dst[count - 1] = 0;
+	return -E2BIG;
+}
+EXPORT_SYMBOL(ucs2_strscpy);
+
 int ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len)
 {
diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig
index adce22ac18d6..aef086a6bf2f 100644
--- a/lib/xz/Kconfig
+++ b/lib/xz/Kconfig
@@ -19,11 +19,6 @@ config XZ_DEC_POWERPC
 	default y
 	select XZ_DEC_BCJ
 
-config XZ_DEC_IA64
-	bool "IA-64 BCJ filter decoder" if EXPERT
-	default y
-	select XZ_DEC_BCJ
-
 config XZ_DEC_ARM
 	bool "ARM BCJ filter decoder" if EXPERT
 	default y
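
For readers unfamiliar with the objpool API that test_objpool exercises above, here is a minimal sketch of the init/pop/push/drop/fini lifecycle. The objpool_* calls are the real <linux/objpool.h> interface added alongside this test; everything prefixed demo_, the example payload struct, and the fixed pool size of 16 are illustrative assumptions, not part of the patch.

/*
 * Hedged sketch of the objpool lifecycle (not part of the patch).
 * All demo_* identifiers are hypothetical.
 */
#include <linux/module.h>
#include <linux/objpool.h>

struct demo_obj {
	int id;				/* example payload */
};

static struct objpool_head demo_pool;

static int __init demo_init(void)
{
	struct demo_obj *obj;

	/* preallocate 16 objects; no context, objinit or release callback */
	if (objpool_init(&demo_pool, 16, sizeof(struct demo_obj),
			 GFP_KERNEL, NULL, NULL, NULL))
		return -ENOMEM;

	obj = objpool_pop(&demo_pool);		/* lock-free; NULL when empty */
	if (obj) {
		obj->id = 42;
		objpool_push(obj, &demo_pool);	/* recycle for reuse, or use */
		/* objpool_drop(obj, &demo_pool) to retire the object for good */
	}

	objpool_fini(&demo_pool);		/* drop the pool's own reference */
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

This mirrors the pattern in ot_nod_recycle() above: push returns an object to the pool for reuse, while drop releases its reference so the pool can free itself once the last object is gone, which is why the async testcases can finalize the pool from an RCU callback while workers are still draining it.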
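
Similarly, a short sketch of the ucs2_strscpy() contract added to lib/ucs2_string.c above: the destination is always NUL-terminated when non-empty, and both truncation and a zero-sized buffer report -E2BIG. The buffer names and sizes here are illustrative only.

/* Hedged sketch of the ucs2_strscpy() contract (not part of the patch). */
#include <linux/kernel.h>
#include <linux/ucs2_string.h>

static void demo_ucs2_strscpy(void)
{
	static const ucs2_char_t src[] = { 'h', 'i', 0 };	/* UCS-2 "hi" */
	ucs2_char_t big[8], small[2];
	ssize_t n;

	n = ucs2_strscpy(big, src, ARRAY_SIZE(big));
	/* n == 2: two characters copied, NUL terminator excluded */

	n = ucs2_strscpy(small, src, ARRAY_SIZE(small));
	/*
	 * n == -E2BIG: "hi" plus its NUL needs 3 slots; small is still
	 * NUL-terminated (small[1] == 0) despite the truncation.
	 */
}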