Merge tag 'modules-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux

Pull module updates from Rusty Russell: "Main excitement here is Peter Zijlstra's lockless rbtree optimization to speed module address lookup. He found some abusers of the module lock doing that too. A little bit of parameter work here too; including Dan Streetman's breaking up the big param mutex so writing a parameter can load another module (yeah, really). Unfortunately that broke the usual suspects, !CONFIG_MODULES and !CONFIG_SYSFS, so those fixes were appended too" * tag 'modules-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (26 commits) modules: only use mod->param_lock if CONFIG_MODULES param: fix module param locks when !CONFIG_SYSFS. rcu: merge fix for Convert ACCESS_ONCE() to READ_ONCE() and WRITE_ONCE() module: add per-module param_lock module: make perm const params: suppress unused variable error, warn once just in case code changes. modules: clarify CONFIG_MODULE_COMPRESS help, suggest 'N'. kernel/module.c: avoid ifdefs for sig_enforce declaration kernel/workqueue.c: remove ifdefs over wq_power_efficient kernel/params.c: export param_ops_bool_enable_only kernel/params.c: generalize bool_enable_only kernel/module.c: use generic module param operaters for sig_enforce kernel/params: constify struct kernel_param_ops uses sysfs: tightened sysfs permission checks module: Rework module_addr_{min,max} module: Use __module_address() for module_address_lookup() module: Make the mod_tree stuff conditional on PERF_EVENTS || TRACING module: Optimize __module_address() using a latched RB-tree rbtree: Implement generic latch_tree seqlock: Introduce raw_read_seqcount_latch() ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-07-01 10:49:25 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-07-01 10:49:25 -0700
commit: 02201e3f1b46aed7c6348f406b7b40de80ba6de3 (patch)
tree: 2392c9098359725c195dd82a72b20ccedc1a1509 /include
parent: 0890a264794f33df540fbaf274699146903b4e6b (diff)
parent: 20bdc2cfdbc484777b30b96fcdbb8994038f3ce1 (diff)
9 files changed, 416 insertions, 107 deletions
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 26fc8bc77f85..7f8ad9593da7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -475,6 +475,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	(volatile typeof(x) *)&(x); })
 #define ACCESS_ONCE(x) (*__ACCESS_ONCE(x))
 
+/**
+ * lockless_dereference() - safely load a pointer for later dereference
+ * @p: The pointer to load
+ *
+ * Similar to rcu_dereference(), but for situations where the pointed-to
+ * object's lifetime is managed by something other than RCU.  That
+ * "something other" might be reference counting or simple immortality.
+ */
+#define lockless_dereference(p) \
+({ \
+	typeof(p) _________p1 = READ_ONCE(p); \
+	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
+	(_________p1); \
+})
+
 /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
 #ifdef CONFIG_KPROBES
 # define __kprobes	__attribute__((__section__(".kprobes.text")))
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5acf5b70866d..cfa9351c7536 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -813,13 +813,15 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #endif
 
 /* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */
-#define VERIFY_OCTAL_PERMISSIONS(perms)					\
-	(BUILD_BUG_ON_ZERO((perms) < 0) +				\
-	 BUILD_BUG_ON_ZERO((perms) > 0777) +				\
-	 /* User perms >= group perms >= other perms */			\
-	 BUILD_BUG_ON_ZERO(((perms) >> 6) < (((perms) >> 3) & 7)) +	\
-	 BUILD_BUG_ON_ZERO((((perms) >> 3) & 7) < ((perms) & 7)) +	\
-	 /* Other writable?  Generally considered a bad idea. */	\
-	 BUILD_BUG_ON_ZERO((perms) & 2) +				\
+#define VERIFY_OCTAL_PERMISSIONS(perms)						\
+	(BUILD_BUG_ON_ZERO((perms) < 0) +					\
+	 BUILD_BUG_ON_ZERO((perms) > 0777) +					\
+	 /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */		\
+	 BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) +	\
+	 BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) +		\
+	 /* USER_WRITABLE >= GROUP_WRITABLE */					\
+	 BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) +	\
+	 /* OTHER_WRITABLE?  Generally considered a bad idea. */		\
+	 BUILD_BUG_ON_ZERO((perms) & 2) +					\
 	 (perms))
 #endif
diff --git a/include/linux/module.h b/include/linux/module.h
index 7ffe0851d244..d67b1932cc59 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,6 +17,7 @@
 #include <linux/moduleparam.h>
 #include <linux/jump_label.h>
 #include <linux/export.h>
+#include <linux/rbtree_latch.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -210,6 +211,13 @@ enum module_state {
 	MODULE_STATE_UNFORMED,	/* Still setting it up. */
 };
 
+struct module;
+
+struct mod_tree_node {
+	struct module *mod;
+	struct latch_tree_node node;
+};
+
 struct module {
 	enum module_state state;
 
@@ -232,6 +240,9 @@ struct module {
 	unsigned int num_syms;
 
 	/* Kernel parameters. */
+#ifdef CONFIG_SYSFS
+	struct mutex param_lock;
+#endif
 	struct kernel_param *kp;
 	unsigned int num_kp;
 
@@ -271,8 +282,15 @@ struct module {
 	/* Startup function. */
 	int (*init)(void);
 
-	/* If this is non-NULL, vfree after init() returns */
-	void *module_init;
+	/*
+	 * If this is non-NULL, vfree() after init() returns.
+	 *
+	 * Cacheline align here, such that:
+	 *   module_init, module_core, init_size, core_size,
+	 *   init_text_size, core_text_size and mtn_core::{mod,node[0]}
+	 * are on the same cacheline.
+	 */
+	void *module_init	____cacheline_aligned;
 
 	/* Here is the actual code + data, vfree'd on unload. */
 	void *module_core;
@@ -283,6 +301,16 @@ struct module {
 	/* The size of the executable code in each section.  */
 	unsigned int init_text_size, core_text_size;
 
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+	/*
+	 * We want mtn_core::{mod,node[0]} to be in the same cacheline as the
+	 * above entries such that a regular lookup will only touch one
+	 * cacheline.
+	 */
+	struct mod_tree_node	mtn_core;
+	struct mod_tree_node	mtn_init;
+#endif
+
 	/* Size of RO sections of the module (text+rodata) */
 	unsigned int init_ro_size, core_ro_size;
 
@@ -369,7 +397,7 @@ struct module {
 	ctor_fn_t *ctors;
 	unsigned int num_ctors;
 #endif
-};
+} ____cacheline_aligned;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
 #endif
@@ -423,14 +451,22 @@ struct symsearch {
 	bool unused;
 };
 
-/* Search for an exported symbol by name. */
+/*
+ * Search for an exported symbol by name.
+ *
+ * Must be called with module_mutex held or preemption disabled.
+ */
 const struct kernel_symbol *find_symbol(const char *name,
 					struct module **owner,
 					const unsigned long **crc,
 					bool gplok,
 					bool warn);
 
-/* Walk the exported symbol table */
+/*
+ * Walk the exported symbol table
+ *
+ * Must be called with module_mutex held or preemption disabled.
+ */
 bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
 				    struct module *owner,
 				    void *data), void *data);
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 6480dcaca275..c12f2147c350 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -67,8 +67,9 @@ enum {
 
 struct kernel_param {
 	const char *name;
+	struct module *mod;
 	const struct kernel_param_ops *ops;
-	u16 perm;
+	const u16 perm;
 	s8 level;
 	u8 flags;
 	union {
@@ -108,7 +109,7 @@ struct kparam_array
  *
  * @perm is 0 if the the variable is not to appear in sysfs, or 0444
  * for world-readable, 0644 for root-writable, etc.  Note that if it
- * is writable, you may need to use kparam_block_sysfs_write() around
+ * is writable, you may need to use kernel_param_lock() around
  * accesses (esp. charp, which can be kfreed when it changes).
  *
  * The @type is simply pasted to refer to a param_ops_##type and a
@@ -216,16 +217,16 @@ struct kparam_array
    parameters. */
 #define __module_param_call(prefix, name, ops, arg, perm, level, flags)	\
 	/* Default value instead of permissions? */			\
-	static const char __param_str_##name[] = prefix #name; \
+	static const char __param_str_##name[] = prefix #name;		\
 	static struct kernel_param __moduleparam_const __param_##name	\
 	__used								\
     __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \
-	= { __param_str_##name, ops, VERIFY_OCTAL_PERMISSIONS(perm),	\
-	    level, flags, { arg } }
+	= { __param_str_##name, THIS_MODULE, ops,			\
+	    VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } }
 
 /* Obsolete - use module_param_cb() */
 #define module_param_call(name, set, get, arg, perm)			\
-	static struct kernel_param_ops __param_ops_##name =		\
+	static const struct kernel_param_ops __param_ops_##name =		\
 		{ .flags = 0, (void *)set, (void *)get };		\
 	__module_param_call(MODULE_PARAM_PREFIX,			\
 			    name, &__param_ops_##name, arg,		\
@@ -238,58 +239,14 @@ __check_old_set_param(int (*oldset)(const char *, struct kernel_param *))
 	return 0;
 }
 
-/**
- * kparam_block_sysfs_write - make sure a parameter isn't written via sysfs.
- * @name: the name of the parameter
- *
- * There's no point blocking write on a paramter that isn't writable via sysfs!
- */
-#define kparam_block_sysfs_write(name)			\
-	do {						\
-		BUG_ON(!(__param_##name.perm & 0222));	\
-		__kernel_param_lock();			\
-	} while (0)
-
-/**
- * kparam_unblock_sysfs_write - allows sysfs to write to a parameter again.
- * @name: the name of the parameter
- */
-#define kparam_unblock_sysfs_write(name)		\
-	do {						\
-		BUG_ON(!(__param_##name.perm & 0222));	\
-		__kernel_param_unlock();		\
-	} while (0)
-
-/**
- * kparam_block_sysfs_read - make sure a parameter isn't read via sysfs.
- * @name: the name of the parameter
- *
- * This also blocks sysfs writes.
- */
-#define kparam_block_sysfs_read(name)			\
-	do {						\
-		BUG_ON(!(__param_##name.perm & 0444));	\
-		__kernel_param_lock();			\
-	} while (0)
-
-/**
- * kparam_unblock_sysfs_read - allows sysfs to read a parameter again.
- * @name: the name of the parameter
- */
-#define kparam_unblock_sysfs_read(name)			\
-	do {						\
-		BUG_ON(!(__param_##name.perm & 0444));	\
-		__kernel_param_unlock();		\
-	} while (0)
-
 #ifdef CONFIG_SYSFS
-extern void __kernel_param_lock(void);
-extern void __kernel_param_unlock(void);
+extern void kernel_param_lock(struct module *mod);
+extern void kernel_param_unlock(struct module *mod);
 #else
-static inline void __kernel_param_lock(void)
+static inline void kernel_param_lock(struct module *mod)
 {
 }
-static inline void __kernel_param_unlock(void)
+static inline void kernel_param_unlock(struct module *mod)
 {
 }
 #endif
@@ -386,64 +343,70 @@ static inline void destroy_params(const struct kernel_param *params,
 #define __param_check(name, p, type) \
 	static inline type __always_unused *__check_##name(void) { return(p); }
 
-extern struct kernel_param_ops param_ops_byte;
+extern const struct kernel_param_ops param_ops_byte;
 extern int param_set_byte(const char *val, const struct kernel_param *kp);
 extern int param_get_byte(char *buffer, const struct kernel_param *kp);
 #define param_check_byte(name, p) __param_check(name, p, unsigned char)
 
-extern struct kernel_param_ops param_ops_short;
+extern const struct kernel_param_ops param_ops_short;
 extern int param_set_short(const char *val, const struct kernel_param *kp);
 extern int param_get_short(char *buffer, const struct kernel_param *kp);
 #define param_check_short(name, p) __param_check(name, p, short)
 
-extern struct kernel_param_ops param_ops_ushort;
+extern const struct kernel_param_ops param_ops_ushort;
 extern int param_set_ushort(const char *val, const struct kernel_param *kp);
 extern int param_get_ushort(char *buffer, const struct kernel_param *kp);
 #define param_check_ushort(name, p) __param_check(name, p, unsigned short)
 
-extern struct kernel_param_ops param_ops_int;
+extern const struct kernel_param_ops param_ops_int;
 extern int param_set_int(const char *val, const struct kernel_param *kp);
 extern int param_get_int(char *buffer, const struct kernel_param *kp);
 #define param_check_int(name, p) __param_check(name, p, int)
 
-extern struct kernel_param_ops param_ops_uint;
+extern const struct kernel_param_ops param_ops_uint;
 extern int param_set_uint(const char *val, const struct kernel_param *kp);
 extern int param_get_uint(char *buffer, const struct kernel_param *kp);
 #define param_check_uint(name, p) __param_check(name, p, unsigned int)
 
-extern struct kernel_param_ops param_ops_long;
+extern const struct kernel_param_ops param_ops_long;
 extern int param_set_long(const char *val, const struct kernel_param *kp);
 extern int param_get_long(char *buffer, const struct kernel_param *kp);
 #define param_check_long(name, p) __param_check(name, p, long)
 
-extern struct kernel_param_ops param_ops_ulong;
+extern const struct kernel_param_ops param_ops_ulong;
 extern int param_set_ulong(const char *val, const struct kernel_param *kp);
 extern int param_get_ulong(char *buffer, const struct kernel_param *kp);
 #define param_check_ulong(name, p) __param_check(name, p, unsigned long)
 
-extern struct kernel_param_ops param_ops_ullong;
+extern const struct kernel_param_ops param_ops_ullong;
 extern int param_set_ullong(const char *val, const struct kernel_param *kp);
 extern int param_get_ullong(char *buffer, const struct kernel_param *kp);
 #define param_check_ullong(name, p) __param_check(name, p, unsigned long long)
 
-extern struct kernel_param_ops param_ops_charp;
+extern const struct kernel_param_ops param_ops_charp;
 extern int param_set_charp(const char *val, const struct kernel_param *kp);
 extern int param_get_charp(char *buffer, const struct kernel_param *kp);
 #define param_check_charp(name, p) __param_check(name, p, char *)
 
 /* We used to allow int as well as bool.  We're taking that away! */
-extern struct kernel_param_ops param_ops_bool;
+extern const struct kernel_param_ops param_ops_bool;
 extern int param_set_bool(const char *val, const struct kernel_param *kp);
 extern int param_get_bool(char *buffer, const struct kernel_param *kp);
 #define param_check_bool(name, p) __param_check(name, p, bool)
 
-extern struct kernel_param_ops param_ops_invbool;
+extern const struct kernel_param_ops param_ops_bool_enable_only;
+extern int param_set_bool_enable_only(const char *val,
+				      const struct kernel_param *kp);
+/* getter is the same as for the regular bool */
+#define param_check_bool_enable_only param_check_bool
+
+extern const struct kernel_param_ops param_ops_invbool;
 extern int param_set_invbool(const char *val, const struct kernel_param *kp);
 extern int param_get_invbool(char *buffer, const struct kernel_param *kp);
 #define param_check_invbool(name, p) __param_check(name, p, bool)
 
 /* An int, which can only be set like a bool (though it shows as an int). */
-extern struct kernel_param_ops param_ops_bint;
+extern const struct kernel_param_ops param_ops_bint;
 extern int param_set_bint(const char *val, const struct kernel_param *kp);
 #define param_get_bint param_get_int
 #define param_check_bint param_check_int
@@ -487,9 +450,9 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp);
 			    perm, -1, 0);				\
 	__MODULE_PARM_TYPE(name, "array of " #type)
 
-extern struct kernel_param_ops param_array_ops;
+extern const struct kernel_param_ops param_array_ops;
 
-extern struct kernel_param_ops param_ops_string;
+extern const struct kernel_param_ops param_ops_string;
 extern int param_set_copystring(const char *val, const struct kernel_param *);
 extern int param_get_string(char *buffer, const struct kernel_param *kp);
 
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index fb31765e935a..830c4992088d 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -31,6 +31,7 @@
 
 #include <linux/kernel.h>
 #include <linux/stddef.h>
+#include <linux/rcupdate.h>
 
 struct rb_node {
 	unsigned long  __rb_parent_color;
@@ -73,11 +74,11 @@ extern struct rb_node *rb_first_postorder(const struct rb_root *);
 extern struct rb_node *rb_next_postorder(const struct rb_node *);
 
 /* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 			    struct rb_root *root);
 
-static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
-				struct rb_node ** rb_link)
+static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
+				struct rb_node **rb_link)
 {
 	node->__rb_parent_color = (unsigned long)parent;
 	node->rb_left = node->rb_right = NULL;
@@ -85,6 +86,15 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
 	*rb_link = node;
 }
 
+static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
+				    struct rb_node **rb_link)
+{
+	node->__rb_parent_color = (unsigned long)parent;
+	node->rb_left = node->rb_right = NULL;
+
+	rcu_assign_pointer(*rb_link, node);
+}
+
 #define rb_entry_safe(ptr, type, member) \
 	({ typeof(ptr) ____ptr = (ptr); \
 	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 378c5ee75f78..14d7b831b63a 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -123,11 +123,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new,
 {
 	if (parent) {
 		if (parent->rb_left == old)
-			parent->rb_left = new;
+			WRITE_ONCE(parent->rb_left, new);
 		else
-			parent->rb_right = new;
+			WRITE_ONCE(parent->rb_right, new);
 	} else
-		root->rb_node = new;
+		WRITE_ONCE(root->rb_node, new);
 }
 
 extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
@@ -137,7 +137,8 @@ static __always_inline struct rb_node *
 __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		     const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *child = node->rb_right, *tmp = node->rb_left;
+	struct rb_node *child = node->rb_right;
+	struct rb_node *tmp = node->rb_left;
 	struct rb_node *parent, *rebalance;
 	unsigned long pc;
 
@@ -167,6 +168,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		tmp = parent;
 	} else {
 		struct rb_node *successor = child, *child2;
+
 		tmp = child->rb_left;
 		if (!tmp) {
 			/*
@@ -180,6 +182,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 			 */
 			parent = successor;
 			child2 = successor->rb_right;
+
 			augment->copy(node, successor);
 		} else {
 			/*
@@ -201,19 +204,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 				successor = tmp;
 				tmp = tmp->rb_left;
 			} while (tmp);
-			parent->rb_left = child2 = successor->rb_right;
-			successor->rb_right = child;
+			child2 = successor->rb_right;
+			WRITE_ONCE(parent->rb_left, child2);
+			WRITE_ONCE(successor->rb_right, child);
 			rb_set_parent(child, successor);
+
 			augment->copy(node, successor);
 			augment->propagate(parent, successor);
 		}
 
-		successor->rb_left = tmp = node->rb_left;
+		tmp = node->rb_left;
+		WRITE_ONCE(successor->rb_left, tmp);
 		rb_set_parent(tmp, successor);
 
 		pc = node->__rb_parent_color;
 		tmp = __rb_parent(pc);
 		__rb_change_child(node, successor, tmp, root);
+
 		if (child2) {
 			successor->__rb_parent_color = pc;
 			rb_set_parent_color(child2, parent, RB_BLACK);
diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
new file mode 100644
index 000000000000..4f3432c61d12
--- /dev/null
+++ b/include/linux/rbtree_latch.h
@@ -0,0 +1,212 @@
+/*
+ * Latched RB-trees
+ *
+ * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org>
+ *
+ * Since RB-trees have non-atomic modifications they're not immediately suited
+ * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for
+ * lockless lookups; we cannot guarantee they return a correct result.
+ *
+ * The simplest solution is a seqlock + RB-tree, this will allow lockless
+ * lookups; but has the constraint (inherent to the seqlock) that read sides
+ * cannot nest in write sides.
+ *
+ * If we need to allow unconditional lookups (say as required for NMI context
+ * usage) we need a more complex setup; this data structure provides this by
+ * employing the latch technique -- see @raw_write_seqcount_latch -- to
+ * implement a latched RB-tree which does allow for unconditional lookups by
+ * virtue of always having (at least) one stable copy of the tree.
+ *
+ * However, while we have the guarantee that there is at all times one stable
+ * copy, this does not guarantee an iteration will not observe modifications.
+ * What might have been a stable copy at the start of the iteration, need not
+ * remain so for the duration of the iteration.
+ *
+ * Therefore, this does require a lockless RB-tree iteration to be non-fatal;
+ * see the comment in lib/rbtree.c. Note however that we only require the first
+ * condition -- not seeing partial stores -- because the latch thing isolates
+ * us from loops. If we were to interrupt a modification the lookup would be
+ * pointed at the stable tree and complete while the modification was halted.
+ */
+
+#ifndef RB_TREE_LATCH_H
+#define RB_TREE_LATCH_H
+
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+
+struct latch_tree_node {
+	struct rb_node node[2];
+};
+
+struct latch_tree_root {
+	seqcount_t	seq;
+	struct rb_root	tree[2];
+};
+
+/**
+ * latch_tree_ops - operators to define the tree order
+ * @less: used for insertion; provides the (partial) order between two elements.
+ * @comp: used for lookups; provides the order between the search key and an element.
+ *
+ * The operators are related like:
+ *
+ *	comp(a->key,b) < 0  := less(a,b)
+ *	comp(a->key,b) > 0  := less(b,a)
+ *	comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
+ *
+ * If these operators define a partial order on the elements we make no
+ * guarantee on which of the elements matching the key is found. See
+ * latch_tree_find().
+ */
+struct latch_tree_ops {
+	bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b);
+	int  (*comp)(void *key,                 struct latch_tree_node *b);
+};
+
+static __always_inline struct latch_tree_node *
+__lt_from_rb(struct rb_node *node, int idx)
+{
+	return container_of(node, struct latch_tree_node, node[idx]);
+}
+
+static __always_inline void
+__lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx,
+	    bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b))
+{
+	struct rb_root *root = &ltr->tree[idx];
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *node = &ltn->node[idx];
+	struct rb_node *parent = NULL;
+	struct latch_tree_node *ltp;
+
+	while (*link) {
+		parent = *link;
+		ltp = __lt_from_rb(parent, idx);
+
+		if (less(ltn, ltp))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node_rcu(node, parent, link);
+	rb_insert_color(node, root);
+}
+
+static __always_inline void
+__lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx)
+{
+	rb_erase(&ltn->node[idx], &ltr->tree[idx]);
+}
+
+static __always_inline struct latch_tree_node *
+__lt_find(void *key, struct latch_tree_root *ltr, int idx,
+	  int (*comp)(void *key, struct latch_tree_node *node))
+{
+	struct rb_node *node = rcu_dereference_raw(ltr->tree[idx].rb_node);
+	struct latch_tree_node *ltn;
+	int c;
+
+	while (node) {
+		ltn = __lt_from_rb(node, idx);
+		c = comp(key, ltn);
+
+		if (c < 0)
+			node = rcu_dereference_raw(node->rb_left);
+		else if (c > 0)
+			node = rcu_dereference_raw(node->rb_right);
+		else
+			return ltn;
+	}
+
+	return NULL;
+}
+
+/**
+ * latch_tree_insert() - insert @node into the trees @root
+ * @node: nodes to insert
+ * @root: trees to insert @node into
+ * @ops: operators defining the node order
+ *
+ * It inserts @node into @root in an ordered fashion such that we can always
+ * observe one complete tree. See the comment for raw_write_seqcount_latch().
+ *
+ * The inserts use rcu_assign_pointer() to publish the element such that the
+ * tree structure is stored before we can observe the new @node.
+ *
+ * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
+ * serialized.
+ */
+static __always_inline void
+latch_tree_insert(struct latch_tree_node *node,
+		  struct latch_tree_root *root,
+		  const struct latch_tree_ops *ops)
+{
+	raw_write_seqcount_latch(&root->seq);
+	__lt_insert(node, root, 0, ops->less);
+	raw_write_seqcount_latch(&root->seq);
+	__lt_insert(node, root, 1, ops->less);
+}
+
+/**
+ * latch_tree_erase() - removes @node from the trees @root
+ * @node: nodes to remote
+ * @root: trees to remove @node from
+ * @ops: operators defining the node order
+ *
+ * Removes @node from the trees @root in an ordered fashion such that we can
+ * always observe one complete tree. See the comment for
+ * raw_write_seqcount_latch().
+ *
+ * It is assumed that @node will observe one RCU quiescent state before being
+ * reused of freed.
+ *
+ * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
+ * serialized.
+ */
+static __always_inline void
+latch_tree_erase(struct latch_tree_node *node,
+		 struct latch_tree_root *root,
+		 const struct latch_tree_ops *ops)
+{
+	raw_write_seqcount_latch(&root->seq);
+	__lt_erase(node, root, 0);
+	raw_write_seqcount_latch(&root->seq);
+	__lt_erase(node, root, 1);
+}
+
+/**
+ * latch_tree_find() - find the node matching @key in the trees @root
+ * @key: search key
+ * @root: trees to search for @key
+ * @ops: operators defining the node order
+ *
+ * Does a lockless lookup in the trees @root for the node matching @key.
+ *
+ * It is assumed that this is called while holding the appropriate RCU read
+ * side lock.
+ *
+ * If the operators define a partial order on the elements (there are multiple
+ * elements which have the same key value) it is undefined which of these
+ * elements will be found. Nor is it possible to iterate the tree to find
+ * further elements with the same key value.
+ *
+ * Returns: a pointer to the node matching @key or NULL.
+ */
+static __always_inline struct latch_tree_node *
+latch_tree_find(void *key, struct latch_tree_root *root,
+		const struct latch_tree_ops *ops)
+{
+	struct latch_tree_node *node;
+	unsigned int seq;
+
+	do {
+		seq = raw_read_seqcount_latch(&root->seq);
+		node = __lt_find(key, root, seq & 1, ops->comp);
+	} while (read_seqcount_retry(&root->seq, seq));
+
+	return node;
+}
+
+#endif /* RB_TREE_LATCH_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 33a056bb886f..4cf5f51b4c9c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -633,21 +633,6 @@ static inline void rcu_preempt_sleep_check(void)
 #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
 
 /**
- * lockless_dereference() - safely load a pointer for later dereference
- * @p: The pointer to load
- *
- * Similar to rcu_dereference(), but for situations where the pointed-to
- * object's lifetime is managed by something other than RCU.  That
- * "something other" might be reference counting or simple immortality.
- */
-#define lockless_dereference(p) \
-({ \
-	typeof(p) _________p1 = READ_ONCE(p); \
-	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
-	(_________p1); \
-})
-
-/**
  * rcu_assign_pointer() - assign to RCU-protected pointer
  * @p: pointer to assign to
  * @v: value to assign (publish)
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 486e685a226a..e0582106ef4f 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -35,6 +35,7 @@
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
 #include <linux/lockdep.h>
+#include <linux/compiler.h>
 #include <asm/processor.h>
 
 /*
@@ -274,9 +275,87 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s)
 	s->sequence++;
 }
 
-/*
+static inline int raw_read_seqcount_latch(seqcount_t *s)
+{
+	return lockless_dereference(s->sequence);
+}
+
+/**
  * raw_write_seqcount_latch - redirect readers to even/odd copy
  * @s: pointer to seqcount_t
+ *
+ * The latch technique is a multiversion concurrency control method that allows
+ * queries during non-atomic modifications. If you can guarantee queries never
+ * interrupt the modification -- e.g. the concurrency is strictly between CPUs
+ * -- you most likely do not need this.
+ *
+ * Where the traditional RCU/lockless data structures rely on atomic
+ * modifications to ensure queries observe either the old or the new state the
+ * latch allows the same for non-atomic updates. The trade-off is doubling the
+ * cost of storage; we have to maintain two copies of the entire data
+ * structure.
+ *
+ * Very simply put: we first modify one copy and then the other. This ensures
+ * there is always one copy in a stable state, ready to give us an answer.
+ *
+ * The basic form is a data structure like:
+ *
+ * struct latch_struct {
+ *	seqcount_t		seq;
+ *	struct data_struct	data[2];
+ * };
+ *
+ * Where a modification, which is assumed to be externally serialized, does the
+ * following:
+ *
+ * void latch_modify(struct latch_struct *latch, ...)
+ * {
+ *	smp_wmb();	<- Ensure that the last data[1] update is visible
+ *	latch->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *
+ *	modify(latch->data[0], ...);
+ *
+ *	smp_wmb();	<- Ensure that the data[0] update is visible
+ *	latch->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *
+ *	modify(latch->data[1], ...);
+ * }
+ *
+ * The query will have a form like:
+ *
+ * struct entry *latch_query(struct latch_struct *latch, ...)
+ * {
+ *	struct entry *entry;
+ *	unsigned seq, idx;
+ *
+ *	do {
+ *		seq = lockless_dereference(latch->seq);
+ *
+ *		idx = seq & 0x01;
+ *		entry = data_query(latch->data[idx], ...);
+ *
+ *		smp_rmb();
+ *	} while (seq != latch->seq);
+ *
+ *	return entry;
+ * }
+ *
+ * So during the modification, queries are first redirected to data[1]. Then we
+ * modify data[0]. When that is complete, we redirect queries back to data[0]
+ * and we can modify data[1].
+ *
+ * NOTE: The non-requirement for atomic modifications does _NOT_ include
+ *       the publishing of new entries in the case where data is a dynamic
+ *       data structure.
+ *
+ *       An iteration might start in data[0] and get suspended long enough
+ *       to miss an entire modification sequence, once it resumes it might
+ *       observe the new entry.
+ *
+ * NOTE: When data is a dynamic data structure; one should use regular RCU
+ *       patterns to manage the lifetimes of the objects within.
  */
 static inline void raw_write_seqcount_latch(seqcount_t *s)
 {
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-07-01 10:49:25 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-07-01 10:49:25 -0700
commit	02201e3f1b46aed7c6348f406b7b40de80ba6de3 (patch)
tree	2392c9098359725c195dd82a72b20ccedc1a1509 /include
parent	0890a264794f33df540fbaf274699146903b4e6b (diff)
parent	20bdc2cfdbc484777b30b96fcdbb8994038f3ce1 (diff)