From 6e1e5196975fb7ecc501b3fe1075b77aea2b7839 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Mon, 14 Sep 2015 00:37:22 -0700
Subject: locking/qrwlock: Rename ->lock to ->wait_lock

... trivial, but reads a little nicer when we name our
actual primitive 'lock'.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <Waiman.Long@hpe.com>
Link: http://lkml.kernel.org/r/1442216244-4409-1-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qrwlock.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f17a3e3b3550..fec082338668 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -86,7 +86,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 	/*
 	 * Put the reader into the wait queue
 	 */
-	arch_spin_lock(&lock->lock);
+	arch_spin_lock(&lock->wait_lock);
 
 	/*
 	 * The ACQUIRE semantics of the following spinning code ensure
@@ -99,7 +99,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 	/*
 	 * Signal the next one in queue to become queue head
 	 */
-	arch_spin_unlock(&lock->lock);
+	arch_spin_unlock(&lock->wait_lock);
 }
 EXPORT_SYMBOL(queued_read_lock_slowpath);
 
@@ -112,7 +112,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 	u32 cnts;
 
 	/* Put the writer into the wait queue */
-	arch_spin_lock(&lock->lock);
+	arch_spin_lock(&lock->wait_lock);
 
 	/* Try to acquire the lock directly if no reader is present */
 	if (!atomic_read(&lock->cnts) &&
@@ -144,6 +144,6 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 		cpu_relax_lowlatency();
 	}
 unlock:
-	arch_spin_unlock(&lock->lock);
+	arch_spin_unlock(&lock->wait_lock);
 }
 EXPORT_SYMBOL(queued_write_lock_slowpath);
-- 
cgit v1.2.3-58-ga151


From c55a6ffa6285e29f874ed403979472631ec70bff Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Mon, 14 Sep 2015 00:37:24 -0700
Subject: locking/osq: Relax atomic semantics

... by using acquire/release for ops around the lock->tail. As such,
weakly ordered archs can benefit from more relaxed use of barriers
when issuing atomics.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <Waiman.Long@hpe.com>
Link: http://lkml.kernel.org/r/1442216244-4409-3-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/osq_lock.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index dc85ee23a26f..d092a0c9c2d4 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -50,7 +50,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
 
 	for (;;) {
 		if (atomic_read(&lock->tail) == curr &&
-		    atomic_cmpxchg(&lock->tail, curr, old) == curr) {
+		    atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
 			/*
 			 * We were the last queued, we moved @lock back. @prev
 			 * will now observe @lock and will complete its
@@ -92,7 +92,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	node->next = NULL;
 	node->cpu = curr;
 
-	old = atomic_xchg(&lock->tail, curr);
+	/*
+	 * ACQUIRE semantics, pairs with corresponding RELEASE
+	 * in unlock() uncontended, or fastpath.
+	 */
+	old = atomic_xchg_acquire(&lock->tail, curr);
 	if (old == OSQ_UNLOCKED_VAL)
 		return true;
 
@@ -184,7 +188,8 @@ void osq_unlock(struct optimistic_spin_queue *lock)
 	/*
 	 * Fast path for the uncontended case.
 	 */
-	if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
+	if (likely(atomic_cmpxchg_release(&lock->tail, curr,
+					  OSQ_UNLOCKED_VAL) == curr))
 		return;
 
 	/*
-- 
cgit v1.2.3-58-ga151


From 93edc8bd7750ff3cae088bfca453ea73dc9004a4 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Fri, 11 Sep 2015 14:37:34 -0400
Subject: locking/pvqspinlock: Kick the PV CPU unconditionally when _Q_SLOW_VAL

If _Q_SLOW_VAL has been set, the vCPU state must have been vcpu_hashed.
The extra check at the end of __pv_queued_spin_unlock() is unnecessary
and can be removed.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Douglas Hatch <doug.hatch@hp.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1441996658-62854-3-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_paravirt.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index c8e6e9a596f5..f0450ff4829b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -267,7 +267,6 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 		}
 
 		if (!lp) { /* ONCE */
-			WRITE_ONCE(pn->state, vcpu_hashed);
 			lp = pv_hash(lock, pn);
 
 			/*
@@ -275,11 +274,9 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
 			 * we'll be sure to be able to observe our hash entry.
 			 *
-			 *   [S] pn->state
 			 *   [S] <hash>                 [Rmw] l->locked == _Q_SLOW_VAL
 			 *       MB                           RMB
 			 * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
-			 *                                [L] pn->state
 			 *
 			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 			 */
@@ -364,8 +361,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 * vCPU is harmless other than the additional latency in completing
 	 * the unlock.
 	 */
-	if (READ_ONCE(node->state) == vcpu_hashed)
-		pv_kick(node->cpu);
+	pv_kick(node->cpu);
 }
 /*
  * Include the architecture specific callee-save thunk of the
-- 
cgit v1.2.3-58-ga151


From ac742d37180bee83bc433be087b66a17af2883b9 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 9 Sep 2015 23:36:40 +0200
Subject: futex: Force hot variables into a single cache line

futex_hash() references two global variables: the base pointer
futex_queues and the size of the array futex_hashsize. The latter is
marked __read_mostly, while the former is not, so they are likely to
end up very far from each other. This means that futex_hash() is
likely to encounter two cache misses.

We could mark futex_queues as __read_mostly as well, but that doesn't
guarantee they'll end up next to each other (and even if they do, they
may still end up in different cache lines). So put the two variables
in a small singleton struct with sufficient alignment and mark that as
__read_mostly.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: kbuild test robot <fengguang.wu@intel.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: http://lkml.kernel.org/r/1441834601-13633-1-git-send-email-linux@rasmusvillemoes.dk
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 6e443efc65f4..dfc86e93c31d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -255,9 +255,18 @@ struct futex_hash_bucket {
 	struct plist_head chain;
 } ____cacheline_aligned_in_smp;
 
-static unsigned long __read_mostly futex_hashsize;
+/*
+ * The base of the bucket array and its size are always used together
+ * (after initialization only in hash_futex()), so ensure that they
+ * reside in the same cacheline.
+ */
+static struct {
+	struct futex_hash_bucket *queues;
+	unsigned long            hashsize;
+} __futex_data __read_mostly __aligned(2*sizeof(long));
+#define futex_queues   (__futex_data.queues)
+#define futex_hashsize (__futex_data.hashsize)
 
-static struct futex_hash_bucket *futex_queues;
 
 /*
  * Fault injections for futexes.
-- 
cgit v1.2.3-58-ga151


From 81a43adae3b943193fb3afd20a36a7482332f964 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 30 Sep 2015 13:03:12 -0700
Subject: locking/mutex: Use acquire/release semantics

As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic:
Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly
ordered archs can benefit from more relaxed use of barriers when locking
and unlocking, instead of regular full barrier semantics. While currently
only arm64 supports such optimizations, updating corresponding locking
primitives serves for other archs to immediately benefit as well, once the
necessary machinery is implemented of course.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul E.McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443643395-17016-3-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/mutex-dec.h  |  8 ++++----
 include/asm-generic/mutex-xchg.h | 10 +++++-----
 kernel/locking/mutex.c           |  9 +++++----
 3 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/mutex-dec.h b/include/asm-generic/mutex-dec.h
index d4f9fb4e53df..fd694cfd678a 100644
--- a/include/asm-generic/mutex-dec.h
+++ b/include/asm-generic/mutex-dec.h
@@ -20,7 +20,7 @@
 static inline void
 __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-	if (unlikely(atomic_dec_return(count) < 0))
+	if (unlikely(atomic_dec_return_acquire(count) < 0))
 		fail_fn(count);
 }
 
@@ -35,7 +35,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_lock_retval(atomic_t *count)
 {
-	if (unlikely(atomic_dec_return(count) < 0))
+	if (unlikely(atomic_dec_return_acquire(count) < 0))
 		return -1;
 	return 0;
 }
@@ -56,7 +56,7 @@ __mutex_fastpath_lock_retval(atomic_t *count)
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-	if (unlikely(atomic_inc_return(count) <= 0))
+	if (unlikely(atomic_inc_return_release(count) <= 0))
 		fail_fn(count);
 }
 
@@ -80,7 +80,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-	if (likely(atomic_cmpxchg(count, 1, 0) == 1))
+	if (likely(atomic_cmpxchg_acquire(count, 1, 0) == 1))
 		return 1;
 	return 0;
 }
diff --git a/include/asm-generic/mutex-xchg.h b/include/asm-generic/mutex-xchg.h
index f169ec064785..a6b4a7bd6ac9 100644
--- a/include/asm-generic/mutex-xchg.h
+++ b/include/asm-generic/mutex-xchg.h
@@ -31,7 +31,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 		 * to ensure that any waiting tasks are woken up by the
 		 * unlock slow path.
 		 */
-		if (likely(atomic_xchg(count, -1) != 1))
+		if (likely(atomic_xchg_acquire(count, -1) != 1))
 			fail_fn(count);
 }
 
@@ -46,7 +46,7 @@ __mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_lock_retval(atomic_t *count)
 {
-	if (unlikely(atomic_xchg(count, 0) != 1))
+	if (unlikely(atomic_xchg_acquire(count, 0) != 1))
 		if (likely(atomic_xchg(count, -1) != 1))
 			return -1;
 	return 0;
@@ -67,7 +67,7 @@ __mutex_fastpath_lock_retval(atomic_t *count)
 static inline void
 __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 {
-	if (unlikely(atomic_xchg(count, 1) != 0))
+	if (unlikely(atomic_xchg_release(count, 1) != 0))
 		fail_fn(count);
 }
 
@@ -91,7 +91,7 @@ __mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
 static inline int
 __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 {
-	int prev = atomic_xchg(count, 0);
+	int prev = atomic_xchg_acquire(count, 0);
 
 	if (unlikely(prev < 0)) {
 		/*
@@ -105,7 +105,7 @@ __mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
 		 *   owner's unlock path needlessly, but that's not a problem
 		 *   in practice. ]
 		 */
-		prev = atomic_xchg(count, prev);
+		prev = atomic_xchg_acquire(count, prev);
 		if (prev < 0)
 			prev = 0;
 	}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4cccea6b8934..0551c219c40e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
 static inline bool mutex_try_to_acquire(struct mutex *lock)
 {
 	return !mutex_is_locked(lock) &&
-		(atomic_cmpxchg(&lock->count, 1, 0) == 1);
+		(atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1);
 }
 
 /*
@@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	 * Once more, try to acquire the lock. Only try-lock the mutex if
 	 * it is unlocked to reduce unnecessary xchg() operations.
 	 */
-	if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
+	if (!mutex_is_locked(lock) &&
+	    (atomic_xchg_acquire(&lock->count, 0) == 1))
 		goto skip_wait;
 
 	debug_mutex_lock_common(lock, &waiter);
@@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * non-negative in order to avoid unnecessary xchg operations:
 		 */
 		if (atomic_read(&lock->count) >= 0 &&
-		    (atomic_xchg(&lock->count, -1) == 1))
+		    (atomic_xchg_acquire(&lock->count, -1) == 1))
 			break;
 
 		/*
@@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
 
 	spin_lock_mutex(&lock->wait_lock, flags);
 
-	prev = atomic_xchg(&lock->count, -1);
+	prev = atomic_xchg_acquire(&lock->count, -1);
 	if (likely(prev == 1)) {
 		mutex_set_owner(lock);
 		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-- 
cgit v1.2.3-58-ga151


From 700318d1d7b38bbfe86813d9c5c18364dd941526 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 30 Sep 2015 13:03:13 -0700
Subject: locking/rtmutex: Use acquire/release semantics

As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic:
Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly
ordered archs can benefit from more relaxed use of barriers when locking
and unlocking, instead of regular full barrier semantics. While currently
only arm64 supports such optimizations, updating corresponding locking
primitives serves for other archs to immediately benefit as well, once the
necessary machinery is implemented of course.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul E.McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443643395-17016-4-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rtmutex.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7781d801212f..bbb72b4f64a1 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
  * set up.
  */
 #ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+
+/*
+ * Callers must hold the ->wait_lock -- which is the whole purpose as we force
+ * all future threads that attempt to [Rmw] the lock to the slowpath. As such
+ * relaxed semantics suffice.
+ */
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
 	unsigned long owner, *p = (unsigned long *) &lock->owner;
 
 	do {
 		owner = *p;
-	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+	} while (cmpxchg_relaxed(p, owner,
+				 owner | RT_MUTEX_HAS_WAITERS) != owner);
 }
 
 /*
@@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
 	 *					lock(wait_lock);
 	 *					acquire(lock);
 	 */
-	return rt_mutex_cmpxchg(lock, owner, NULL);
+	return rt_mutex_cmpxchg_release(lock, owner, NULL);
 }
 
 #else
-# define rt_mutex_cmpxchg(l,c,n)	(0)
+# define rt_mutex_cmpxchg_relaxed(l,c,n)	(0)
+# define rt_mutex_cmpxchg_acquire(l,c,n)	(0)
+# define rt_mutex_cmpxchg_release(l,c,n)	(0)
+
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
 	lock->owner = (struct task_struct *)
@@ -1321,7 +1333,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
 				struct hrtimer_sleeper *timeout,
 				enum rtmutex_chainwalk chwalk))
 {
-	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
 		rt_mutex_deadlock_account_lock(lock, current);
 		return 0;
 	} else
@@ -1337,7 +1349,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
 				      enum rtmutex_chainwalk chwalk))
 {
 	if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
-	    likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+	    likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
 		rt_mutex_deadlock_account_lock(lock, current);
 		return 0;
 	} else
@@ -1348,7 +1360,7 @@ static inline int
 rt_mutex_fasttrylock(struct rt_mutex *lock,
 		     int (*slowfn)(struct rt_mutex *lock))
 {
-	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
 		rt_mutex_deadlock_account_lock(lock, current);
 		return 1;
 	}
@@ -1362,7 +1374,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
 {
 	WAKE_Q(wake_q);
 
-	if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+	if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
 		rt_mutex_deadlock_account_unlock(current);
 
 	} else {
@@ -1484,7 +1496,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
 				   struct wake_q_head *wqh)
 {
-	if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+	if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
 		rt_mutex_deadlock_account_unlock(current);
 		return false;
 	}
-- 
cgit v1.2.3-58-ga151


From 3552a07a9c4aea32cc092fadf10a186c84ed8a61 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 30 Sep 2015 13:03:14 -0700
Subject: locking/mcs: Use acquire/release semantics

As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic:
Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly
ordered archs can benefit from more relaxed use of barriers when locking
and unlocking, instead of regular full barrier semantics. While currently
only arm64 supports such optimizations, updating corresponding locking
primitives serves for other archs to immediately benefit as well, once the
necessary machinery is implemented of course.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul E.McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443643395-17016-5-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/mcs_spinlock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index fd91aaa4554c..5b9102a47ea5 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 	node->locked = 0;
 	node->next   = NULL;
 
-	prev = xchg(lock, node);
+	prev = xchg_acquire(lock, node);
 	if (likely(prev == NULL)) {
 		/*
 		 * Lock acquired, don't need to set node->locked to 1. Threads
@@ -98,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 		/*
 		 * Release the lock by setting it to NULL
 		 */
-		if (likely(cmpxchg(lock, node, NULL) == node))
+		if (likely(cmpxchg_release(lock, node, NULL) == node))
 			return;
 		/* Wait until the next pointer is set */
 		while (!(next = READ_ONCE(node->next)))
-- 
cgit v1.2.3-58-ga151


From 00eb4bab69db349c3bdc7e0b0f7e9070dafea58c Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 30 Sep 2015 13:03:15 -0700
Subject: locking/rwsem: Use acquire/release semantics

As of 654672d4ba1 (locking/atomics: Add _{acquire|release|relaxed}()
variants of some atomic operations) and 6d79ef2d30e (locking, asm-generic:
Add _{relaxed|acquire|release}() variants for 'atomic_long_t'), weakly
ordered archs can benefit from more relaxed use of barriers when locking
and unlocking, instead of regular full barrier semantics. While currently
only arm64 supports such optimizations, updating corresponding locking
primitives serves for other archs to immediately benefit as well, once the
necessary machinery is implemented of course.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul E.McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1443643395-17016-6-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/rwsem.h | 21 ++++++++++++++-------
 kernel/locking/rwsem-xadd.c |  5 +++--
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index d48bf5a95cc1..d6d5dc98d7da 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -33,7 +33,7 @@
  */
 static inline void __down_read(struct rw_semaphore *sem)
 {
-	if (unlikely(atomic_long_inc_return((atomic_long_t *)&sem->count) <= 0))
+	if (unlikely(atomic_long_inc_return_acquire((atomic_long_t *)&sem->count) <= 0))
 		rwsem_down_read_failed(sem);
 }
 
@@ -42,7 +42,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 	long tmp;
 
 	while ((tmp = sem->count) >= 0) {
-		if (tmp == cmpxchg(&sem->count, tmp,
+		if (tmp == cmpxchg_acquire(&sem->count, tmp,
 				   tmp + RWSEM_ACTIVE_READ_BIAS)) {
 			return 1;
 		}
@@ -57,7 +57,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 {
 	long tmp;
 
-	tmp = atomic_long_add_return(RWSEM_ACTIVE_WRITE_BIAS,
+	tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
 				     (atomic_long_t *)&sem->count);
 	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
 		rwsem_down_write_failed(sem);
@@ -72,7 +72,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
 	long tmp;
 
-	tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
+	tmp = cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
 		      RWSEM_ACTIVE_WRITE_BIAS);
 	return tmp == RWSEM_UNLOCKED_VALUE;
 }
@@ -84,7 +84,7 @@ static inline void __up_read(struct rw_semaphore *sem)
 {
 	long tmp;
 
-	tmp = atomic_long_dec_return((atomic_long_t *)&sem->count);
+	tmp = atomic_long_dec_return_release((atomic_long_t *)&sem->count);
 	if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
 		rwsem_wake(sem);
 }
@@ -94,7 +94,7 @@ static inline void __up_read(struct rw_semaphore *sem)
  */
 static inline void __up_write(struct rw_semaphore *sem)
 {
-	if (unlikely(atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
+	if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
 				 (atomic_long_t *)&sem->count) < 0))
 		rwsem_wake(sem);
 }
@@ -114,7 +114,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 {
 	long tmp;
 
-	tmp = atomic_long_add_return(-RWSEM_WAITING_BIAS,
+	/*
+	 * When downgrading from exclusive to shared ownership,
+	 * anything inside the write-locked region cannot leak
+	 * into the read side. In contrast, anything in the
+	 * read-locked region is ok to be re-ordered into the
+	 * write side. As such, rely on RELEASE semantics.
+	 */
+	tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS,
 				     (atomic_long_t *)&sem->count);
 	if (tmp < 0)
 		rwsem_downgrade_wake(sem);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 0f189714e457..a4d4de05b2d1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 	 * to reduce unnecessary expensive cmpxchg() operations.
 	 */
 	if (count == RWSEM_WAITING_BIAS &&
-	    cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+	    cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
 		    RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
 		if (!list_is_singular(&sem->wait_list))
 			rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
@@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 		if (!(count == 0 || count == RWSEM_WAITING_BIAS))
 			return false;
 
-		old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+		old = cmpxchg_acquire(&sem->count, count,
+				      count + RWSEM_ACTIVE_WRITE_BIAS);
 		if (old == count) {
 			rwsem_set_owner(sem);
 			return true;
-- 
cgit v1.2.3-58-ga151