diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-10-30 12:38:48 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-10-30 12:38:48 -1000 |
commit | 3cf3fabccb9dc821ffaec3ad6bf0cd6b278bd012 (patch) | |
tree | 76bf15f5939812608b8ea514c3c50fd1ac8acebb /include | |
parent | 9cda4eb04a68aee4d795438917a4e958b2b2aa07 (diff) | |
parent | c73801ae4f22b390228ebf471d55668e824198b6 (diff) |
Merge tag 'locking-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Info Molnar:
"Futex improvements:
- Add the 'futex2' syscall ABI, which is an attempt to get away from
the multiplex syscall and adds a little room for extentions, while
lifting some limitations.
- Fix futex PI recursive rt_mutex waiter state bug
- Fix inter-process shared futexes on no-MMU systems
- Use folios instead of pages
Micro-optimizations of locking primitives:
- Improve arch_spin_value_unlocked() on asm-generic ticket spinlock
architectures, to improve lockref code generation
- Improve the x86-32 lockref_get_not_zero() main loop by adding
build-time CMPXCHG8B support detection for the relevant lockref
code, and by better interfacing the CMPXCHG8B assembly code with
the compiler
- Introduce arch_sync_try_cmpxchg() on x86 to improve
sync_try_cmpxchg() code generation. Convert some sync_cmpxchg()
users to sync_try_cmpxchg().
- Micro-optimize rcuref_put_slowpath()
Locking debuggability improvements:
- Improve CONFIG_DEBUG_RT_MUTEXES=y to have a fast-path as well
- Enforce atomicity of sched_submit_work(), which is de-facto atomic
but was un-enforced previously.
- Extend <linux/cleanup.h>'s no_free_ptr() with __must_check
semantics
- Fix ww_mutex self-tests
- Clean up const-propagation in <linux/seqlock.h> and simplify the
API-instantiation macros a bit
RT locking improvements:
- Provide the rt_mutex_*_schedule() primitives/helpers and use them
in the rtmutex code to avoid recursion vs. rtlock on the PI state.
- Add nested blocking lockdep asserts to rt_mutex_lock(),
rtlock_lock() and rwbase_read_lock()
.. plus misc fixes & cleanups"
* tag 'locking-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (39 commits)
futex: Don't include process MM in futex key on no-MMU
locking/seqlock: Fix grammar in comment
alpha: Fix up new futex syscall numbers
locking/seqlock: Propagate 'const' pointers within read-only methods, remove forced type casts
locking/lockdep: Fix string sizing bug that triggers a format-truncation compiler-warning
locking/seqlock: Change __seqprop() to return the function pointer
locking/seqlock: Simplify SEQCOUNT_LOCKNAME()
locking/atomics: Use atomic_try_cmpxchg_release() to micro-optimize rcuref_put_slowpath()
locking/atomic, xen: Use sync_try_cmpxchg() instead of sync_cmpxchg()
locking/atomic/x86: Introduce arch_sync_try_cmpxchg()
locking/atomic: Add generic support for sync_try_cmpxchg() and its fallback
locking/seqlock: Fix typo in comment
futex/requeue: Remove unnecessary ‘NULL’ initialization from futex_proxy_trylock_atomic()
locking/local, arch: Rewrite local_add_unless() as a static inline function
locking/debug: Fix debugfs API return value checks to use IS_ERR()
locking/ww_mutex/test: Make sure we bail out instead of livelock
locking/ww_mutex/test: Fix potential workqueue corruption
locking/ww_mutex/test: Use prng instead of rng to avoid hangs at bootup
futex: Add sys_futex_requeue()
futex: Add flags2 argument to futex_requeue()
...
Diffstat (limited to 'include')
-rw-r--r-- | include/asm-generic/spinlock.h | 16 | ||||
-rw-r--r-- | include/linux/atomic/atomic-arch-fallback.h | 15 | ||||
-rw-r--r-- | include/linux/atomic/atomic-instrumented.h | 10 | ||||
-rw-r--r-- | include/linux/cleanup.h | 39 | ||||
-rw-r--r-- | include/linux/sched.h | 3 | ||||
-rw-r--r-- | include/linux/sched/rt.h | 4 | ||||
-rw-r--r-- | include/linux/seqlock.h | 52 | ||||
-rw-r--r-- | include/linux/syscalls.h | 10 | ||||
-rw-r--r-- | include/uapi/asm-generic/unistd.h | 8 | ||||
-rw-r--r-- | include/uapi/linux/futex.h | 31 |
10 files changed, 151 insertions, 37 deletions
diff --git a/include/asm-generic/spinlock.h b/include/asm-generic/spinlock.h index fdfebcb050f4..90803a826ba0 100644 --- a/include/asm-generic/spinlock.h +++ b/include/asm-generic/spinlock.h @@ -68,11 +68,18 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) smp_store_release(ptr, (u16)val + 1); } +static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + u32 val = lock.counter; + + return ((val >> 16) == (val & 0xffff)); +} + static __always_inline int arch_spin_is_locked(arch_spinlock_t *lock) { - u32 val = atomic_read(lock); + arch_spinlock_t val = READ_ONCE(*lock); - return ((val >> 16) != (val & 0xffff)); + return !arch_spin_value_unlocked(val); } static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock) @@ -82,11 +89,6 @@ static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock) return (s16)((val >> 16) - (val & 0xffff)) > 1; } -static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) -{ - return !arch_spin_is_locked(&lock); -} - #include <asm/qrwlock.h> #endif /* __ASM_GENERIC_SPINLOCK_H */ diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index b83ef19da13d..5e95faa959c4 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -428,6 +428,19 @@ extern void raw_cmpxchg128_relaxed_not_implemented(void); #define raw_sync_cmpxchg arch_sync_cmpxchg +#ifdef arch_sync_try_cmpxchg +#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg +#else +#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif + /** * raw_atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t @@ -4649,4 +4662,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v) } #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 2fdd6702823fa842f9cea57a002e6e4476ae780c +// eec048affea735b8464f58e6d96992101f8f85f1 diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index d401b406ef7c..54d7bbe0aeaa 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -4998,6 +4998,14 @@ atomic_long_dec_if_positive(atomic_long_t *v) raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) +#define sync_try_cmpxchg(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + kcsan_mb(); \ + instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ + raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \ +}) + #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 1568f875fef72097413caab8339120c065a39aa4 +// 2cc4bc990fef44d3836ec108f11b610f3f438184 diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 53f1a7a932b0..9f1a9c455b68 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -7,8 +7,9 @@ /* * DEFINE_FREE(name, type, free): * simple helper macro that defines the required wrapper for a __free() - * based cleanup function. @free is an expression using '_T' to access - * the variable. + * based cleanup function. @free is an expression using '_T' to access the + * variable. @free should typically include a NULL test before calling a + * function, see the example below. * * __free(name): * variable attribute to add a scoped based cleanup to the variable. @@ -17,6 +18,9 @@ * like a non-atomic xchg(var, NULL), such that the cleanup function will * be inhibited -- provided it sanely deals with a NULL value. * + * NOTE: this has __must_check semantics so that it is harder to accidentally + * leak the resource. + * * return_ptr(p): * returns p while inhibiting the __free(). * @@ -24,6 +28,8 @@ * * DEFINE_FREE(kfree, void *, if (_T) kfree(_T)) * + * void *alloc_obj(...) + * { * struct obj *p __free(kfree) = kmalloc(...); * if (!p) * return NULL; @@ -32,6 +38,24 @@ * return NULL; * * return_ptr(p); + * } + * + * NOTE: the DEFINE_FREE()'s @free expression includes a NULL test even though + * kfree() is fine to be called with a NULL value. This is on purpose. This way + * the compiler sees the end of our alloc_obj() function as: + * + * tmp = p; + * p = NULL; + * if (p) + * kfree(p); + * return tmp; + * + * And through the magic of value-propagation and dead-code-elimination, it + * eliminates the actual cleanup call and compiles into: + * + * return p; + * + * Without the NULL test it turns into a mess and the compiler can't help us. */ #define DEFINE_FREE(_name, _type, _free) \ @@ -39,8 +63,17 @@ #define __free(_name) __cleanup(__free_##_name) +#define __get_and_null_ptr(p) \ + ({ __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = NULL; __val; }) + +static inline __must_check +const volatile void * __must_check_fn(const volatile void *val) +{ return val; } + #define no_free_ptr(p) \ - ({ __auto_type __ptr = (p); (p) = NULL; __ptr; }) + ((typeof(p)) __must_check_fn(__get_and_null_ptr(p))) #define return_ptr(p) return no_free_ptr(p) diff --git a/include/linux/sched.h b/include/linux/sched.h index d5951e99706a..6af88873499a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -912,6 +912,9 @@ struct task_struct { * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; +#ifdef CONFIG_RT_MUTEXES + unsigned sched_rt_mutex:1; +#endif /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 994c25640e15..b2b9e6eb9683 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) } #ifdef CONFIG_RT_MUTEXES +extern void rt_mutex_pre_schedule(void); +extern void rt_mutex_schedule(void); +extern void rt_mutex_post_schedule(void); + /* * Must hold either p->pi_lock or task_rq(p)->lock. */ diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index e9bd2f65d7f4..e92f9d5577ba 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -191,11 +191,9 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @locktype: LOCKNAME canonical C data type * @preemptible: preemptibility of above locktype - * @lockmember: argument for lockdep_assert_held() - * @lockbase: associated lock release function (prefix only) - * @lock_acquire: associated lock acquisition function (full call) + * @lockbase: prefix for associated lock/unlock */ -#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \ +#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ typedef struct seqcount_##lockname { \ seqcount_t seqcount; \ __SEQ_LOCK(locktype *lock); \ @@ -207,6 +205,12 @@ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ return &s->seqcount; \ } \ \ +static __always_inline const seqcount_t * \ +__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \ +{ \ + return &s->seqcount; \ +} \ + \ static __always_inline unsigned \ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ { \ @@ -216,7 +220,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ return seq; \ \ if (preemptible && unlikely(seq & 1)) { \ - __SEQ_LOCK(lock_acquire); \ + __SEQ_LOCK(lockbase##_lock(s->lock)); \ __SEQ_LOCK(lockbase##_unlock(s->lock)); \ \ /* \ @@ -242,7 +246,7 @@ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ static __always_inline void \ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ { \ - __SEQ_LOCK(lockdep_assert_held(lockmember)); \ + __SEQ_LOCK(lockdep_assert_held(s->lock)); \ } /* @@ -254,6 +258,11 @@ static inline seqcount_t *__seqprop_ptr(seqcount_t *s) return s; } +static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s) +{ + return s; +} + static inline unsigned __seqprop_sequence(const seqcount_t *s) { return READ_ONCE(s->sequence); @@ -271,10 +280,10 @@ static inline void __seqprop_assert(const seqcount_t *s) #define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) -SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, s->lock, raw_spin, raw_spin_lock(s->lock)) -SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, s->lock, spin, spin_lock(s->lock)) -SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, s->lock, read, read_lock(s->lock)) -SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex, mutex_lock(s->lock)) +SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) +SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) +SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) +SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t @@ -294,19 +303,20 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define __seqprop_case(s, lockname, prop) \ - seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s)) + seqcount_##lockname##_t: __seqprop_##lockname##_##prop #define __seqprop(s, prop) _Generic(*(s), \ - seqcount_t: __seqprop_##prop((void *)(s)), \ + seqcount_t: __seqprop_##prop, \ __seqprop_case((s), raw_spinlock, prop), \ __seqprop_case((s), spinlock, prop), \ __seqprop_case((s), rwlock, prop), \ __seqprop_case((s), mutex, prop)) -#define seqprop_ptr(s) __seqprop(s, ptr) -#define seqprop_sequence(s) __seqprop(s, sequence) -#define seqprop_preemptible(s) __seqprop(s, preemptible) -#define seqprop_assert(s) __seqprop(s, assert) +#define seqprop_ptr(s) __seqprop(s, ptr)(s) +#define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s) +#define seqprop_sequence(s) __seqprop(s, sequence)(s) +#define seqprop_preemptible(s) __seqprop(s, preemptible)(s) +#define seqprop_assert(s) __seqprop(s, assert)(s) /** * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier @@ -355,7 +365,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex */ #define read_seqcount_begin(s) \ ({ \ - seqcount_lockdep_reader_access(seqprop_ptr(s)); \ + seqcount_lockdep_reader_access(seqprop_const_ptr(s)); \ raw_read_seqcount_begin(s); \ }) @@ -421,7 +431,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ - do___read_seqcount_retry(seqprop_ptr(s), start) + do___read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) { @@ -441,7 +451,7 @@ static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ - do_read_seqcount_retry(seqprop_ptr(s), start) + do_read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) { @@ -574,7 +584,7 @@ static inline void do_write_seqcount_end(seqcount_t *s) * via WRITE_ONCE): a) to ensure the writes become visible to other threads * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section. This is necessary because - * neither writes before and after the barrier are enclosed in a seq-writer + * neither writes before nor after the barrier are enclosed in a seq-writer * critical section that would ensure readers are aware of ongoing writes:: * * seqcount_t seq; @@ -864,7 +874,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) } /* - * For all seqlock_t write side functions, use the the internal + * For all seqlock_t write side functions, use the internal * do_write_seqcount_begin() instead of generic write_seqcount_begin(). * This way, no redundant lockdep_assert_held() checks are added. */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 22bc6bc147f8..0901af60d971 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -549,6 +549,16 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes, unsigned int flags, struct __kernel_timespec __user *timeout, clockid_t clockid); + +asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags); + +asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned long mask, + unsigned int flags, struct __kernel_timespec __user *timespec, + clockid_t clockid); + +asmlinkage long sys_futex_requeue(struct futex_waitv __user *waiters, + unsigned int flags, int nr_wake, int nr_requeue); + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index abe087c53b4b..d9e9cd13e577 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -822,9 +822,15 @@ __SYSCALL(__NR_cachestat, sys_cachestat) #define __NR_fchmodat2 452 __SYSCALL(__NR_fchmodat2, sys_fchmodat2) +#define __NR_futex_wake 454 +__SYSCALL(__NR_futex_wake, sys_futex_wake) +#define __NR_futex_wait 455 +__SYSCALL(__NR_futex_wait, sys_futex_wait) +#define __NR_futex_requeue 456 +__SYSCALL(__NR_futex_requeue, sys_futex_requeue) #undef __NR_syscalls -#define __NR_syscalls 453 +#define __NR_syscalls 457 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 71a5df8d2689..d2ee625ea189 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -44,10 +44,35 @@ FUTEX_PRIVATE_FLAG) /* - * Flags to specify the bit length of the futex word for futex2 syscalls. - * Currently, only 32 is supported. + * Flags for futex2 syscalls. + * + * NOTE: these are not pure flags, they can also be seen as: + * + * union { + * u32 flags; + * struct { + * u32 size : 2, + * numa : 1, + * : 4, + * private : 1; + * }; + * }; */ -#define FUTEX_32 2 +#define FUTEX2_SIZE_U8 0x00 +#define FUTEX2_SIZE_U16 0x01 +#define FUTEX2_SIZE_U32 0x02 +#define FUTEX2_SIZE_U64 0x03 +#define FUTEX2_NUMA 0x04 + /* 0x08 */ + /* 0x10 */ + /* 0x20 */ + /* 0x40 */ +#define FUTEX2_PRIVATE FUTEX_PRIVATE_FLAG + +#define FUTEX2_SIZE_MASK 0x03 + +/* do not use */ +#define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */ /* * Max numbers of elements in a futex_waitv array |