From 85be6d842447067ce76047a14d4258c96fd33b7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Aug 2023 12:52:04 +0200 Subject: [PATCH 01/39] cleanup: Make no_free_ptr() __must_check recent discussion brought about the realization that it makes sense for no_free_ptr() to have __must_check semantics in order to avoid leaking the resource. Additionally, add a few comments to clarify why/how things work. All credit to Linus on how to combine __must_check and the stmt-expression. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20230816103102.GF980931@hirez.programming.kicks-ass.net --- include/linux/cleanup.h | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 53f1a7a932b0..9f1a9c455b68 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -7,8 +7,9 @@ /* * DEFINE_FREE(name, type, free): * simple helper macro that defines the required wrapper for a __free() - * based cleanup function. @free is an expression using '_T' to access - * the variable. + * based cleanup function. @free is an expression using '_T' to access the + * variable. @free should typically include a NULL test before calling a + * function, see the example below. * * __free(name): * variable attribute to add a scoped based cleanup to the variable. @@ -17,6 +18,9 @@ * like a non-atomic xchg(var, NULL), such that the cleanup function will * be inhibited -- provided it sanely deals with a NULL value. * + * NOTE: this has __must_check semantics so that it is harder to accidentally + * leak the resource. + * * return_ptr(p): * returns p while inhibiting the __free(). * @@ -24,6 +28,8 @@ * * DEFINE_FREE(kfree, void *, if (_T) kfree(_T)) * + * void *alloc_obj(...) + * { * struct obj *p __free(kfree) = kmalloc(...); * if (!p) * return NULL; @@ -32,6 +38,24 @@ * return NULL; * * return_ptr(p); + * } + * + * NOTE: the DEFINE_FREE()'s @free expression includes a NULL test even though + * kfree() is fine to be called with a NULL value. This is on purpose. This way + * the compiler sees the end of our alloc_obj() function as: + * + * tmp = p; + * p = NULL; + * if (p) + * kfree(p); + * return tmp; + * + * And through the magic of value-propagation and dead-code-elimination, it + * eliminates the actual cleanup call and compiles into: + * + * return p; + * + * Without the NULL test it turns into a mess and the compiler can't help us. */ #define DEFINE_FREE(_name, _type, _free) \ @@ -39,8 +63,17 @@ #define __free(_name) __cleanup(__free_##_name) +#define __get_and_null_ptr(p) \ + ({ __auto_type __ptr = &(p); \ + __auto_type __val = *__ptr; \ + *__ptr = NULL; __val; }) + +static inline __must_check +const volatile void * __must_check_fn(const volatile void *val) +{ return val; } + #define no_free_ptr(p) \ - ({ __auto_type __ptr = (p); (p) = NULL; __ptr; }) + ((typeof(p)) __must_check_fn(__get_and_null_ptr(p))) #define return_ptr(p) return no_free_ptr(p) From e35a6cf1cc343d720ad235f678f1cd2a9876b777 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 21 Aug 2023 15:22:07 +0100 Subject: [PATCH 02/39] futex: Use a folio instead of a page The futex code already handles compound pages correctly, but using a folio tells the compiler that there is already a reference to the head page and it doesn't need to call compound_head() again. 
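In short, the conversion pattern (abridged from the hunks below):

    folio = page_folio(page);               /* was: page = compound_head(page) */
    mapping = READ_ONCE(folio->mapping);    /* was: READ_ONCE(page->mapping)   */
    ...
    /* For file-backed futexes the precise page still determines the key: */
    key->shared.pgoff = folio->index + folio_page_idx(folio, page);
                                            /* was: page_to_pgoff(tail)        */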
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Thomas Gleixner Reviewed-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230821142207.2537124-1-willy@infradead.org --- kernel/futex/core.c | 67 ++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 514e4582b863..adf7e2c1c8f4 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -222,7 +222,8 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page, *tail; + struct page *page; + struct folio *folio; struct address_space *mapping; int err, ro = 0; @@ -273,54 +274,52 @@ again: err = 0; /* - * The treatment of mapping from this point on is critical. The page - * lock protects many things but in this context the page lock + * The treatment of mapping from this point on is critical. The folio + * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * - * Strictly speaking the page lock is not needed in all cases being - * considered here and page lock forces unnecessarily serialization + * Strictly speaking the folio lock is not needed in all cases being + * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and - * page lock will be acquired only if it is unavoidable + * folio lock will be acquired only if it is unavoidable * - * Mapping checks require the head page for any compound page so the - * head page and mapping is looked up now. For anonymous pages, it - * does not matter if the page splits in the future as the key is - * based on the address. For filesystem-backed pages, the tail is - * required as the index of the page determines the key. For - * base pages, there is no tail page and tail == page. + * Mapping checks require the folio so it is looked up now. For + * anonymous pages, it does not matter if the folio is split + * in the future as the key is based on the address. For + * filesystem-backed pages, the precise page is required as the + * index of the page determines the key. */ - tail = page; - page = compound_head(page); - mapping = READ_ONCE(page->mapping); + folio = page_folio(page); + mapping = READ_ONCE(folio->mapping); /* - * If page->mapping is NULL, then it cannot be a PageAnon + * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to - * invalidate_complete_page2 before we got the page lock (also + * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page->mapping. + * an unlikely race, but we do need to retry for folio->mapping. 
*/ if (unlikely(!mapping)) { int shmem_swizzled; /* - * Page lock is required to identify which special case above - * applies. If this is really a shmem page then the page lock + * Folio lock is required to identify which special case above + * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ - lock_page(page); - shmem_swizzled = PageSwapCache(page) || page->mapping; - unlock_page(page); - put_page(page); + folio_lock(folio); + shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; + folio_unlock(folio); + folio_put(folio); if (shmem_swizzled) goto again; @@ -331,14 +330,14 @@ again: /* * Private mappings are handled in a simple way. * - * If the futex key is stored on an anonymous page, then the associated + * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. @@ -357,10 +356,10 @@ again: /* * The associated futex object in this case is the inode and - * the page->mapping must be traversed. Ordinarily this should - * be stabilised under page lock but it's not strictly + * the folio->mapping must be traversed. Ordinarily this should + * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not - * update the radix tree or anything like that. + * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the @@ -368,9 +367,9 @@ again: */ rcu_read_lock(); - if (READ_ONCE(page->mapping) != mapping) { + if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); - put_page(page); + folio_put(folio); goto again; } @@ -378,19 +377,19 @@ again: inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); - put_page(page); + folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); - key->shared.pgoff = page_to_pgoff(tail); + key->shared.pgoff = folio->index + folio_page_idx(folio, page); rcu_read_unlock(); } out: - put_page(page); + folio_put(folio); return err; } From a432b7c0cf420dbf2448c6bda6a6697afbb153d5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 18 Sep 2023 20:40:27 +0200 Subject: [PATCH 03/39] locking/lockref/x86: Enable ARCH_USE_CMPXCHG_LOCKREF for X86_CMPXCHG64 The following commit: bc08b449ee14 ("lockref: implement lockless reference count updates using cmpxchg()") enabled lockless reference count updates using cmpxchg() only for x86_64, and left x86_32 behind due to inability to detect support for cmpxchg8b instruction. Nowadays, we can use CONFIG_X86_CMPXCHG64 for this purpose. 
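For reference, a rough sketch of the two retry-loop shapes (not the literal
lockref code): with plain cmpxchg64() the caller has to compare and re-seed the
old value itself, while try_cmpxchg64() updates it in place and returns a bool.

    /* (a) cmpxchg64(): caller re-checks and re-seeds 'old' on failure */
    u64 old = READ_ONCE(*ptr), cur;
    for (;;) {
            cur = cmpxchg64(ptr, old, old + 1);
            if (cur == old)
                    break;
            old = cur;
    }

    /* (b) try_cmpxchg64(): 'old' is updated by the primitive on failure */
    u64 old = READ_ONCE(*ptr);
    do {
            /* nothing to redo by hand */
    } while (!try_cmpxchg64(ptr, &old, old + 1));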
Also, by using try_cmpxchg64() instead of cmpxchg64() in the CMPXCHG_LOOP macro, the compiler actually produces sane code, improving the lockref_get_not_zero() main loop from: eb: 8d 48 01 lea 0x1(%eax),%ecx ee: 85 c0 test %eax,%eax f0: 7e 2f jle 121 f2: 8b 44 24 10 mov 0x10(%esp),%eax f6: 8b 54 24 14 mov 0x14(%esp),%edx fa: 8b 74 24 08 mov 0x8(%esp),%esi fe: f0 0f c7 0e lock cmpxchg8b (%esi) 102: 8b 7c 24 14 mov 0x14(%esp),%edi 106: 89 c1 mov %eax,%ecx 108: 89 c3 mov %eax,%ebx 10a: 8b 74 24 10 mov 0x10(%esp),%esi 10e: 89 d0 mov %edx,%eax 110: 31 fa xor %edi,%edx 112: 31 ce xor %ecx,%esi 114: 09 f2 or %esi,%edx 116: 75 58 jne 170 to: 350: 8d 4f 01 lea 0x1(%edi),%ecx 353: 85 ff test %edi,%edi 355: 7e 79 jle 3d0 357: f0 0f c7 0e lock cmpxchg8b (%esi) 35b: 75 53 jne 3b0 Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds Link: https://lore.kernel.org/r/20230918184050.9180-1-ubizjak@gmail.com --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 66bfabae8814..1379603016fd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,7 +28,6 @@ config X86_64 select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK - select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE @@ -118,6 +117,7 @@ config X86 select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS From 28bc55f654de49f6122c7475b01b5d5ef4bdf0d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Sep 2023 18:22:48 +0200 Subject: [PATCH 04/39] sched: Constrain locks in sched_submit_work() Even though sched_submit_work() is ran from preemptible context, it is discouraged to have it use blocking locks due to the recursion potential. Enforce this. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230908162254.999499-2-bigeasy@linutronix.de --- kernel/sched/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2299a5cfbfb9..d55564097bd8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6720,11 +6720,18 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { + static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG); unsigned int task_flags; if (task_is_running(tsk)) return; + /* + * Establish LD_WAIT_CONFIG context to ensure none of the code called + * will use a blocking primitive -- which would lead to recursion. + */ + lock_map_acquire_try(&sched_map); + task_flags = tsk->flags; /* * If a worker goes to sleep, notify and ask workqueue whether it @@ -6749,6 +6756,8 @@ static inline void sched_submit_work(struct task_struct *tsk) * make sure to submit it to avoid deadlocks. 
*/ blk_flush_plug(tsk->plug, true); + + lock_map_release(&sched_map); } static void sched_update_worker(struct task_struct *tsk) From af9f006393b53409be0ca83ae234bef840cdef4a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 8 Sep 2023 18:22:49 +0200 Subject: [PATCH 05/39] locking/rtmutex: Avoid unconditional slowpath for DEBUG_RT_MUTEXES With DEBUG_RT_MUTEXES enabled the fast-path rt_mutex_cmpxchg_acquire() always fails and all lock operations take the slow path. Provide a new helper inline rt_mutex_try_acquire() which maps to rt_mutex_cmpxchg_acquire() in the non-debug case. For the debug case it invokes rt_mutex_slowtrylock() which can acquire a non-contended rtmutex under full debug coverage. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230908162254.999499-3-bigeasy@linutronix.de --- kernel/locking/rtmutex.c | 21 ++++++++++++++++++++- kernel/locking/ww_rt_mutex.c | 2 +- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 21db0df0eb00..bcec0533a0cc 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -218,6 +218,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, return try_cmpxchg_acquire(&lock->owner, &old, new); } +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + return rt_mutex_cmpxchg_acquire(lock, NULL, current); +} + static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) @@ -297,6 +302,20 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, } +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock); + +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + /* + * With debug enabled rt_mutex_cmpxchg trylock() will always fail. + * + * Avoid unconditionally taking the slow path by using + * rt_mutex_slow_trylock() which is covered by the debug code and can + * acquire a non-contended rtmutex. + */ + return rt_mutex_slowtrylock(lock); +} + static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) @@ -1755,7 +1774,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, unsigned int state) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + if (likely(rt_mutex_try_acquire(lock))) return 0; return rt_mutex_slowlock(lock, NULL, state); diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index d1473c624105..c7196de838ed 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, } mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip); - if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) { + if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) { if (ww_ctx) ww_mutex_set_context_fastpath(lock, ww_ctx); return 0; From de1474b46d889ee0367f6e71d9adfeb0711e4a8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Sep 2023 18:22:50 +0200 Subject: [PATCH 06/39] sched: Extract __schedule_loop() There are currently two implementations of this basic __schedule() loop, and there is soon to be a third. 
Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230908162254.999499-4-bigeasy@linutronix.de --- kernel/sched/core.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d55564097bd8..1ea7ba53aad2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6770,16 +6770,21 @@ static void sched_update_worker(struct task_struct *tsk) } } +static __always_inline void __schedule_loop(unsigned int sched_mode) +{ + do { + preempt_disable(); + __schedule(sched_mode); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} + asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; sched_submit_work(tsk); - do { - preempt_disable(); - __schedule(SM_NONE); - sched_preempt_enable_no_resched(); - } while (need_resched()); + __schedule_loop(SM_NONE); sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -6843,11 +6848,7 @@ void __sched schedule_preempt_disabled(void) #ifdef CONFIG_PREEMPT_RT void __sched notrace schedule_rtlock(void) { - do { - preempt_disable(); - __schedule(SM_RTLOCK_WAIT); - sched_preempt_enable_no_resched(); - } while (need_resched()); + __schedule_loop(SM_RTLOCK_WAIT); } NOKPROBE_SYMBOL(schedule_rtlock); #endif From 6b596e62ed9f90c4a97e68ae1f7b1af5beeb3c05 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Sep 2023 18:22:51 +0200 Subject: [PATCH 07/39] sched: Provide rt_mutex specific scheduler helpers With PREEMPT_RT there is a rt_mutex recursion problem where sched_submit_work() can use an rtlock (aka spinlock_t). More specifically what happens is: mutex_lock() /* really rt_mutex */ ... __rt_mutex_slowlock_locked() task_blocks_on_rt_mutex() // enqueue current task as waiter // do PI chain walk rt_mutex_slowlock_block() schedule() sched_submit_work() ... spin_lock() /* really rtlock */ ... __rt_mutex_slowlock_locked() task_blocks_on_rt_mutex() // enqueue current task as waiter *AGAIN* // *CONFUSION* Fix this by making rt_mutex do the sched_submit_work() early, before it enqueues itself as a waiter -- before it even knows *if* it will wait. [[ basically Thomas' patch but with different naming and a few asserts added ]] Originally-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230908162254.999499-5-bigeasy@linutronix.de --- include/linux/sched.h | 3 +++ include/linux/sched/rt.h | 4 ++++ kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++++---- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 77f01ac385f7..67623ffd4a8e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -911,6 +911,9 @@ struct task_struct { * ->sched_remote_wakeup gets used, so it can be in this word. 
*/ unsigned sched_remote_wakeup:1; +#ifdef CONFIG_RT_MUTEXES + unsigned sched_rt_mutex:1; +#endif /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 994c25640e15..b2b9e6eb9683 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) } #ifdef CONFIG_RT_MUTEXES +extern void rt_mutex_pre_schedule(void); +extern void rt_mutex_schedule(void); +extern void rt_mutex_post_schedule(void); + /* * Must hold either p->pi_lock or task_rq(p)->lock. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1ea7ba53aad2..58d0346d1bb3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6723,9 +6723,6 @@ static inline void sched_submit_work(struct task_struct *tsk) static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG); unsigned int task_flags; - if (task_is_running(tsk)) - return; - /* * Establish LD_WAIT_CONFIG context to ensure none of the code called * will use a blocking primitive -- which would lead to recursion. @@ -6783,7 +6780,12 @@ asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; - sched_submit_work(tsk); +#ifdef CONFIG_RT_MUTEXES + lockdep_assert(!tsk->sched_rt_mutex); +#endif + + if (!task_is_running(tsk)) + sched_submit_work(tsk); __schedule_loop(SM_NONE); sched_update_worker(tsk); } @@ -7044,6 +7046,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio) #ifdef CONFIG_RT_MUTEXES +/* + * Would be more useful with typeof()/auto_type but they don't mix with + * bit-fields. Since it's a local thing, use int. Keep the generic sounding + * name such that if someone were to implement this function we get to compare + * notes. + */ +#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; }) + +void rt_mutex_pre_schedule(void) +{ + lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1)); + sched_submit_work(current); +} + +void rt_mutex_schedule(void) +{ + lockdep_assert(current->sched_rt_mutex); + __schedule_loop(SM_NONE); +} + +void rt_mutex_post_schedule(void) +{ + sched_update_worker(current); + lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0)); +} + static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) { if (pi_task) From d14f9e930b9073de264c106bf04968286ef9b3a4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 8 Sep 2023 18:22:52 +0200 Subject: [PATCH 08/39] locking/rtmutex: Use rt_mutex specific scheduler helpers Have rt_mutex use the rt_mutex specific scheduler helpers to avoid recursion vs rtlock on the PI state. 
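The resulting shape of rt_mutex_slowlock(), abridged from the hunk below:

    rt_mutex_pre_schedule();        /* sched_submit_work() before any waiter is queued */

    raw_spin_lock_irqsave(&lock->wait_lock, flags);
    ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
    raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

    rt_mutex_post_schedule();       /* sched_update_worker() once the waiter is gone */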
[[ peterz: adapted to new names ]] Reported-by: Crystal Wood Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230908162254.999499-6-bigeasy@linutronix.de --- kernel/futex/pi.c | 11 +++++++++++ kernel/locking/rtmutex.c | 14 ++++++++++++-- kernel/locking/rwbase_rt.c | 6 ++++++ kernel/locking/rwsem.c | 8 +++++++- kernel/locking/spinlock_rt.c | 4 ++++ 5 files changed, 40 insertions(+), 3 deletions(-) diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index ce2889f12375..f8e65b27d9d6 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include #include "futex.h" @@ -1002,6 +1003,12 @@ retry_private: goto no_block; } + /* + * Must be done before we enqueue the waiter, here is unfortunately + * under the hb lock, but that *should* work because it does nothing. + */ + rt_mutex_pre_schedule(); + rt_mutex_init_waiter(&rt_waiter); /* @@ -1052,6 +1059,10 @@ cleanup: if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) ret = 0; + /* + * Waiter is unqueued. + */ + rt_mutex_post_schedule(); no_block: /* * Fixup the pi_state owner and possibly acquire the lock if we diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index bcec0533a0cc..a3fe05dfd0d8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1632,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, raw_spin_unlock_irq(&lock->wait_lock); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) - schedule(); + rt_mutex_schedule(); raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1661,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, WARN(1, "rtmutex deadlock detected\n"); while (1) { set_current_state(TASK_INTERRUPTIBLE); - schedule(); + rt_mutex_schedule(); } } @@ -1756,6 +1756,15 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, unsigned long flags; int ret; + /* + * Do all pre-schedule work here, before we queue a waiter and invoke + * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would + * otherwise recurse back into task_blocks_on_rt_mutex() through + * rtlock_slowlock() and will then enqueue a second waiter for this + * same task and things get really confusing real fast. 
+ */ + rt_mutex_pre_schedule(); + /* * Technically we could use raw_spin_[un]lock_irq() here, but this can * be called in early boot if the cmpxchg() fast path is disabled @@ -1767,6 +1776,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + rt_mutex_post_schedule(); return ret; } diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 25ec0239477c..c7258cb32d91 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, struct rt_mutex_base *rtm = &rwb->rtmutex; int ret; + rwbase_pre_schedule(); raw_spin_lock_irq(&rtm->wait_lock); /* @@ -125,6 +126,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, rwbase_rtmutex_unlock(rtm); trace_contention_end(rwb, ret); + rwbase_post_schedule(); return ret; } @@ -237,6 +239,8 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, /* Force readers into slow path */ atomic_sub(READER_BIAS, &rwb->readers); + rwbase_pre_schedule(); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); if (__rwbase_write_trylock(rwb)) goto out_unlock; @@ -248,6 +252,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, if (rwbase_signal_pending_state(state, current)) { rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); + rwbase_post_schedule(); trace_contention_end(rwb, -EINTR); return -EINTR; } @@ -266,6 +271,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_post_schedule(); return 0; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 9eabd585ce7a..2340b6d90ec6 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1427,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #define rwbase_signal_pending_state(state, current) \ signal_pending_state(state, current) +#define rwbase_pre_schedule() \ + rt_mutex_pre_schedule() + #define rwbase_schedule() \ - schedule() + rt_mutex_schedule() + +#define rwbase_post_schedule() \ + rt_mutex_post_schedule() #include "rwbase_rt.c" diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 48a19ed8486d..842037b2ba54 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -184,9 +184,13 @@ static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm) #define rwbase_signal_pending_state(state, current) (0) +#define rwbase_pre_schedule() + #define rwbase_schedule() \ schedule_rtlock() +#define rwbase_post_schedule() + #include "rwbase_rt.c" /* * The common functions which get wrapped into the rwlock API. From 45f67f30a22f264bc7a0a61255c2ee1a838e9403 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 8 Sep 2023 18:22:53 +0200 Subject: [PATCH 09/39] locking/rtmutex: Add a lockdep assert to catch potential nested blocking There used to be a BUG_ON(current->pi_blocked_on) in the lock acquisition functions, but that vanished in one of the rtmutex overhauls. Bring it back in form of a lockdep assert to catch code paths which take rtmutex based locks with current::pi_blocked_on != NULL. 
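The problematic pattern this catches looks roughly like:

    rt_mutex_lock(&A)
      task_blocks_on_rt_mutex()     // current->pi_blocked_on = &waiter_A
      ...
        spin_lock(&B)               // PREEMPT_RT: rtlock slowpath would enqueue a
                                    // second waiter and scramble pi_blocked_on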
Reported-by: Crystal Wood Signed-off-by: Thomas Gleixner Signed-off-by: "Peter Zijlstra (Intel)" Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230908162254.999499-7-bigeasy@linutronix.de --- kernel/locking/rtmutex.c | 2 ++ kernel/locking/rwbase_rt.c | 2 ++ kernel/locking/spinlock_rt.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a3fe05dfd0d8..4a10e8c16fd2 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1784,6 +1784,8 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, unsigned int state) { + lockdep_assert(!current->pi_blocked_on); + if (likely(rt_mutex_try_acquire(lock))) return 0; diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index c7258cb32d91..34a59569db6b 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -133,6 +133,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb, unsigned int state) { + lockdep_assert(!current->pi_blocked_on); + if (rwbase_read_trylock(rwb)) return 0; diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 842037b2ba54..38e292454fcc 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -37,6 +37,8 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) { + lockdep_assert(!current->pi_blocked_on); + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) rtlock_slowlock(rtm); } From fbeb558b0dd0d6348e0872bbbbe96e30c65867b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Sep 2023 17:19:44 +0200 Subject: [PATCH 10/39] futex/pi: Fix recursive rt_mutex waiter state Some new assertions pointed out that the existing code has nested rt_mutex wait state in the futex code. Specifically, the futex_lock_pi() cancel case uses spin_lock() while there still is a rt_waiter enqueued for this task, resulting in a state where there are two waiters for the same task (and task_struct::pi_blocked_on gets scrambled). The reason to take hb->lock at this point is to avoid the wake_futex_pi() EAGAIN case. This happens when futex_top_waiter() and rt_mutex_top_waiter() state becomes inconsistent. The current rules are such that this inconsistency will not be observed. Notably the case that needs to be avoided is where futex_lock_pi() and futex_unlock_pi() interleave such that unlock will fail to observe a new waiter. *However* the case at hand is where a waiter is leaving, in this case the race means a waiter that is going away is not observed -- which is harmless, provided this race is explicitly handled. This is a somewhat dangerous proposition because the converse race is not observing a new waiter, which must absolutely not happen. But since the race is valid this cannot be asserted. 
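Concretely, the cleanup path in futex_lock_pi() now dequeues the rt_waiter
before taking hb->lock (abridged from the hunk below):

    if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
            ret = 0;

    /*
     * Only now, with the rt_waiter dequeued, is it safe to take hb->lock,
     * which on PREEMPT_RT is an rtlock and could enqueue its own rt_waiter.
     */
    spin_lock(q.lock_ptr);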
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Link: https://lkml.kernel.org/r/20230915151943.GD6743@noisy.programming.kicks-ass.net --- kernel/futex/pi.c | 76 ++++++++++++++++++++++++++---------------- kernel/futex/requeue.c | 6 ++-- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index f8e65b27d9d6..d636a1bbd7d0 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -611,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, /* * Caller must hold a reference on @pi_state. */ -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) +static int wake_futex_pi(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, + struct rt_mutex_waiter *top_waiter) { - struct rt_mutex_waiter *top_waiter; struct task_struct *new_owner; bool postunlock = false; DEFINE_RT_WAKE_Q(wqh); u32 curval, newval; int ret = 0; - top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); - if (WARN_ON_ONCE(!top_waiter)) { - /* - * As per the comment in futex_unlock_pi() this should not happen. - * - * When this happens, give up our locks and try again, giving - * the futex_lock_pi() instance time to complete, either by - * waiting on the rtmutex or removing itself from the futex - * queue. - */ - ret = -EAGAIN; - goto out_unlock; - } - new_owner = top_waiter->task; /* @@ -1046,19 +1033,33 @@ retry_private: ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); cleanup: - spin_lock(q.lock_ptr); /* * If we failed to acquire the lock (deadlock/signal/timeout), we must - * first acquire the hb->lock before removing the lock from the - * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait - * lists consistent. + * must unwind the above, however we canont lock hb->lock because + * rt_mutex already has a waiter enqueued and hb->lock can itself try + * and enqueue an rt_waiter through rtlock. * - * In particular; it is important that futex_unlock_pi() can not - * observe this inconsistency. + * Doing the cleanup without holding hb->lock can cause inconsistent + * state between hb and pi_state, but only in the direction of not + * seeing a waiter that is leaving. + * + * See futex_unlock_pi(), it deals with this inconsistency. + * + * There be dragons here, since we must deal with the inconsistency on + * the way out (here), it is impossible to detect/warn about the race + * the other way around (missing an incoming waiter). + * + * What could possibly go wrong... */ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) ret = 0; + /* + * Now that the rt_waiter has been dequeued, it is safe to use + * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up + * the + */ + spin_lock(q.lock_ptr); /* * Waiter is unqueued. */ @@ -1143,6 +1144,7 @@ retry: top_waiter = futex_top_waiter(hb, &key); if (top_waiter) { struct futex_pi_state *pi_state = top_waiter->pi_state; + struct rt_mutex_waiter *rt_waiter; ret = -EINVAL; if (!pi_state) @@ -1155,22 +1157,39 @@ retry: if (pi_state->owner != current) goto out_unlock; - get_pi_state(pi_state); /* * By taking wait_lock while still holding hb->lock, we ensure - * there is no point where we hold neither; and therefore - * wake_futex_p() must observe a state consistent with what we - * observed. + * there is no point where we hold neither; and thereby + * wake_futex_pi() must observe any new waiters. 
+ * + * Since the cleanup: case in futex_lock_pi() removes the + * rt_waiter without holding hb->lock, it is possible for + * wake_futex_pi() to not find a waiter while the above does, + * in this case the waiter is on the way out and it can be + * ignored. * * In particular; this forces __rt_mutex_start_proxy() to * complete such that we're guaranteed to observe the - * rt_waiter. Also see the WARN in wake_futex_pi(). + * rt_waiter. */ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Futex vs rt_mutex waiter state -- if there are no rt_mutex + * waiters even though futex thinks there are, then the waiter + * is leaving and the uncontended path is safe to take. + */ + rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); + if (!rt_waiter) { + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + goto do_uncontended; + } + + get_pi_state(pi_state); spin_unlock(&hb->lock); /* drops pi_state->pi_mutex.wait_lock */ - ret = wake_futex_pi(uaddr, uval, pi_state); + ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter); put_pi_state(pi_state); @@ -1198,6 +1217,7 @@ retry: return ret; } +do_uncontended: /* * We have no kernel internal state, i.e. no waiters in the * kernel. Waiters which are about to queue themselves are stuck diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index cba8b1a6a4cc..4c73e0b81acc 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -850,11 +850,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); - /* Current is not longer pi_blocked_on */ - spin_lock(q.lock_ptr); + /* + * See futex_unlock_pi()'s cleanup: comment. + */ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; + spin_lock(q.lock_ptr); debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we From c6f4a90022524d06f6d9de323b1757031dcf0c26 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 8 Sep 2023 11:43:39 -0400 Subject: [PATCH 11/39] asm-generic: ticket-lock: Optimize arch_spin_value_unlocked() The arch_spin_value_unlocked() of ticket-lock would cause the compiler to generate inefficient asm code in riscv architecture because of unnecessary memory access to the contended value. Before the patch: void lockref_get(struct lockref *lockref) { 78: fd010113 add sp,sp,-48 7c: 02813023 sd s0,32(sp) 80: 02113423 sd ra,40(sp) 84: 03010413 add s0,sp,48 0000000000000088 <.LBB296>: CMPXCHG_LOOP( 88: 00053783 ld a5,0(a0) After the patch: void lockref_get(struct lockref *lockref) { CMPXCHG_LOOP( 78: 00053783 ld a5,0(a0) After the patch, the lockref_get() could get in a fast path instead of the function's prologue. This is because ticket lock complex logic would limit compiler optimization for the spinlock fast path, and qspinlock won't. The caller of arch_spin_value_unlocked() could benefit from this change. Currently, the only caller is lockref. 
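For reference, a simplified sketch of that caller's fast path (the real code is
the CMPXCHG_LOOP macro in lib/lockref.c); the point is that
arch_spin_value_unlocked() only inspects the copied value, so no extra load of
the lock word is needed:

    struct lockref old;

    old.lock_count = READ_ONCE(lockref->lock_count);        /* one 64-bit load */
    while (arch_spin_value_unlocked(old.lock.rlock.raw_lock)) {
            struct lockref new = old;

            new.count++;
            if (try_cmpxchg64_relaxed(&lockref->lock_count,
                                      &old.lock_count, new.lock_count))
                    return;                         /* lockless fast path */
    }
    spin_lock(&lockref->lock);                      /* contended slow path */
    lockref->count++;
    spin_unlock(&lockref->lock);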
Signed-off-by: Guo Ren Signed-off-by: Guo Ren Signed-off-by: Ingo Molnar Acked-by: Waiman Long Acked-by: Will Deacon Link: https://lore.kernel.org/r/20230908154339.3250567-1-guoren@kernel.org --- include/asm-generic/spinlock.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/include/asm-generic/spinlock.h b/include/asm-generic/spinlock.h index fdfebcb050f4..90803a826ba0 100644 --- a/include/asm-generic/spinlock.h +++ b/include/asm-generic/spinlock.h @@ -68,11 +68,18 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) smp_store_release(ptr, (u16)val + 1); } +static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + u32 val = lock.counter; + + return ((val >> 16) == (val & 0xffff)); +} + static __always_inline int arch_spin_is_locked(arch_spinlock_t *lock) { - u32 val = atomic_read(lock); + arch_spinlock_t val = READ_ONCE(*lock); - return ((val >> 16) != (val & 0xffff)); + return !arch_spin_value_unlocked(val); } static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock) @@ -82,11 +89,6 @@ static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock) return (s16)((val >> 16) - (val & 0xffff)) > 1; } -static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) -{ - return !arch_spin_is_locked(&lock); -} - #include #endif /* __ASM_GENERIC_SPINLOCK_H */ From 4923954bbc4a760e0b2210e0cb5733726ac2e2e9 Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:06 +0200 Subject: [PATCH 12/39] futex: Clarify FUTEX2 flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sys_futex_waitv() is part of the futex2 series (the first and only so far) of syscalls and has a flags field per futex (as opposed to flags being encoded in the futex op). This new flags field has a new namespace, which unfortunately isn't super explicit. Notably it currently takes FUTEX_32 and FUTEX_PRIVATE_FLAG. Introduce the FUTEX2 namespace to clarify this Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: André Almeida Link: https://lore.kernel.org/r/20230921105247.507327749@noisy.programming.kicks-ass.net --- include/uapi/linux/futex.h | 16 +++++++++++++--- kernel/futex/syscalls.c | 7 +++---- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 71a5df8d2689..21d4eff41162 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -44,10 +44,20 @@ FUTEX_PRIVATE_FLAG) /* - * Flags to specify the bit length of the futex word for futex2 syscalls. - * Currently, only 32 is supported. + * Flags for futex2 syscalls. 
*/ -#define FUTEX_32 2 + /* 0x00 */ + /* 0x01 */ +#define FUTEX2_SIZE_U32 0x02 + /* 0x04 */ + /* 0x08 */ + /* 0x10 */ + /* 0x20 */ + /* 0x40 */ +#define FUTEX2_PRIVATE FUTEX_PRIVATE_FLAG + +/* do not use */ +#define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */ /* * Max numbers of elements in a futex_waitv array diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index a8074079b09e..ff696b0e2e5c 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -183,8 +183,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } -/* Mask of available flags for each futex in futex_waitv list */ -#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_U32 | FUTEX2_PRIVATE) /** * futex_parse_waitv - Parse a waitv array from userspace @@ -205,10 +204,10 @@ static int futex_parse_waitv(struct futex_vector *futexv, if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) return -EFAULT; - if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) + if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) return -EINVAL; - if (!(aux.flags & FUTEX_32)) + if (!(aux.flags & FUTEX2_SIZE_U32)) return -EINVAL; futexv[i].w.flags = aux.flags; From d6d08d24790e82c69a46ef78ae44fe1b1ed30775 Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:07 +0200 Subject: [PATCH 13/39] futex: Extend the FUTEX2 flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the definition for the missing but always intended extra sizes, and add a NUMA flag for the planned numa extention. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: André Almeida Link: https://lore.kernel.org/r/20230921105247.617057368@noisy.programming.kicks-ass.net --- include/uapi/linux/futex.h | 21 ++++++++++++++++++--- kernel/futex/syscalls.c | 9 +++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h index 21d4eff41162..d2ee625ea189 100644 --- a/include/uapi/linux/futex.h +++ b/include/uapi/linux/futex.h @@ -45,17 +45,32 @@ /* * Flags for futex2 syscalls. 
+ * + * NOTE: these are not pure flags, they can also be seen as: + * + * union { + * u32 flags; + * struct { + * u32 size : 2, + * numa : 1, + * : 4, + * private : 1; + * }; + * }; */ - /* 0x00 */ - /* 0x01 */ +#define FUTEX2_SIZE_U8 0x00 +#define FUTEX2_SIZE_U16 0x01 #define FUTEX2_SIZE_U32 0x02 - /* 0x04 */ +#define FUTEX2_SIZE_U64 0x03 +#define FUTEX2_NUMA 0x04 /* 0x08 */ /* 0x10 */ /* 0x20 */ /* 0x40 */ #define FUTEX2_PRIVATE FUTEX_PRIVATE_FLAG +#define FUTEX2_SIZE_MASK 0x03 + /* do not use */ #define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */ diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index ff696b0e2e5c..953f0a49de3a 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -183,7 +183,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } -#define FUTEX2_VALID_MASK (FUTEX2_SIZE_U32 | FUTEX2_PRIVATE) +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) /** * futex_parse_waitv - Parse a waitv array from userspace @@ -207,7 +207,12 @@ static int futex_parse_waitv(struct futex_vector *futexv, if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) return -EINVAL; - if (!(aux.flags & FUTEX2_SIZE_U32)) + if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { + if ((aux.flags & FUTEX2_SIZE_MASK) == FUTEX2_SIZE_U64) + return -EINVAL; + } + + if ((aux.flags & FUTEX2_SIZE_MASK) != FUTEX2_SIZE_U32) return -EINVAL; futexv[i].w.flags = aux.flags; From 5694289ce183bc3336407a78c8c722a0b9208f9b Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:08 +0200 Subject: [PATCH 14/39] futex: Flag conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Futex has 3 sets of flags: - legacy futex op bits - futex2 flags - internal flags Add a few helpers to convert from the API flags into the internal flags. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: André Almeida Link: https://lore.kernel.org/r/20230921105247.722140574@noisy.programming.kicks-ass.net --- kernel/futex/futex.h | 63 +++++++++++++++++++++++++++++++++++++++-- kernel/futex/syscalls.c | 24 ++++++---------- kernel/futex/waitwake.c | 4 +-- 3 files changed, 71 insertions(+), 20 deletions(-) diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index b5379c0e6d6d..68fc052dc09b 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef CONFIG_PREEMPT_RT #include @@ -16,8 +17,15 @@ * Futex flags used to encode options to functions and preserve them across * restarts. */ +#define FLAGS_SIZE_8 0x00 +#define FLAGS_SIZE_16 0x01 +#define FLAGS_SIZE_32 0x02 +#define FLAGS_SIZE_64 0x03 + +#define FLAGS_SIZE_MASK 0x03 + #ifdef CONFIG_MMU -# define FLAGS_SHARED 0x01 +# define FLAGS_SHARED 0x10 #else /* * NOMMU does not have per process address space. 
Let the compiler optimize @@ -25,8 +33,57 @@ */ # define FLAGS_SHARED 0x00 #endif -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 +#define FLAGS_CLOCKRT 0x20 +#define FLAGS_HAS_TIMEOUT 0x40 +#define FLAGS_NUMA 0x80 + +/* FUTEX_ to FLAGS_ */ +static inline unsigned int futex_to_flags(unsigned int op) +{ + unsigned int flags = FLAGS_SIZE_32; + + if (!(op & FUTEX_PRIVATE_FLAG)) + flags |= FLAGS_SHARED; + + if (op & FUTEX_CLOCK_REALTIME) + flags |= FLAGS_CLOCKRT; + + return flags; +} + +/* FUTEX2_ to FLAGS_ */ +static inline unsigned int futex2_to_flags(unsigned int flags2) +{ + unsigned int flags = flags2 & FUTEX2_SIZE_MASK; + + if (!(flags2 & FUTEX2_PRIVATE)) + flags |= FLAGS_SHARED; + + if (flags2 & FUTEX2_NUMA) + flags |= FLAGS_NUMA; + + return flags; +} + +static inline unsigned int futex_size(unsigned int flags) +{ + return 1 << (flags & FLAGS_SIZE_MASK); +} + +static inline bool futex_flags_valid(unsigned int flags) +{ + /* Only 64bit futexes for 64bit code */ + if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { + if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64) + return false; + } + + /* Only 32bit futexes are implemented -- for now */ + if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) + return false; + + return true; +} #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 953f0a49de3a..948ac247c1c6 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include #include #include @@ -85,15 +84,12 @@ err_unlock: long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { + unsigned int flags = futex_to_flags(op); int cmd = op & FUTEX_CMD_MASK; - unsigned int flags = 0; - if (!(op & FUTEX_PRIVATE_FLAG)) - flags |= FLAGS_SHARED; - - if (op & FUTEX_CLOCK_REALTIME) { - flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && + if (flags & FLAGS_CLOCKRT) { + if (cmd != FUTEX_WAIT_BITSET && + cmd != FUTEX_WAIT_REQUEUE_PI && cmd != FUTEX_LOCK_PI2) return -ENOSYS; } @@ -201,21 +197,19 @@ static int futex_parse_waitv(struct futex_vector *futexv, unsigned int i; for (i = 0; i < nr_futexes; i++) { + unsigned int flags; + if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) return -EFAULT; if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) return -EINVAL; - if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { - if ((aux.flags & FUTEX2_SIZE_MASK) == FUTEX2_SIZE_U64) - return -EINVAL; - } - - if ((aux.flags & FUTEX2_SIZE_MASK) != FUTEX2_SIZE_U32) + flags = futex2_to_flags(aux.flags); + if (!futex_flags_valid(flags)) return -EINVAL; - futexv[i].w.flags = aux.flags; + futexv[i].w.flags = flags; futexv[i].w.val = aux.val; futexv[i].w.uaddr = aux.uaddr; futexv[i].q = futex_q_init; diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index ba01b9408203..fa9757766103 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -419,11 +419,11 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo */ retry: for (i = 0; i < count; i++) { - if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) + if (!(vs[i].w.flags & FLAGS_SHARED) && retry) continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), - !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), + vs[i].w.flags & FLAGS_SHARED, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) From 698eb826383616ce0e817d2384da6413d1439fb6 Mon Sep 17 00:00:00 2001 From: 
"peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:09 +0200 Subject: [PATCH 15/39] futex: Validate futex value against futex size Ensure the futex value fits in the given futex size. Since this adds a constraint to an existing syscall, it might possibly change behaviour. Currently the value would be truncated to a u32 and any high bits would get silently lost. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230921105247.828934099@noisy.programming.kicks-ass.net --- kernel/futex/futex.h | 10 ++++++++++ kernel/futex/syscalls.c | 3 +++ 2 files changed, 13 insertions(+) diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 68fc052dc09b..a3f1fceafcbe 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -85,6 +85,16 @@ static inline bool futex_flags_valid(unsigned int flags) return true; } +static inline bool futex_validate_input(unsigned int flags, u64 val) +{ + int bits = 8 * futex_size(flags); + + if (bits < 64 && (val >> bits)) + return false; + + return true; +} + #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); #else diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 948ac247c1c6..2339f9ccee7f 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -209,6 +209,9 @@ static int futex_parse_waitv(struct futex_vector *futexv, if (!futex_flags_valid(flags)) return -EINVAL; + if (!futex_validate_input(flags, aux.val)) + return -EINVAL; + futexv[i].w.flags = flags; futexv[i].w.val = aux.val; futexv[i].w.uaddr = aux.uaddr; From 9f6c532f59b20580acf8ede9409c9b8dce6e74e1 Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:10 +0200 Subject: [PATCH 16/39] futex: Add sys_futex_wake() To complement sys_futex_waitv() add sys_futex_wake(). This syscall implements what was previously known as FUTEX_WAKE_BITSET except it uses 'unsigned long' for the bitmask and takes FUTEX2 flags. The 'unsigned long' allows FUTEX2_SIZE_U64 on 64bit platforms. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20230921105247.936205525@noisy.programming.kicks-ass.net --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/syscalls.h | 3 +++ include/uapi/asm-generic/unistd.h | 4 ++- kernel/futex/syscalls.c | 30 +++++++++++++++++++++ kernel/sys_ni.c | 1 + 22 files changed, 56 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ad37569d0507..3b86519d68e4 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -492,3 +492,4 @@ 560 common set_mempolicy_home_node sys_ni_syscall 561 common cachestat sys_cachestat 562 common fchmodat2 sys_fchmodat2 +563 common futex_wake sys_futex_wake diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index c572d6c3dee0..714abeb1e6fa 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -466,3 +466,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index bd77253b62e0..63a8a9c4abc1 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 453 +#define __NR_compat_syscalls 455 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 78b68311ec81..68974683737b 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -911,6 +911,8 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) __SYSCALL(__NR_cachestat, sys_cachestat) #define __NR_fchmodat2 452 __SYSCALL(__NR_fchmodat2, sys_fchmodat2) +#define __NR_futex_wake 454 +__SYSCALL(__NR_futex_wake, sys_futex_wake) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 83d8609aec03..cd50247508e6 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -373,3 +373,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 259ceb125367..21eb35c693e1 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ 
b/arch/m68k/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a3798c2637fd..3a4e8513a8e1 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -458,3 +458,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 152034b8e0a0..6883ea3b830d 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -391,3 +391,4 @@ 450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node 451 n32 cachestat sys_cachestat 452 n32 fchmodat2 sys_fchmodat2 +454 n32 futex_wake sys_futex_wake diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index cb5e757f6621..48bc0fb4e3dc 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -367,3 +367,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 n64 cachestat sys_cachestat 452 n64 fchmodat2 sys_fchmodat2 +454 n64 futex_wake sys_futex_wake diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 1a646813afdc..a92625f5bad8 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -440,3 +440,4 @@ 450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node 451 o32 cachestat sys_cachestat 452 o32 fchmodat2 sys_fchmodat2 +454 o32 futex_wake sys_futex_wake diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index e97c175b56f9..57faa9786ffe 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -451,3 +451,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 20e50586e8a2..e6c6ed6b30ee 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -539,3 +539,4 @@ 450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 0122cc156952..754720154dc1 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -455,3 +455,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake sys_futex_wake diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index e90d585c4d3e..902a997e7ec6 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -455,3 +455,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat 
sys_cachestat 452 common fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 4ed06c71c43f..8a1f887c8be6 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -498,3 +498,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 2d0b1bd866ea..9e81323979b0 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -457,3 +457,4 @@ 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node 451 i386 cachestat sys_cachestat 452 i386 fchmodat2 sys_fchmodat2 +454 i386 futex_wake sys_futex_wake diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 1d6eee30eceb..d10a6003a7c9 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -375,6 +375,7 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 453 64 map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index fc1a4f3c81d9..4e511bfd4b8f 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -423,3 +423,4 @@ 450 common set_mempolicy_home_node sys_set_mempolicy_home_node 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 +454 common futex_wake sys_futex_wake diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 22bc6bc147f8..e174ed86da1d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -549,6 +549,9 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes, unsigned int flags, struct __kernel_timespec __user *timeout, clockid_t clockid); + +asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags); + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index abe087c53b4b..f5454e6f4c6f 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -822,9 +822,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat) #define __NR_fchmodat2 452 __SYSCALL(__NR_fchmodat2, sys_fchmodat2) +#define __NR_futex_wake 454 +__SYSCALL(__NR_futex_wake, sys_futex_wake) #undef __NR_syscalls -#define __NR_syscalls 453 +#define __NR_syscalls 455 /* * 32 bit systems traditionally used different diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 2339f9ccee7f..7049a52ef68e 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -306,6 +306,36 @@ destroy_timer: return ret; } +/* + * sys_futex_wake - Wake a number of futexes + * @uaddr: Address of the futex(es) to wake + * @mask: bitmask + * @nr: Number of the futexes to wake + * @flags: FUTEX2 flags + * + * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the + * futex2 family of calls. 
+ */ + +SYSCALL_DEFINE4(futex_wake, + void __user *, uaddr, + unsigned long, mask, + int, nr, + unsigned int, flags) +{ + if (flags & ~FUTEX2_VALID_MASK) + return -EINVAL; + + flags = futex2_to_flags(flags); + if (!futex_flags_valid(flags)) + return -EINVAL; + + if (!futex_validate_input(flags, mask)) + return -EINVAL; + + return futex_wake(uaddr, flags, nr, mask); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e137c1385c56..983c0583c627 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); COND_SYSCALL_COMPAT(get_robust_list); COND_SYSCALL(futex_waitv); +COND_SYSCALL(futex_wake); COND_SYSCALL(kexec_load); COND_SYSCALL_COMPAT(kexec_load); COND_SYSCALL(init_module); From 43adf844951084c266f172561f84c5f8120dd60b Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:11 +0200 Subject: [PATCH 17/39] futex: FLAGS_STRICT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current semantics for futex_wake() are a bit loose, specifically asking for 0 futexes to be woken actually gets you 1. Adding a !nr check to sys_futex_wake() makes that it would return 0 for unaligned futex words, because that check comes in the shared futex_wake() function. Adding the !nr check there, would affect the legacy sys_futex() semantics. Hence frob a flag :-( Suggested-by: André Almeida Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230921105248.048643656@noisy.programming.kicks-ass.net --- kernel/futex/futex.h | 21 +++++++++++---------- kernel/futex/syscalls.c | 2 +- kernel/futex/waitwake.c | 3 +++ 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index a3f1fceafcbe..0e7821a944a2 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -17,25 +17,26 @@ * Futex flags used to encode options to functions and preserve them across * restarts. */ -#define FLAGS_SIZE_8 0x00 -#define FLAGS_SIZE_16 0x01 -#define FLAGS_SIZE_32 0x02 -#define FLAGS_SIZE_64 0x03 +#define FLAGS_SIZE_8 0x0000 +#define FLAGS_SIZE_16 0x0001 +#define FLAGS_SIZE_32 0x0002 +#define FLAGS_SIZE_64 0x0003 -#define FLAGS_SIZE_MASK 0x03 +#define FLAGS_SIZE_MASK 0x0003 #ifdef CONFIG_MMU -# define FLAGS_SHARED 0x10 +# define FLAGS_SHARED 0x0010 #else /* * NOMMU does not have per process address space. Let the compiler optimize * code away. 
*/ -# define FLAGS_SHARED 0x00 +# define FLAGS_SHARED 0x0000 #endif -#define FLAGS_CLOCKRT 0x20 -#define FLAGS_HAS_TIMEOUT 0x40 -#define FLAGS_NUMA 0x80 +#define FLAGS_CLOCKRT 0x0020 +#define FLAGS_HAS_TIMEOUT 0x0040 +#define FLAGS_NUMA 0x0080 +#define FLAGS_STRICT 0x0100 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 7049a52ef68e..47398926765e 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -333,7 +333,7 @@ SYSCALL_DEFINE4(futex_wake, if (!futex_validate_input(flags, mask)) return -EINVAL; - return futex_wake(uaddr, flags, nr, mask); + return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); } #ifdef CONFIG_COMPAT diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index fa9757766103..ceb05b876597 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -155,6 +155,9 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (unlikely(ret != 0)) return ret; + if ((flags & FLAGS_STRICT) && !nr_wake) + return 0; + hb = futex_hash(&key); /* Make sure we really have tasks to wakeup */ From cb8c4312afca1b2dc64107e7e7cea81911055612 Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:12 +0200 Subject: [PATCH 18/39] futex: Add sys_futex_wait() To complement sys_futex_waitv()/wake(), add sys_futex_wait(). This syscall implements what was previously known as FUTEX_WAIT_BITSET except it uses 'unsigned long' for the value and bitmask arguments, takes timespec and clockid_t arguments for the absolute timeout and uses FUTEX2 flags. The 'unsigned long' allows FUTEX2_SIZE_U64 on 64bit platforms. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20230921105248.164324363@noisy.programming.kicks-ass.net --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 + arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/syscalls.h | 4 + include/uapi/asm-generic/unistd.h | 4 +- kernel/futex/futex.h | 3 + kernel/futex/syscalls.c | 120 +++++++++++++++----- kernel/futex/waitwake.c | 61 ++++++---- kernel/sys_ni.c | 1 + 24 files changed, 156 insertions(+), 57 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 3b86519d68e4..c49f12fd264e 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -493,3 +493,4 @@ 561 common cachestat sys_cachestat 562 common fchmodat2 sys_fchmodat2 563 common futex_wake sys_futex_wake +564 common futex_wait sys_futex_wait diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 714abeb1e6fa..a6cf56277327 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -467,3 +467,4 @@ 451 
common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 63a8a9c4abc1..f33190f17ebb 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 455 +#define __NR_compat_syscalls 456 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 68974683737b..6e7d37282ba1 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -913,6 +913,8 @@ __SYSCALL(__NR_cachestat, sys_cachestat) __SYSCALL(__NR_fchmodat2, sys_fchmodat2) #define __NR_futex_wake 454 __SYSCALL(__NR_futex_wake, sys_futex_wake) +#define __NR_futex_wait 455 +__SYSCALL(__NR_futex_wait, sys_futex_wait) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index cd50247508e6..4043f0c55170 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -374,3 +374,4 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 21eb35c693e1..24841674acc5 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -453,3 +453,4 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 3a4e8513a8e1..f03927ab0220 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -459,3 +459,4 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 6883ea3b830d..dbb5edfb667b 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -392,3 +392,4 @@ 451 n32 cachestat sys_cachestat 452 n32 fchmodat2 sys_fchmodat2 454 n32 futex_wake sys_futex_wake +455 n32 futex_wait sys_futex_wait diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 48bc0fb4e3dc..faff8dfd2983 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -368,3 +368,4 @@ 451 n64 cachestat sys_cachestat 452 n64 fchmodat2 sys_fchmodat2 454 n64 futex_wake sys_futex_wake +455 n64 futex_wait sys_futex_wait diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index a92625f5bad8..542f75605b3e 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -441,3 +441,4 @@ 451 o32 cachestat sys_cachestat 452 o32 fchmodat2 sys_fchmodat2 454 o32 futex_wake sys_futex_wake +455 o32 futex_wait sys_futex_wait diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 
57faa9786ffe..8e50e89551f7 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index e6c6ed6b30ee..ad33a9993a6a 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -540,3 +540,4 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 754720154dc1..418853fd2a6b 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -456,3 +456,4 @@ 451 common cachestat sys_cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait sys_futex_wait diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 902a997e7ec6..8ef9557d2779 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -456,3 +456,4 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 8a1f887c8be6..df59a9d5f109 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -499,3 +499,4 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9e81323979b0..0f6616822bd5 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -458,3 +458,4 @@ 451 i386 cachestat sys_cachestat 452 i386 fchmodat2 sys_fchmodat2 454 i386 futex_wake sys_futex_wake +455 i386 futex_wait sys_futex_wait diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index d10a6003a7c9..ddf6288823ad 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -376,6 +376,7 @@ 452 common fchmodat2 sys_fchmodat2 453 64 map_shadow_stack sys_map_shadow_stack 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 4e511bfd4b8f..ac278dbce2ee 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -424,3 +424,4 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e174ed86da1d..11f3fdd1ee03 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -552,6 +552,10 @@ asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags); +asmlinkage long sys_futex_wait(void __user *uaddr, unsigned 
long val, unsigned long mask, + unsigned int flags, struct __kernel_timespec __user *timespec, + clockid_t clockid); + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f5454e6f4c6f..f6553bd5d213 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -824,9 +824,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat) __SYSCALL(__NR_fchmodat2, sys_fchmodat2) #define __NR_futex_wake 454 __SYSCALL(__NR_futex_wake, sys_futex_wake) +#define __NR_futex_wait 455 +__SYSCALL(__NR_futex_wait, sys_futex_wait) #undef __NR_syscalls -#define __NR_syscalls 455 +#define __NR_syscalls 456 /* * 32 bit systems traditionally used different diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 0e7821a944a2..e74888a7d71d 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -332,6 +332,9 @@ extern int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi); +extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + struct hrtimer_sleeper *to, u32 bitset); + extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset); diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 47398926765e..e4c8ec713787 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -221,6 +221,46 @@ static int futex_parse_waitv(struct futex_vector *futexv, return 0; } +static int futex2_setup_timeout(struct __kernel_timespec __user *timeout, + clockid_t clockid, struct hrtimer_sleeper *to) +{ + int flag_clkid = 0, flag_init = 0; + struct timespec64 ts; + ktime_t time; + int ret; + + if (!timeout) + return 0; + + if (clockid == CLOCK_REALTIME) { + flag_clkid = FLAGS_CLOCKRT; + flag_init = FUTEX_CLOCK_REALTIME; + } + + if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) + return -EINVAL; + + if (get_timespec64(&ts, timeout)) + return -EFAULT; + + /* + * Since there's no opcode for futex_waitv, use + * FUTEX_WAIT_BITSET that uses absolute timeout as well + */ + ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); + if (ret) + return ret; + + futex_setup_timer(&time, to, flag_clkid, 0); + return 0; +} + +static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to) +{ + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); +} + /** * sys_futex_waitv - Wait on a list of futexes * @waiters: List of futexes to wait on @@ -250,8 +290,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, { struct hrtimer_sleeper to; struct futex_vector *futexv; - struct timespec64 ts; - ktime_t time; int ret; /* This syscall supports no flags for now */ @@ -261,30 +299,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) return -EINVAL; - if (timeout) { - int flag_clkid = 0, flag_init = 0; - - if (clockid == CLOCK_REALTIME) { - flag_clkid = FLAGS_CLOCKRT; - flag_init = FUTEX_CLOCK_REALTIME; - } - - if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) - return -EINVAL; - - if (get_timespec64(&ts, timeout)) - return -EFAULT; - - /* - * Since there's no opcode for futex_waitv, use - * FUTEX_WAIT_BITSET that uses absolute timeout as well - */ - ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, 
&ts, &time); - if (ret) - return ret; - - futex_setup_timer(&time, &to, flag_clkid, 0); - } + if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) + return ret; futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); if (!futexv) { @@ -299,10 +315,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, kfree(futexv); destroy_timer: - if (timeout) { - hrtimer_cancel(&to.timer); - destroy_hrtimer_on_stack(&to.timer); - } + if (timeout) + futex2_destroy_timeout(&to); return ret; } @@ -336,6 +350,52 @@ SYSCALL_DEFINE4(futex_wake, return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); } +/* + * sys_futex_wait - Wait on a futex + * @uaddr: Address of the futex to wait on + * @val: Value of @uaddr + * @mask: bitmask + * @flags: FUTEX2 flags + * @timeout: Optional absolute timeout + * @clockid: Clock to be used for the timeout, realtime or monotonic + * + * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the + * futex2 familiy of calls. + */ + +SYSCALL_DEFINE6(futex_wait, + void __user *, uaddr, + unsigned long, val, + unsigned long, mask, + unsigned int, flags, + struct __kernel_timespec __user *, timeout, + clockid_t, clockid) +{ + struct hrtimer_sleeper to; + int ret; + + if (flags & ~FUTEX2_VALID_MASK) + return -EINVAL; + + flags = futex2_to_flags(flags); + if (!futex_flags_valid(flags)) + return -EINVAL; + + if (!futex_validate_input(flags, val) || + !futex_validate_input(flags, mask)) + return -EINVAL; + + if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) + return ret; + + ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask); + + if (timeout) + futex2_destroy_timeout(&to); + + return ret; +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index ceb05b876597..b109a0810a2c 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -632,20 +632,18 @@ retry_private: return ret; } -int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) +int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + struct hrtimer_sleeper *to, u32 bitset) { - struct hrtimer_sleeper timeout, *to; - struct restart_block *restart; - struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; + struct futex_hash_bucket *hb; int ret; if (!bitset) return -EINVAL; + q.bitset = bitset; - to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q @@ -653,18 +651,17 @@ retry: */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out; + return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_wait_queue(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. 
*/ - ret = 0; if (!futex_unqueue(&q)) - goto out; - ret = -ETIMEDOUT; + return 0; + if (to && !to->task) - goto out; + return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the @@ -673,24 +670,38 @@ retry: if (!signal_pending(current)) goto retry; - ret = -ERESTARTSYS; - if (!abs_time) - goto out; + return -ERESTARTSYS; +} - restart = ¤t->restart_block; - restart->futex.uaddr = uaddr; - restart->futex.val = val; - restart->futex.time = *abs_time; - restart->futex.bitset = bitset; - restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; +int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) +{ + struct hrtimer_sleeper timeout, *to; + struct restart_block *restart; + int ret; - ret = set_restart_fn(restart, futex_wait_restart); + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); + ret = __futex_wait(uaddr, flags, val, to, bitset); + + /* No timeout, nothing to clean up. */ + if (!to) + return ret; + + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + + if (ret == -ERESTARTSYS) { + restart = ¤t->restart_block; + restart->futex.uaddr = uaddr; + restart->futex.val = val; + restart->futex.time = *abs_time; + restart->futex.bitset = bitset; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; + + return set_restart_fn(restart, futex_wait_restart); } + return ret; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 983c0583c627..13df391194e2 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -88,6 +88,7 @@ COND_SYSCALL(get_robust_list); COND_SYSCALL_COMPAT(get_robust_list); COND_SYSCALL(futex_waitv); COND_SYSCALL(futex_wake); +COND_SYSCALL(futex_wait); COND_SYSCALL(kexec_load); COND_SYSCALL_COMPAT(kexec_load); COND_SYSCALL(init_module); From 3b63a55f498b763aba0886b244df613587a73c46 Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:13 +0200 Subject: [PATCH 19/39] futex: Propagate flags into get_futex_key() Instead of only passing FLAGS_SHARED as a boolean, pass down flags as a whole. No functional change intended. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230921105248.282857501@noisy.programming.kicks-ass.net --- kernel/futex/core.c | 7 +++++-- kernel/futex/futex.h | 2 +- kernel/futex/pi.c | 4 ++-- kernel/futex/requeue.c | 6 +++--- kernel/futex/waitwake.c | 14 +++++++------- 5 files changed, 18 insertions(+), 15 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index d1d7b3c175a4..ade7c731972d 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -193,7 +193,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED + * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -217,7 +217,7 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. 
*/ -int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, +int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; @@ -226,6 +226,9 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, struct folio *folio; struct address_space *mapping; int err, ro = 0; + bool fshared; + + fshared = flags & FLAGS_SHARED; /* * The futex address must be "naturally" aligned. diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index e74888a7d71d..a8ea5ef52424 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -184,7 +184,7 @@ enum futex_access { FUTEX_WRITE }; -extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, +extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); extern struct hrtimer_sleeper * diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index d636a1bbd7d0..90e5197f4e56 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -933,7 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl to = futex_setup_timer(time, &timeout, flags, 0); retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); + ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; @@ -1129,7 +1129,7 @@ retry: if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); + ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE); if (ret) return ret; diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index 4c73e0b81acc..5bf69581a937 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -424,10 +424,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, } retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + ret = get_futex_key(uaddr2, flags, &key2, requeue_pi ? 
FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) return ret; @@ -789,7 +789,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ rt_mutex_init_waiter(&rt_waiter); - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index b109a0810a2c..37860f794bf7 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -145,13 +145,13 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; - int ret; DEFINE_WAKE_Q(wake_q); + int ret; if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); + ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; @@ -248,10 +248,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, DEFINE_WAKE_Q(wake_q); retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; @@ -426,7 +426,7 @@ retry: continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), - vs[i].w.flags & FLAGS_SHARED, + vs[i].w.flags, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) @@ -438,7 +438,7 @@ retry: for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; struct futex_q *q = &vs[i].q; - u32 val = (u32)vs[i].w.val; + u32 val = vs[i].w.val; hb = futex_q_lock(q); ret = futex_get_value_locked(&uval, uaddr); @@ -602,7 +602,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * while the syscall executes. */ retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); + ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; From 27b88f3519e72d71c8cead6b835a26c171109c9b Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:14 +0200 Subject: [PATCH 20/39] futex: Add flags2 argument to futex_requeue() In order to support mixed size requeue, add a second flags argument to the internal futex_requeue() function. No functional change intended. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20230921105248.396780136@noisy.programming.kicks-ass.net --- kernel/futex/futex.h | 5 +++-- kernel/futex/requeue.c | 12 +++++++----- kernel/futex/syscalls.c | 6 +++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index a8ea5ef52424..a06030a1a27b 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -328,8 +328,9 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2); -extern int futex_requeue(u32 __user *uaddr1, unsigned int flags, - u32 __user *uaddr2, int nr_wake, int nr_requeue, +extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1, + u32 __user *uaddr2, unsigned int flags2, + int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi); extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index 5bf69581a937..a0a79954f506 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -346,8 +346,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 * @uaddr1: source futex user address - * @flags: futex flags (FLAGS_SHARED, etc.) + * @flags1: futex flags (FLAGS_SHARED, etc.) * @uaddr2: target futex user address + * @flags2: futex flags (FLAGS_SHARED, etc.) * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) * @nr_requeue: number of waiters to requeue (0-INT_MAX) * @cmpval: @uaddr1 expected value (or %NULL) @@ -361,7 +362,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * - >=0 - on success, the number of tasks requeued or woken; * - <0 - on error */ -int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, +int futex_requeue(u32 __user *uaddr1, unsigned int flags1, + u32 __user *uaddr2, unsigned int flags2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -424,10 +426,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, } retry: - ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); + ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; - ret = get_futex_key(uaddr2, flags, &key2, + ret = get_futex_key(uaddr2, flags2, &key2, requeue_pi ? 
FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) return ret; @@ -459,7 +461,7 @@ retry_private: if (ret) return ret; - if (!(flags & FLAGS_SHARED)) + if (!(flags1 & FLAGS_SHARED)) goto retry_private; goto retry; diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index e4c8ec713787..dde9b74db9af 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -106,9 +106,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: - return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); + return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); + return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0); case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: @@ -125,7 +125,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, uaddr2); case FUTEX_CMP_REQUEUE_PI: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); + return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1); } return -ENOSYS; } From 0f4b5f972216782a4acb1ae00dcb55173847c2ff Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Thu, 21 Sep 2023 12:45:15 +0200 Subject: [PATCH 21/39] futex: Add sys_futex_requeue() Finish off the 'simple' futex2 syscall group by adding sys_futex_requeue(). Unlike sys_futex_{wait,wake}() its arguments are too numerous to fit into a regular syscall. As such, use struct futex_waitv to pass the 'source' and 'destination' futexes to the syscall. This syscall implements what was previously known as FUTEX_CMP_REQUEUE and uses {val, uaddr, flags} for source and {uaddr, flags} for destination. This design explicitly allows requeueing between different types of futex by having a different flags word per uaddr. 
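As an illustration only (not part of the patch): a user-space caller fills a two-element futex_waitv array, where entry 0 describes the source futex (its 'val' being the expected value, i.e. the old FUTEX_CMP_REQUEUE cmpval) and entry 1 describes the destination. There is no libc wrapper, so the sketch below invokes the raw syscall; the number 456 comes from the x86-64/asm-generic tables below, and the struct futex_waitv / FUTEX2_* definitions are assumed to come from sufficiently new uapi headers.

  /* Sketch: wake one waiter on futex_a and requeue the remaining ones to futex_b. */
  #define _GNU_SOURCE
  #include <limits.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <linux/futex.h>	/* struct futex_waitv, FUTEX2_* (new headers assumed) */

  #ifndef __NR_futex_requeue
  #define __NR_futex_requeue 456	/* x86-64 / asm-generic number from the tables below */
  #endif

  static uint32_t futex_a, futex_b;

  int main(void)
  {
  	struct futex_waitv src_dst[2] = {
  		/* [0] = source: 'val' is the expected value of the futex word (cmpval) */
  		{ .val = 0, .uaddr = (uintptr_t)&futex_a,
  		  .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE },
  		/* [1] = destination: carries its own flags word */
  		{ .val = 0, .uaddr = (uintptr_t)&futex_b,
  		  .flags = FUTEX2_SIZE_U32 | FUTEX2_PRIVATE },
  	};
  	long ret = syscall(__NR_futex_requeue, src_dst, 0 /* flags: must be 0 */,
  			   1 /* nr_wake */, INT_MAX /* nr_requeue */);

  	if (ret < 0)
  		perror("futex_requeue");
  	else
  		printf("woke/requeued %ld waiter(s)\n", ret);
  	return 0;
  }

With no waiters queued on futex_a the call simply returns 0; the per-uaddr flags word is what leaves room for requeueing between differently typed futexes later on.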
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20230921105248.511860556@noisy.programming.kicks-ass.net --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/syscalls.h | 3 ++ include/uapi/asm-generic/unistd.h | 4 ++- kernel/futex/syscalls.c | 38 +++++++++++++++++++++ kernel/sys_ni.c | 1 + 22 files changed, 64 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index c49f12fd264e..b1865f9bb31e 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -494,3 +494,4 @@ 562 common fchmodat2 sys_fchmodat2 563 common futex_wake sys_futex_wake 564 common futex_wait sys_futex_wait +565 common futex_requeue sys_futex_requeue diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index a6cf56277327..93d0d46cbb15 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -468,3 +468,4 @@ 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index f33190f17ebb..531effca5f1f 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 456 +#define __NR_compat_syscalls 457 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 6e7d37282ba1..c453291154fd 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -915,6 +915,8 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2) __SYSCALL(__NR_futex_wake, sys_futex_wake) #define __NR_futex_wait 455 __SYSCALL(__NR_futex_wait, sys_futex_wait) +#define __NR_futex_requeue 456 +__SYSCALL(__NR_futex_requeue, sys_futex_requeue) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 4043f0c55170..81375ea78288 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -375,3 +375,4 @@ 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 24841674acc5..f7f997a88bab 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -454,3 +454,4 @@ 452 common fchmodat2 
sys_fchmodat2 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index f03927ab0220..2967ec26b978 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -460,3 +460,4 @@ 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index dbb5edfb667b..383abb1713f4 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -393,3 +393,4 @@ 452 n32 fchmodat2 sys_fchmodat2 454 n32 futex_wake sys_futex_wake 455 n32 futex_wait sys_futex_wait +456 n32 futex_requeue sys_futex_requeue diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index faff8dfd2983..c9bd09ba905f 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -369,3 +369,4 @@ 452 n64 fchmodat2 sys_fchmodat2 454 n64 futex_wake sys_futex_wake 455 n64 futex_wait sys_futex_wait +456 n64 futex_requeue sys_futex_requeue diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 542f75605b3e..ba5ef6cea97a 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -442,3 +442,4 @@ 452 o32 fchmodat2 sys_fchmodat2 454 o32 futex_wake sys_futex_wake 455 o32 futex_wait sys_futex_wait +456 o32 futex_requeue sys_futex_requeue diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 8e50e89551f7..9f0f6df55361 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -453,3 +453,4 @@ 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index ad33a9993a6a..26fc41904266 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -541,3 +541,4 @@ 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 418853fd2a6b..31be90b241f7 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -457,3 +457,4 @@ 452 common fchmodat2 sys_fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue sys_futex_requeue diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 8ef9557d2779..4bc5d488ab17 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -457,3 +457,4 @@ 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index df59a9d5f109..8404c8e50394 100644 --- 
a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -500,3 +500,4 @@ 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 0f6616822bd5..31c48bc2c3d8 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -459,3 +459,4 @@ 452 i386 fchmodat2 sys_fchmodat2 454 i386 futex_wake sys_futex_wake 455 i386 futex_wait sys_futex_wait +456 i386 futex_requeue sys_futex_requeue diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ddf6288823ad..a577bb27c16d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -377,6 +377,7 @@ 453 64 map_shadow_stack sys_map_shadow_stack 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index ac278dbce2ee..dd71ecce8b86 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -425,3 +425,4 @@ 452 common fchmodat2 sys_fchmodat2 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 11f3fdd1ee03..0901af60d971 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -556,6 +556,9 @@ asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned l unsigned int flags, struct __kernel_timespec __user *timespec, clockid_t clockid); +asmlinkage long sys_futex_requeue(struct futex_waitv __user *waiters, + unsigned int flags, int nr_wake, int nr_requeue); + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f6553bd5d213..d9e9cd13e577 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -826,9 +826,11 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2) __SYSCALL(__NR_futex_wake, sys_futex_wake) #define __NR_futex_wait 455 __SYSCALL(__NR_futex_wait, sys_futex_wait) +#define __NR_futex_requeue 456 +__SYSCALL(__NR_futex_requeue, sys_futex_requeue) #undef __NR_syscalls -#define __NR_syscalls 456 +#define __NR_syscalls 457 /* * 32 bit systems traditionally used different diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index dde9b74db9af..8200d86d30e1 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -396,6 +396,44 @@ SYSCALL_DEFINE6(futex_wait, return ret; } +/* + * sys_futex_requeue - Requeue a waiter from one futex to another + * @waiters: array describing the source and destination futex + * @flags: unused + * @nr_wake: number of futexes to wake + * @nr_requeue: number of futexes to requeue + * + * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the + * futex2 family of calls. 
+ */ + +SYSCALL_DEFINE4(futex_requeue, + struct futex_waitv __user *, waiters, + unsigned int, flags, + int, nr_wake, + int, nr_requeue) +{ + struct futex_vector futexes[2]; + u32 cmpval; + int ret; + + if (flags) + return -EINVAL; + + if (!waiters) + return -EINVAL; + + ret = futex_parse_waitv(futexes, waiters, 2); + if (ret) + return ret; + + cmpval = futexes[0].w.val; + + return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, + u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags, + nr_wake, nr_requeue, &cmpval, 0); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 13df391194e2..9db51ea373b0 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -89,6 +89,7 @@ COND_SYSCALL_COMPAT(get_robust_list); COND_SYSCALL(futex_waitv); COND_SYSCALL(futex_wake); COND_SYSCALL(futex_wait); +COND_SYSCALL(futex_requeue); COND_SYSCALL(kexec_load); COND_SYSCALL_COMPAT(kexec_load); COND_SYSCALL(init_module); From 4812c54dc0498c4b757cbc7f41c1999b5a1c9f67 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Sep 2023 04:35:59 +0000 Subject: [PATCH 22/39] locking/ww_mutex/test: Use prng instead of rng to avoid hangs at bootup Booting w/ qemu without kvm, and with 64 cpus, I noticed we'd sometimes see hung task watchdog splats in get_random_u32_below() when using the test-ww_mutex stress test. While entropy exhaustion is no longer an issue, the RNG may be slower early in boot. The test-ww_mutex code will spawn off 128 threads (2x cpus) and each thread will call get_random_u32_below() a number of times to generate a random order of the 16 locks. This intense use takes time and, without kvm, qemu can be slow enough that we trip the hung task watchdogs. For this test, we don't need true randomness, just mixed-up orders for testing ww_mutex lock acquisitions, so change the logic to use the prng instead, which takes less time and avoids the watchdogs. Feedback would be appreciated!
Signed-off-by: John Stultz Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230922043616.19282-2-jstultz@google.com --- kernel/locking/test-ww_mutex.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 93cca6e69860..9bceba65858a 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include @@ -386,6 +386,19 @@ struct stress { int nlocks; }; +struct rnd_state rng; +DEFINE_SPINLOCK(rng_lock); + +static inline u32 prandom_u32_below(u32 ceil) +{ + u32 ret; + + spin_lock(&rng_lock); + ret = prandom_u32_state(&rng) % ceil; + spin_unlock(&rng_lock); + return ret; +} + static int *get_random_order(int count) { int *order; @@ -399,7 +412,7 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = get_random_u32_below(n + 1); + r = prandom_u32_below(n + 1); if (r != n) { tmp = order[n]; order[n] = order[r]; @@ -625,6 +638,8 @@ static int __init test_ww_mutex_init(void) printk(KERN_INFO "Beginning ww mutex selftests\n"); + prandom_seed_state(&rng, get_random_u64()); + wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); if (!wq) return -ENOMEM; From bccdd808902f8c677317cec47c306e42b93b849e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Sep 2023 04:36:00 +0000 Subject: [PATCH 23/39] locking/ww_mutex/test: Fix potential workqueue corruption In some cases running with the test-ww_mutex code, I was seeing odd behavior where sometimes it seemed flush_workqueue was returning before all the work threads were finished. Often this would cause strange crashes as the mutexes would be freed while they were being used. Looking at the code, there is a lifetime problem as the controlling thread that spawns the work allocates the "struct stress" structures that are passed to the workqueue threads. Then when the workqueue threads are finished, they free the stress struct that was passed to them. Unfortunately the workqueue work_struct node is in the stress struct. Which means the work_struct is freed before the work thread returns and while flush_workqueue is waiting. It seems like a better idea to have the controlling thread both allocate and free the stress structures, so that we can be sure we don't corrupt the workqueue by freeing the structure prematurely. So this patch reworks the test to do so, and with this change I no longer see the early flush_workqueue returns. 
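Stated as a rule: the memory that embeds a queued work_struct must stay allocated until flush_workqueue() returns, so the thread that queues the work should also be the one that frees it. A minimal sketch of that ownership pattern (illustrative only; struct job and run_jobs() are invented names, not taken from the test code):

  #include <linux/errno.h>
  #include <linux/printk.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>

  struct job {
  	struct work_struct work;
  	int id;
  };

  static void job_fn(struct work_struct *work)
  {
  	struct job *job = container_of(work, struct job, work);

  	pr_info("running job %d\n", job->id);
  	/* Do NOT free 'job' here; the queueing thread owns the memory. */
  }

  static int run_jobs(struct workqueue_struct *wq, int n)
  {
  	struct job *jobs = kcalloc(n, sizeof(*jobs), GFP_KERNEL);
  	int i;

  	if (!jobs)
  		return -ENOMEM;

  	for (i = 0; i < n; i++) {
  		jobs[i].id = i;
  		INIT_WORK(&jobs[i].work, job_fn);
  		queue_work(wq, &jobs[i].work);
  	}

  	flush_workqueue(wq);	/* all job_fn() invocations have finished */
  	kfree(jobs);		/* only now is it safe to free the array  */
  	return 0;
  }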
Signed-off-by: John Stultz Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230922043616.19282-3-jstultz@google.com --- kernel/locking/test-ww_mutex.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 9bceba65858a..358d66150426 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -479,7 +479,6 @@ retry: } while (!time_after(jiffies, stress->timeout)); kfree(order); - kfree(stress); } struct reorder_lock { @@ -544,7 +543,6 @@ out: list_for_each_entry_safe(ll, ln, &locks, link) kfree(ll); kfree(order); - kfree(stress); } static void stress_one_work(struct work_struct *work) @@ -565,8 +563,6 @@ static void stress_one_work(struct work_struct *work) break; } } while (!time_after(jiffies, stress->timeout)); - - kfree(stress); } #define STRESS_INORDER BIT(0) @@ -577,15 +573,24 @@ static void stress_one_work(struct work_struct *work) static int stress(int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; - int n; + struct stress *stress_array; + int n, count; locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); if (!locks) return -ENOMEM; + stress_array = kmalloc_array(nthreads, sizeof(*stress_array), + GFP_KERNEL); + if (!stress_array) { + kfree(locks); + return -ENOMEM; + } + for (n = 0; n < nlocks; n++) ww_mutex_init(&locks[n], &ww_class); + count = 0; for (n = 0; nthreads; n++) { struct stress *stress; void (*fn)(struct work_struct *work); @@ -609,9 +614,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) if (!fn) continue; - stress = kmalloc(sizeof(*stress), GFP_KERNEL); - if (!stress) - break; + stress = &stress_array[count++]; INIT_WORK(&stress->work, fn); stress->locks = locks; @@ -626,6 +629,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) for (n = 0; n < nlocks; n++) ww_mutex_destroy(&locks[n]); + kfree(stress_array); kfree(locks); return 0; From cfa92b6d52071aaa8f27d21affdcb14e7448fbc1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 22 Sep 2023 04:36:01 +0000 Subject: [PATCH 24/39] locking/ww_mutex/test: Make sure we bail out instead of livelock I've seen what appears to be livelocks in the stress_inorder_work() function, and looking at the code it is clear we can have a case where we continually retry acquiring the locks and never check to see if we have passed the specified timeout. This patch reworks that function so we always check the timeout before iterating through the loop again. 
I believe others may have hit this previously here: https://lore.kernel.org/lkml/895ef450-4fb3-5d29-a6ad-790657106a5a@intel.com/ Reported-by: Li Zhijian Signed-off-by: John Stultz Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230922043616.19282-4-jstultz@google.com --- kernel/locking/test-ww_mutex.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 358d66150426..78719e1ef1b1 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -465,17 +465,18 @@ retry: ww_mutex_unlock(&locks[order[n]]); if (err == -EDEADLK) { - ww_mutex_lock_slow(&locks[order[contended]], &ctx); - goto retry; + if (!time_after(jiffies, stress->timeout)) { + ww_mutex_lock_slow(&locks[order[contended]], &ctx); + goto retry; + } } + ww_acquire_fini(&ctx); if (err) { pr_err_once("stress (%s) failed with %d\n", __func__, err); break; } - - ww_acquire_fini(&ctx); } while (!time_after(jiffies, stress->timeout)); kfree(order); From 8788c6c2feb3600ba1a2f84ac5d258af4a284cea Mon Sep 17 00:00:00 2001 From: Atul Kumar Pant Date: Mon, 7 Aug 2023 17:48:34 +0530 Subject: [PATCH 25/39] locking/debug: Fix debugfs API return value checks to use IS_ERR() Update the checking of return values from debugfs_create_file() and debugfs_create_dir() to use IS_ERR(). Signed-off-by: Atul Kumar Pant Signed-off-by: Ingo Molnar Acked-by: Waiman Long Link: https://lore.kernel.org/r/20230807121834.7438-1-atulpant.linux@gmail.com --- kernel/locking/lock_events.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c index fa2c2f951c6b..e68d82099558 100644 --- a/kernel/locking/lock_events.c +++ b/kernel/locking/lock_events.c @@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void) struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); int i; - if (!d_counts) + if (IS_ERR(d_counts)) goto out; /* @@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void) for (i = 0; i < lockevent_num; i++) { if (skip_lockevent(lockevent_names[i])) continue; - if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, - (void *)(long)i, &fops_lockevent)) + if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent))) goto fail_undo; } - if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, d_counts, (void *)(long)LOCKEVENT_reset_cnts, - &fops_lockevent)) + &fops_lockevent))) goto fail_undo; return 0; From 5e0eb67974e88dbaded765278a3ffe7af33e3b22 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 31 Jul 2023 10:42:23 +0200 Subject: [PATCH 26/39] locking/local, arch: Rewrite local_add_unless() as a static inline function Rewrite local_add_unless() as a static inline function with boolean return value, similar to the arch_atomic_add_unless() arch fallbacks. The function is currently unused. 
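Although the function currently has no users, a hypothetical call site shows how the boolean return reads (sketch only; the helper name is invented):

  #include <linux/types.h>
  #include <asm/local.h>

  /* Bump a per-CPU event counter unless it already sits at the cap. */
  static bool count_event(local_t *cnt, long cap)
  {
  	return local_add_unless(cnt, 1, cap);	/* true if the increment was done */
  }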
Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230731084458.28096-1-ubizjak@gmail.com --- arch/alpha/include/asm/local.h | 33 +++++++++++++++--------------- arch/loongarch/include/asm/local.h | 27 ++++++++++++++---------- arch/mips/include/asm/local.h | 27 ++++++++++++++---------- arch/powerpc/include/asm/local.h | 12 +++++------ arch/x86/include/asm/local.h | 33 +++++++++++++++--------------- 5 files changed, 70 insertions(+), 62 deletions(-) diff --git a/arch/alpha/include/asm/local.h b/arch/alpha/include/asm/local.h index 0fcaad642cc3..88eb398947a5 100644 --- a/arch/alpha/include/asm/local.h +++ b/arch/alpha/include/asm/local.h @@ -65,28 +65,27 @@ static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new) #define local_xchg(l, n) (xchg_local(&((l)->a.counter), (n))) /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. + * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. */ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read(l); \ - for (;;) { \ - if (unlikely(c == (u))) \ - break; \ - old = local_cmpxchg((l), c, c + (a)); \ - if (likely(old == c)) \ - break; \ - c = old; \ - } \ - c != (u); \ -}) +static __inline__ bool +local_add_unless(local_t *l, long a, long u) +{ + long c = local_read(l); + + do { + if (unlikely(c == u)) + return false; + } while (!local_try_cmpxchg(l, &c, c + a)); + + return true; +} + #define local_inc_not_zero(l) local_add_unless((l), 1, 0) #define local_add_negative(a, l) (local_add_return((a), (l)) < 0) diff --git a/arch/loongarch/include/asm/local.h b/arch/loongarch/include/asm/local.h index c49675852bdc..f53ea653af76 100644 --- a/arch/loongarch/include/asm/local.h +++ b/arch/loongarch/include/asm/local.h @@ -70,22 +70,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new) #define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n))) /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. + * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. 
*/ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read(l); \ - while (c != (u) && (old = local_cmpxchg((l), c, c + (a))) != c) \ - c = old; \ - c != (u); \ -}) +static inline bool +local_add_unless(local_t *l, long a, long u) +{ + long c = local_read(l); + + do { + if (unlikely(c == u)) + return false; + } while (!local_try_cmpxchg(l, &c, c + a)); + + return true; +} + #define local_inc_not_zero(l) local_add_unless((l), 1, 0) #define local_dec_return(l) local_sub_return(1, (l)) diff --git a/arch/mips/include/asm/local.h b/arch/mips/include/asm/local.h index e6ae3df0349d..86fc24022242 100644 --- a/arch/mips/include/asm/local.h +++ b/arch/mips/include/asm/local.h @@ -108,22 +108,27 @@ static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new) #define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n))) /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. + * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. */ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read(l); \ - while (c != (u) && (old = local_cmpxchg((l), c, c + (a))) != c) \ - c = old; \ - c != (u); \ -}) +static __inline__ bool +local_add_unless(local_t *l, long a, long u) +{ + long c = local_read(l); + + do { + if (unlikely(c == u)) + return false; + } while (!local_try_cmpxchg(l, &c, c + a)); + + return true; +} + #define local_inc_not_zero(l) local_add_unless((l), 1, 0) #define local_dec_return(l) local_sub_return(1, (l)) diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h index 45492fb5bf22..ec6ced6d7ced 100644 --- a/arch/powerpc/include/asm/local.h +++ b/arch/powerpc/include/asm/local.h @@ -115,23 +115,23 @@ static __inline__ long local_xchg(local_t *l, long n) } /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to v... * @u: ...unless v is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. + * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. */ -static __inline__ int local_add_unless(local_t *l, long a, long u) +static __inline__ bool local_add_unless(local_t *l, long a, long u) { unsigned long flags; - int ret = 0; + bool ret = false; powerpc_local_irq_pmu_save(flags); if (l->v != u) { l->v += a; - ret = 1; + ret = true; } powerpc_local_irq_pmu_restore(flags); diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 635132a12778..73dba8b94443 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -135,28 +135,27 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new) #define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. 
+ * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. */ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read((l)); \ - for (;;) { \ - if (unlikely(c == (u))) \ - break; \ - old = local_cmpxchg((l), c, c + (a)); \ - if (likely(old == c)) \ - break; \ - c = old; \ - } \ - c != (u); \ -}) +static __always_inline bool +local_add_unless(local_t *l, long a, long u) +{ + long c = local_read(l); + + do { + if (unlikely(c == u)) + return false; + } while (!local_try_cmpxchg(l, &c, c + a)); + + return true; +} + #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. From 01a99a750a4f414c221781de2e5570e0ceae04b5 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Wed, 26 Jul 2023 03:50:47 +0800 Subject: [PATCH 27/39] =?UTF-8?q?futex/requeue:=20Remove=20unnecessary=20?= =?UTF-8?q?=E2=80=98NULL=E2=80=99=20initialization=20from=20futex=5Fproxy?= =?UTF-8?q?=5Ftrylock=5Fatomic()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'top_waiter' is assigned unconditionally before first use, so it does not need an initialization. [ mingo: Created legible changelog. ] Signed-off-by: Li zeming Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230725195047.3106-1-zeming@nfschina.com --- kernel/futex/requeue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index a0a79954f506..16a3645bd786 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -269,7 +269,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, union futex_key *key2, struct futex_pi_state **ps, struct task_struct **exiting, int set_waiters) { - struct futex_q *top_waiter = NULL; + struct futex_q *top_waiter; u32 curval; int ret; From 0cff993e08a7578e2c1df93a95fc5059f447e7ae Mon Sep 17 00:00:00 2001 From: "pangzizhen001@208suo.com" Date: Thu, 20 Jul 2023 23:45:39 +0800 Subject: [PATCH 28/39] locking/seqlock: Fix typo in comment s/the the /the [ mingo: Cleaned up the changelog. ] Signed-off-by: Zizhen Pang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/70293ecd5bb7a1cd370fd4d95c35f936@208suo.com --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 987a59d977c5..ea7a58258af6 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -864,7 +864,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) } /* - * For all seqlock_t write side functions, use the the internal + * For all seqlock_t write side functions, use the internal * do_write_seqcount_begin() instead of generic write_seqcount_begin(). * This way, no redundant lockdep_assert_held() checks are added. */ From e01cc1e8c2ad73cebb980878ede5584e0f2688f7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 25 Sep 2023 16:50:23 +0200 Subject: [PATCH 29/39] locking/atomic: Add generic support for sync_try_cmpxchg() and its fallback Provide the generic sync_try_cmpxchg() function from the raw_ prefixed version, also adding explicit instrumentation. The patch amends existing scripts to generate sync_try_cmpxchg() locking primitive and its raw_sync_try_cmpxchg() fallback, while leaving existing macros from the try_cmpxchg() family unchanged. The target can define its own arch_sync_try_cmpxchg() to override the generic version of raw_sync_try_cmpxchg(). 
This allows the target to generate more optimal assembly than the generic version. Additionally, the patch renames two scripts to better reflect what they really do. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: Mark Rutland Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org --- include/linux/atomic/atomic-arch-fallback.h | 15 +++++++++- include/linux/atomic/atomic-instrumented.h | 10 ++++++- scripts/atomic/gen-atomic-fallback.sh | 33 +++++++++++---------- scripts/atomic/gen-atomic-instrumented.sh | 3 +- 4 files changed, 43 insertions(+), 18 deletions(-) diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index b83ef19da13d..5e95faa959c4 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -428,6 +428,19 @@ extern void raw_cmpxchg128_relaxed_not_implemented(void); #define raw_sync_cmpxchg arch_sync_cmpxchg +#ifdef arch_sync_try_cmpxchg +#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg +#else +#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \ +({ \ + typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ + ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \ + if (unlikely(___r != ___o)) \ + *___op = ___r; \ + likely(___r == ___o); \ +}) +#endif + /** * raw_atomic_read() - atomic load with relaxed ordering * @v: pointer to atomic_t @@ -4649,4 +4662,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v) } #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 2fdd6702823fa842f9cea57a002e6e4476ae780c +// eec048affea735b8464f58e6d96992101f8f85f1 diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index d401b406ef7c..54d7bbe0aeaa 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -4998,6 +4998,14 @@ atomic_long_dec_if_positive(atomic_long_t *v) raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) +#define sync_try_cmpxchg(ptr, ...) \ +({ \ + typeof(ptr) __ai_ptr = (ptr); \ + kcsan_mb(); \ + instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \ + raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \ +}) + #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 1568f875fef72097413caab8339120c065a39aa4 +// 2cc4bc990fef44d3836ec108f11b610f3f438184 diff --git a/scripts/atomic/gen-atomic-fallback.sh b/scripts/atomic/gen-atomic-fallback.sh index a45154cefa48..f80d69cfeb1f 100755 --- a/scripts/atomic/gen-atomic-fallback.sh +++ b/scripts/atomic/gen-atomic-fallback.sh @@ -223,14 +223,15 @@ gen_xchg_fallbacks() gen_try_cmpxchg_fallback() { + local prefix="$1"; shift local cmpxchg="$1"; shift; - local order="$1"; shift; + local suffix="$1"; shift; cat < Date: Mon, 25 Sep 2023 16:55:48 +0200 Subject: [PATCH 30/39] locking/atomic/x86: Introduce arch_sync_try_cmpxchg() Introduce the arch_sync_try_cmpxchg() macro to improve code using the sync_try_cmpxchg() locking primitive. The new definitions use existing __raw_try_cmpxchg() macros, but use their own "lock; " prefix.
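As a rough illustration (not taken from the tree), a caller loop built on the new primitive has the shape below; the flag-clearing helper is hypothetical:

  #include <linux/atomic.h>
  #include <linux/types.h>

  /*
   * Atomically clear @flag in @word and return the prior value.
   * sync_try_cmpxchg() updates 'old' with the observed value on
   * failure, so the loop needs no explicit reload or compare.
   */
  static u32 clear_flag(u32 *word, u32 flag)
  {
          u32 old = *word, new;

          do {
                  new = old & ~flag;
          } while (!sync_try_cmpxchg(word, &old, new));

          return old;
  }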
The new macros improve assembly of the cmpxchg loop in evtchn_fifo_unmask() from drivers/xen/events/events_fifo.c from: 57a: 85 c0 test %eax,%eax 57c: 78 52 js 5d0 <...> 57e: 89 c1 mov %eax,%ecx 580: 25 ff ff ff af and $0xafffffff,%eax 585: c7 04 24 00 00 00 00 movl $0x0,(%rsp) 58c: 81 e1 ff ff ff ef and $0xefffffff,%ecx 592: 89 4c 24 04 mov %ecx,0x4(%rsp) 596: 89 44 24 08 mov %eax,0x8(%rsp) 59a: 8b 74 24 08 mov 0x8(%rsp),%esi 59e: 8b 44 24 04 mov 0x4(%rsp),%eax 5a2: f0 0f b1 32 lock cmpxchg %esi,(%rdx) 5a6: 89 04 24 mov %eax,(%rsp) 5a9: 8b 04 24 mov (%rsp),%eax 5ac: 39 c1 cmp %eax,%ecx 5ae: 74 07 je 5b7 <...> 5b0: a9 00 00 00 40 test $0x40000000,%eax 5b5: 75 c3 jne 57a <...> <...> to: 578: a9 00 00 00 40 test $0x40000000,%eax 57d: 74 2b je 5aa <...> 57f: 85 c0 test %eax,%eax 581: 78 40 js 5c3 <...> 583: 89 c1 mov %eax,%ecx 585: 25 ff ff ff af and $0xafffffff,%eax 58a: 81 e1 ff ff ff ef and $0xefffffff,%ecx 590: 89 4c 24 04 mov %ecx,0x4(%rsp) 594: 89 44 24 08 mov %eax,0x8(%rsp) 598: 8b 4c 24 08 mov 0x8(%rsp),%ecx 59c: 8b 44 24 04 mov 0x4(%rsp),%eax 5a0: f0 0f b1 0a lock cmpxchg %ecx,(%rdx) 5a4: 89 44 24 04 mov %eax,0x4(%rsp) 5a8: 75 30 jne 5da <...> <...> 5da: 8b 44 24 04 mov 0x4(%rsp),%eax 5de: eb 98 jmp 578 <...> The new code removes move instructions from 585: 5a6: and 5a9: and the compare from 5ac:. Additionally, the compiler assumes that cmpxchg success is more probable and optimizes code flow accordingly. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org --- arch/x86/include/asm/cmpxchg.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d53636506134..5612648b0202 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -221,12 +221,18 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) +#define __sync_try_cmpxchg(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ") + #define __try_cmpxchg_local(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), "") #define arch_try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) +#define arch_sync_try_cmpxchg(ptr, pold, new) \ + __sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) + #define arch_try_cmpxchg_local(ptr, pold, new) \ __try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr))) From ad0a2e4c2f20510d7e3f2bc110cf74f8578547f0 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 10 Jul 2023 21:01:27 +0200 Subject: [PATCH 31/39] locking/atomic, xen: Use sync_try_cmpxchg() instead of sync_cmpxchg() Use sync_try_cmpxchg() instead of sync_cmpxchg(*ptr, old, new) == old in clear_masked_cond(), clear_linked() and gnttab_end_foreign_access_ref_v1(). 
x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg), improving the cmpxchg loop in gnttab_end_foreign_access_ref_v1() from: 174: eb 0e jmp 184 <...> 176: 89 d0 mov %edx,%eax 178: f0 66 0f b1 31 lock cmpxchg %si,(%rcx) 17d: 66 39 c2 cmp %ax,%dx 180: 74 11 je 193 <...> 182: 89 c2 mov %eax,%edx 184: 89 d6 mov %edx,%esi 186: 66 83 e6 18 and $0x18,%si 18a: 74 ea je 176 <...> to: 614: 89 c1 mov %eax,%ecx 616: 66 83 e1 18 and $0x18,%cx 61a: 75 11 jne 62d <...> 61c: f0 66 0f b1 0a lock cmpxchg %cx,(%rdx) 621: 75 f1 jne 614 <...> No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Acked-by: Juergen Gross Cc: Peter Zijlstra Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org --- drivers/xen/events/events_fifo.c | 26 ++++++++++++-------------- drivers/xen/grant-table.c | 10 ++++------ 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index ad9fe51d3fb3..655775db7caf 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -226,21 +226,20 @@ static bool evtchn_fifo_is_masked(evtchn_port_t port) */ static bool clear_masked_cond(volatile event_word_t *word) { - event_word_t new, old, w; + event_word_t new, old; - w = *word; + old = *word; do { - if (!(w & (1 << EVTCHN_FIFO_MASKED))) + if (!(old & (1 << EVTCHN_FIFO_MASKED))) return true; - if (w & (1 << EVTCHN_FIFO_PENDING)) + if (old & (1 << EVTCHN_FIFO_PENDING)) return false; - old = w & ~(1 << EVTCHN_FIFO_BUSY); + old = old & ~(1 << EVTCHN_FIFO_BUSY); new = old & ~(1 << EVTCHN_FIFO_MASKED); - w = sync_cmpxchg(word, old, new); - } while (w != old); + } while (!sync_try_cmpxchg(word, &old, new)); return true; } @@ -259,17 +258,16 @@ static void evtchn_fifo_unmask(evtchn_port_t port) static uint32_t clear_linked(volatile event_word_t *word) { - event_word_t new, old, w; + event_word_t new, old; - w = *word; + old = *word; do { - old = w; - new = (w & ~((1 << EVTCHN_FIFO_LINKED) - | EVTCHN_FIFO_LINK_MASK)); - } while ((w = sync_cmpxchg(word, old, new)) != old); + new = (old & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while (!sync_try_cmpxchg(word, &old, new)); - return w & EVTCHN_FIFO_LINK_MASK; + return old & EVTCHN_FIFO_LINK_MASK; } static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl, diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 35659bf70746..04a6b470b15d 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -427,16 +427,14 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref) { - u16 flags, nflags; - u16 *pflags; + u16 *pflags = &gnttab_shared.v1[ref].flags; + u16 flags; - pflags = &gnttab_shared.v1[ref].flags; - nflags = *pflags; + flags = *pflags; do { - flags = nflags; if (flags & (GTF_reading|GTF_writing)) return 0; - } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags); + } while (!sync_try_cmpxchg(pflags, &flags, 0)); return 1; } From 4fbf8b136ded943f8661cf48270482ad1f5ce7bd Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 9 May 2023 17:02:55 +0200 Subject: [PATCH 32/39] locking/atomics: Use atomic_try_cmpxchg_release() to micro-optimize rcuref_put_slowpath() Use atomic_try_cmpxchg() instead of atomic_cmpxchg(*ptr, old, new) == old in rcuref_put_slowpath(). 
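In generic terms the conversion replaces a compare-on-return idiom with a boolean helper. A minimal stand-alone sketch of such a single-shot transition (illustrative only, not the rcuref code itself):

  #include <linux/atomic.h>

  /*
   * Attempt a single expected -> dead transition. The success check
   * comes straight from the cmpxchg result; on failure the observed
   * value is written back into the local 'expected' and discarded.
   */
  static bool mark_dead(atomic_t *state, int expected, int dead)
  {
          return atomic_try_cmpxchg_release(state, &expected, dead);
  }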
On x86 the CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after CMPXCHG. Additionally, the compiler reorders some code blocks to follow likely/unlikely annotations in the atomic_try_cmpxchg() macro, improving the code from: 9a: f0 0f b1 0b lock cmpxchg %ecx,(%rbx) 9e: 83 f8 ff cmp $0xffffffff,%eax a1: 74 04 je a7 a3: 31 c0 xor %eax,%eax to: 9a: f0 0f b1 0b lock cmpxchg %ecx,(%rbx) 9e: 75 4c jne ec a0: b0 01 mov $0x1,%al No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Paul E. McKenney Link: https://lore.kernel.org/r/20230509150255.3691-1-ubizjak@gmail.com --- lib/rcuref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rcuref.c b/lib/rcuref.c index 5ec00a4a64d1..97f300eca927 100644 --- a/lib/rcuref.c +++ b/lib/rcuref.c @@ -248,7 +248,7 @@ bool rcuref_put_slowpath(rcuref_t *ref) * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ - if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) + if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD)) return false; /* From f995443f01b4dbcce723539b99050ce69b319e58 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 12 Oct 2023 16:31:58 +0200 Subject: [PATCH 33/39] locking/seqlock: Simplify SEQCOUNT_LOCKNAME() 1. Kill the "lockmember" argument. It is always s->lock plus __seqprop_##lockname##_sequence() already uses s->lock and ignores "lockmember". 2. Kill the "lock_acquire" argument. __seqprop_##lockname##_sequence() can use the same "lockbase" prefix for _lock and _unlock. Apart from line numbers, gcc -E outputs the same code. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Paul E.
McKenney Link: https://lore.kernel.org/r/20231012143158.GA16133@redhat.com --- include/linux/seqlock.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index af518e4d0c6b..7e7109dbc3f5 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -191,11 +191,9 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @locktype: LOCKNAME canonical C data type * @preemptible: preemptibility of above locktype - * @lockmember: argument for lockdep_assert_held() - * @lockbase: associated lock release function (prefix only) - * @lock_acquire: associated lock acquisition function (full call) + * @lockbase: prefix for associated lock/unlock */ -#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \ +#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ typedef struct seqcount_##lockname { \ seqcount_t seqcount; \ __SEQ_LOCK(locktype *lock); \ @@ -216,7 +214,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ return seq; \ \ if (preemptible && unlikely(seq & 1)) { \ - __SEQ_LOCK(lock_acquire); \ + __SEQ_LOCK(lockbase##_lock(s->lock)); \ __SEQ_LOCK(lockbase##_unlock(s->lock)); \ \ /* \ @@ -242,7 +240,7 @@ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ static __always_inline void \ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ { \ - __SEQ_LOCK(lockdep_assert_held(lockmember)); \ + __SEQ_LOCK(lockdep_assert_held(s->lock)); \ } /* @@ -271,10 +269,10 @@ static inline void __seqprop_assert(const seqcount_t *s) #define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) -SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, s->lock, raw_spin, raw_spin_lock(s->lock)) -SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, s->lock, spin, spin_lock(s->lock)) -SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, s->lock, read, read_lock(s->lock)) -SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex, mutex_lock(s->lock)) +SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) +SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) +SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) +SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t From e6115c6f7a0ce3388cc60b69a284facf78b5dbfd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 12 Oct 2023 16:32:27 +0200 Subject: [PATCH 34/39] locking/seqlock: Change __seqprop() to return the function pointer This simplifies the macro and makes it easy to add the new seqprop's with 2 or more args. Plus this way we do not lose the type info, the (void*) type cast is no longer needed. And the latter reveals the problem: a lot of seqcount_t helpers pass the "const seqcount_t *s" argument to __seqprop_ptr(seqcount_t *s) but (before this patch) "(void *)(s)" masked the problem. So this patch changes __seqprop_ptr() and __seqprop_##lockname##_ptr() to accept the "const LOCKNAME *s" argument. This is not nice either, they need to drop the constness on return because these helpers are used by both the readers and writers, but at least it is clear what's going on. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Paul E. 
McKenney Link: https://lore.kernel.org/r/20231012143227.GA16143@redhat.com --- include/linux/seqlock.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 7e7109dbc3f5..4b8dcd3a0d93 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -200,9 +200,9 @@ typedef struct seqcount_##lockname { \ } seqcount_##lockname##_t; \ \ static __always_inline seqcount_t * \ -__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ +__seqprop_##lockname##_ptr(const seqcount_##lockname##_t *s) \ { \ - return &s->seqcount; \ + return (void *)&s->seqcount; /* drop const */ \ } \ \ static __always_inline unsigned \ @@ -247,9 +247,9 @@ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ * __seqprop() for seqcount_t */ -static inline seqcount_t *__seqprop_ptr(seqcount_t *s) +static inline seqcount_t *__seqprop_ptr(const seqcount_t *s) { - return s; + return (void *)s; /* drop const */ } static inline unsigned __seqprop_sequence(const seqcount_t *s) @@ -292,19 +292,19 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define __seqprop_case(s, lockname, prop) \ - seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s)) + seqcount_##lockname##_t: __seqprop_##lockname##_##prop #define __seqprop(s, prop) _Generic(*(s), \ - seqcount_t: __seqprop_##prop((void *)(s)), \ + seqcount_t: __seqprop_##prop, \ __seqprop_case((s), raw_spinlock, prop), \ __seqprop_case((s), spinlock, prop), \ __seqprop_case((s), rwlock, prop), \ __seqprop_case((s), mutex, prop)) -#define seqprop_ptr(s) __seqprop(s, ptr) -#define seqprop_sequence(s) __seqprop(s, sequence) -#define seqprop_preemptible(s) __seqprop(s, preemptible) -#define seqprop_assert(s) __seqprop(s, assert) +#define seqprop_ptr(s) __seqprop(s, ptr)(s) +#define seqprop_sequence(s) __seqprop(s, sequence)(s) +#define seqprop_preemptible(s) __seqprop(s, preemptible)(s) +#define seqprop_assert(s) __seqprop(s, assert)(s) /** * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier From ac8b60be078abebc3ab8836f3f0ecac6980e0b4f Mon Sep 17 00:00:00 2001 From: Lucy Mielke Date: Thu, 12 Oct 2023 12:44:32 +0200 Subject: [PATCH 35/39] locking/lockdep: Fix string sizing bug that triggers a format-truncation compiler-warning On an allyesconfig, with "treat warnings as errors" unset, GCC emits these warnings: kernel/locking/lockdep_proc.c:438:32: Warning: Format specifier '%lld' may be truncated when writing 1 to 17 bytes into a region of size 15 [-Wformat-truncation=] kernel/locking/lockdep_proc.c:438:31: Note: Format directive argument is in the range [-9223372036854775, 9223372036854775] kernel/locking/lockdep_proc.c:438:9: Note: 'snprintf' has output between 5 and 22 bytes into a target of size 15 In seq_time(), the longest s64 is "-9223372036854775808"-ish, which converted to the fixed-point float format is "-9223372036854775.80": 21 bytes, plus termination is another byte: 22. Therefore, a larger buffer size of 22 is needed here - not 15. The code was safe due to the snprintf(). Fix it. 
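The bound is easy to double-check outside the kernel; a small userspace sketch, assuming a "%lld.%02d"-style fixed-point layout and using 80 as a stand-in fractional part:

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          char small[15], big[22];
          long long sec = INT64_MIN / 1000;   /* worst-case integer part */

          /* A 15-byte buffer truncates the worst case... */
          snprintf(small, sizeof(small), "%lld.%02d", sec, 80);
          /* ...while 22 bytes holds it in full. */
          snprintf(big, sizeof(big), "%lld.%02d", sec, 80);

          printf("15 bytes: \"%s\"\n22 bytes: \"%s\"\n", small, big);
          return 0;
  }

Building this with -Wformat-truncation enabled should reproduce the same class of warning for the 15-byte buffer.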
Signed-off-by: Lucy Mielke Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/ZSfOEHRkZAWaQr3U@fedora.fritz.box --- kernel/locking/lockdep_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 15fdc7fa5c68..e2bfb1db589d 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -440,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) static void seq_time(struct seq_file *m, s64 time) { - char num[15]; + char num[22]; snprint_time(num, sizeof(num), time); seq_printf(m, " %14s", num); From 886ee55eabac0d46faf8bc0b22207ca2740847ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Oct 2023 10:15:46 +0200 Subject: [PATCH 36/39] locking/seqlock: Propagate 'const' pointers within read-only methods, remove forced type casts Currently __seqprop_ptr() is an inline function that must choose to either use 'const' or non-const seqcount related pointers - but this results in the undesirable loss of 'const' propagation, via a forced type cast. The easiest solution would be to turn the pointer wrappers into macros that pass through whatever type is passed to them - but the clever maze of seqlock API instantiation macros relies on the GCC CPP '##' macro extension, which isn't recursive, so inline functions must be used here. So create two wrapper variants instead: 'ptr' and 'const_ptr', and pick the right one for the codepaths that are const: read_seqcount_begin() and read_seqcount_retry(). This cleans up type handling and allows the removal of all type forcing. No change in functionality. Signed-off-by: Ingo Molnar Reviewed-by: Oleg Nesterov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Waiman Long Cc: Will Deacon Cc: Thomas Gleixner Cc: Paul E.
McKenney --- include/linux/seqlock.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 4b8dcd3a0d93..80f21d2ca2aa 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -200,9 +200,15 @@ typedef struct seqcount_##lockname { \ } seqcount_##lockname##_t; \ \ static __always_inline seqcount_t * \ -__seqprop_##lockname##_ptr(const seqcount_##lockname##_t *s) \ +__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ - return (void *)&s->seqcount; /* drop const */ \ + return &s->seqcount; \ +} \ + \ +static __always_inline const seqcount_t * \ +__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \ +{ \ + return &s->seqcount; \ } \ \ static __always_inline unsigned \ @@ -247,9 +253,14 @@ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ * __seqprop() for seqcount_t */ -static inline seqcount_t *__seqprop_ptr(const seqcount_t *s) +static inline seqcount_t *__seqprop_ptr(seqcount_t *s) { - return (void *)s; /* drop const */ + return s; +} + +static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s) +{ + return s; } static inline unsigned __seqprop_sequence(const seqcount_t *s) @@ -302,6 +313,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seqprop_case((s), mutex, prop)) #define seqprop_ptr(s) __seqprop(s, ptr)(s) +#define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s) #define seqprop_sequence(s) __seqprop(s, sequence)(s) #define seqprop_preemptible(s) __seqprop(s, preemptible)(s) #define seqprop_assert(s) __seqprop(s, assert)(s) @@ -353,7 +365,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) */ #define read_seqcount_begin(s) \ ({ \ - seqcount_lockdep_reader_access(seqprop_ptr(s)); \ + seqcount_lockdep_reader_access(seqprop_const_ptr(s)); \ raw_read_seqcount_begin(s); \ }) @@ -419,7 +431,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ - do___read_seqcount_retry(seqprop_ptr(s), start) + do___read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) { @@ -439,7 +451,7 @@ static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ - do_read_seqcount_retry(seqprop_ptr(s), start) + do_read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) { From dcc134510eefaec6dda4fe71ab824f0300ed9f9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Oct 2023 13:12:54 +0200 Subject: [PATCH 37/39] alpha: Fix up new futex syscall numbers As per Arnd, Alpha syscalls since time64 are offset by 120, retain this offset. 
Fixes: 9f6c532f59b2 ("futex: Add sys_futex_wake()") Reported-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/cb4bb8e2-7dfe-4ca4-aa70-060f7b2f8f95@app.fastmail.com --- arch/alpha/kernel/syscalls/syscall.tbl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index b1865f9bb31e..b68f1f56b836 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -492,6 +492,7 @@ 560 common set_mempolicy_home_node sys_ni_syscall 561 common cachestat sys_cachestat 562 common fchmodat2 sys_fchmodat2 -563 common futex_wake sys_futex_wake -564 common futex_wait sys_futex_wait -565 common futex_requeue sys_futex_requeue +# 563 reserved for map_shadow_stack +564 common futex_wake sys_futex_wake +565 common futex_wait sys_futex_wait +566 common futex_requeue sys_futex_requeue From 184fdf9fc7ae6ae7155768faa48fc609d1a24b7e Mon Sep 17 00:00:00 2001 From: Cuda-Chen Date: Tue, 17 Oct 2023 13:37:03 +0800 Subject: [PATCH 38/39] locking/seqlock: Fix grammar in comment The "neither writes before and after ..." for the description of do_write_seqcount_end() should be "neither writes before nor after". Signed-off-by: Cuda-Chen Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231017053703.11312-1-clh960524@gmail.com --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 80f21d2ca2aa..e92f9d5577ba 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -584,7 +584,7 @@ static inline void do_write_seqcount_end(seqcount_t *s) * via WRITE_ONCE): a) to ensure the writes become visible to other threads * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section. This is necessary because - * neither writes before and after the barrier are enclosed in a seq-writer + * neither writes before nor after the barrier are enclosed in a seq-writer * critical section that would ensure readers are aware of ongoing writes:: * * seqcount_t seq; From c73801ae4f22b390228ebf471d55668e824198b6 Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Thu, 19 Oct 2023 16:45:49 -0400 Subject: [PATCH 39/39] futex: Don't include process MM in futex key on no-MMU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On no-MMU, all futexes are treated as private because there is no need to map a virtual address to physical to match the futex across processes. This doesn't quite work though, because private futexes include the current process's mm_struct as part of their key. This makes it impossible for one process to wake up a shared futex being waited on in another process. Fix this bug by excluding the mm_struct from the key. With a single address space, the futex address is already a unique key. 
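For context, the failing case is a non-private futex shared between processes through a shared mapping — roughly the userspace shape below (an illustrative sketch, not part of the patch):

  #include <linux/futex.h>
  #include <sys/syscall.h>
  #include <stdint.h>
  #include <unistd.h>

  /*
   * Waiter and waker live in different processes but share the futex
   * word via a MAP_SHARED mapping. Without FUTEX_PRIVATE_FLAG the key
   * must identify the word across processes, which on no-MMU only
   * works if it does not include the waiter's mm_struct.
   */
  static long futex_wait(uint32_t *uaddr, uint32_t expected)
  {
          return syscall(SYS_futex, uaddr, FUTEX_WAIT, expected, NULL, NULL, 0);
  }

  static long futex_wake(uint32_t *uaddr, int nr_wake)
  {
          return syscall(SYS_futex, uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0);
  }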
Fixes: 784bdf3bb694 ("futex: Assume all mappings are private on !MMU systems") Signed-off-by: Ben Wolsieffer Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Darren Hart Cc: Davidlohr Bueso Cc: André Almeida Link: https://lore.kernel.org/r/20231019204548.1236437-2-ben.wolsieffer@hefring.com --- kernel/futex/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index ade7c731972d..52695c59d041 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -252,7 +252,17 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, * but access_ok() should be faster than find_vma() */ if (!fshared) { - key->private.mm = mm; + /* + * On no-MMU, shared futexes are treated as private, therefore + * we must not include the current process in the key. Since + * there is only one address space, the address is a unique key + * on its own. + */ + if (IS_ENABLED(CONFIG_MMU)) + key->private.mm = mm; + else + key->private.mm = NULL; + key->private.address = address; return 0; }