summaryrefslogtreecommitdiff
path: root/kernel/futex
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/futex')
-rw-r--r--kernel/futex/core.c1165
-rw-r--r--kernel/futex/futex.h190
-rw-r--r--kernel/futex/pi.c347
-rw-r--r--kernel/futex/requeue.c486
-rw-r--r--kernel/futex/syscalls.c341
-rw-r--r--kernel/futex/waitwake.c336
6 files changed, 2072 insertions, 793 deletions
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 514e4582b863..cf7e610eac42 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -34,9 +34,16 @@
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
+#include <linux/debugfs.h>
+#include <linux/plist.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/prctl.h>
+#include <linux/mempolicy.h>
+#include <linux/mmap_lock.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
@@ -47,12 +54,23 @@
* reside in the same cacheline.
*/
static struct {
- struct futex_hash_bucket *queues;
- unsigned long hashsize;
+ unsigned long hashmask;
+ unsigned int hashshift;
+ struct futex_hash_bucket *queues[MAX_NUMNODES];
} __futex_data __read_mostly __aligned(2*sizeof(long));
-#define futex_queues (__futex_data.queues)
-#define futex_hashsize (__futex_data.hashsize)
+#define futex_hashmask (__futex_data.hashmask)
+#define futex_hashshift (__futex_data.hashshift)
+#define futex_queues (__futex_data.queues)
+
+struct futex_private_hash {
+ int state;
+ unsigned int hash_mask;
+ struct rcu_head rcu;
+ void *mm;
+ bool custom;
+ struct futex_hash_bucket queues[];
+};
/*
* Fault injections for futexes.
@@ -105,21 +123,331 @@ late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAIL_FUTEX */
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+static bool futex_ref_get(struct futex_private_hash *fph);
+static bool futex_ref_put(struct futex_private_hash *fph);
+static bool futex_ref_is_dead(struct futex_private_hash *fph);
+
+enum { FR_PERCPU = 0, FR_ATOMIC };
+
+static inline bool futex_key_is_private(union futex_key *key)
+{
+ /*
+ * Relies on get_futex_key() to set either bit for shared
+ * futexes -- see comment with union futex_key.
+ */
+ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
+}
+
+static bool futex_private_hash_get(struct futex_private_hash *fph)
+{
+ return futex_ref_get(fph);
+}
+
+void futex_private_hash_put(struct futex_private_hash *fph)
+{
+ if (futex_ref_put(fph))
+ wake_up_var(fph->mm);
+}
+
/**
- * futex_hash - Return the hash bucket in the global hash
- * @key: Pointer to the futex key for which the hash is calculated
+ * futex_hash_get - Get an additional reference for the local hash.
+ * @hb: ptr to the private local hash.
*
- * We hash on the keys returned from get_futex_key (see below) and return the
- * corresponding hash bucket in the global hash.
+ * Obtain an additional reference for the already obtained hash bucket. The
+ * caller must already own an reference.
*/
+void futex_hash_get(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ WARN_ON_ONCE(!futex_private_hash_get(fph));
+}
+
+void futex_hash_put(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ futex_private_hash_put(fph);
+}
+
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ u32 hash;
+
+ if (!futex_key_is_private(key))
+ return NULL;
+
+ if (!fph)
+ fph = rcu_dereference(key->private.mm->futex_phash);
+ if (!fph || !fph->hash_mask)
+ return NULL;
+
+ hash = jhash2((void *)&key->private.address,
+ sizeof(key->private.address) / 4,
+ key->both.offset);
+ return &fph->queues[hash & fph->hash_mask];
+}
+
+static void futex_rehash_private(struct futex_private_hash *old,
+ struct futex_private_hash *new)
+{
+ struct futex_hash_bucket *hb_old, *hb_new;
+ unsigned int slots = old->hash_mask + 1;
+ unsigned int i;
+
+ for (i = 0; i < slots; i++) {
+ struct futex_q *this, *tmp;
+
+ hb_old = &old->queues[i];
+
+ spin_lock(&hb_old->lock);
+ plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
+
+ plist_del(&this->list, &hb_old->chain);
+ futex_hb_waiters_dec(hb_old);
+
+ WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
+
+ hb_new = __futex_hash(&this->key, new);
+ futex_hb_waiters_inc(hb_new);
+ /*
+ * The new pointer isn't published yet but an already
+ * moved user can be unqueued due to timeout or signal.
+ */
+ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
+ plist_add(&this->list, &hb_new->chain);
+ this->lock_ptr = &hb_new->lock;
+ spin_unlock(&hb_new->lock);
+ }
+ spin_unlock(&hb_old->lock);
+ }
+}
+
+static bool __futex_pivot_hash(struct mm_struct *mm,
+ struct futex_private_hash *new)
+{
+ struct futex_private_hash *fph;
+
+ WARN_ON_ONCE(mm->futex_phash_new);
+
+ fph = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
+ if (fph) {
+ if (!futex_ref_is_dead(fph)) {
+ mm->futex_phash_new = new;
+ return false;
+ }
+
+ futex_rehash_private(fph, new);
+ }
+ new->state = FR_PERCPU;
+ scoped_guard(rcu) {
+ mm->futex_batches = get_state_synchronize_rcu();
+ rcu_assign_pointer(mm->futex_phash, new);
+ }
+ kvfree_rcu(fph, rcu);
+ return true;
+}
+
+static void futex_pivot_hash(struct mm_struct *mm)
+{
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ struct futex_private_hash *fph;
+
+ fph = mm->futex_phash_new;
+ if (fph) {
+ mm->futex_phash_new = NULL;
+ __futex_pivot_hash(mm, fph);
+ }
+ }
+}
+
+struct futex_private_hash *futex_private_hash(void)
+{
+ struct mm_struct *mm = current->mm;
+ /*
+ * Ideally we don't loop. If there is a replacement in progress
+ * then a new private hash is already prepared and a reference can't be
+ * obtained once the last user dropped it's.
+ * In that case we block on mm_struct::futex_hash_lock and either have
+ * to perform the replacement or wait while someone else is doing the
+ * job. Eitherway, on the second iteration we acquire a reference on the
+ * new private hash or loop again because a new replacement has been
+ * requested.
+ */
+again:
+ scoped_guard(rcu) {
+ struct futex_private_hash *fph;
+
+ fph = rcu_dereference(mm->futex_phash);
+ if (!fph)
+ return NULL;
+
+ if (futex_private_hash_get(fph))
+ return fph;
+ }
+ futex_pivot_hash(mm);
+ goto again;
+}
+
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ struct futex_private_hash *fph;
+ struct futex_hash_bucket *hb;
+
+again:
+ scoped_guard(rcu) {
+ hb = __futex_hash(key, NULL);
+ fph = hb->priv;
+
+ if (!fph || futex_private_hash_get(fph))
+ return hb;
+ }
+ futex_pivot_hash(key->private.mm);
+ goto again;
+}
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ return NULL;
+}
+
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
- key->both.offset);
+ return __futex_hash(key, NULL);
+}
+
+#endif /* CONFIG_FUTEX_PRIVATE_HASH */
+
+#ifdef CONFIG_FUTEX_MPOL
+
+static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = vma_lookup(mm, addr);
+ struct mempolicy *mpol;
+ int node = FUTEX_NO_NODE;
+
+ if (!vma)
+ return FUTEX_NO_NODE;
+
+ mpol = vma_policy(vma);
+ if (!mpol)
+ return FUTEX_NO_NODE;
+
+ switch (mpol->mode) {
+ case MPOL_PREFERRED:
+ node = first_node(mpol->nodes);
+ break;
+ case MPOL_PREFERRED_MANY:
+ case MPOL_BIND:
+ if (mpol->home_node != NUMA_NO_NODE)
+ node = mpol->home_node;
+ break;
+ default:
+ break;
+ }
- return &futex_queues[hash & (futex_hashsize - 1)];
+ return node;
}
+static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr)
+{
+ int seq, node;
+
+ guard(rcu)();
+
+ if (!mmap_lock_speculate_try_begin(mm, &seq))
+ return -EBUSY;
+
+ node = __futex_key_to_node(mm, addr);
+
+ if (mmap_lock_speculate_retry(mm, seq))
+ return -EAGAIN;
+
+ return node;
+}
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ int node;
+
+ node = futex_key_to_node_opt(mm, addr);
+ if (node >= FUTEX_NO_NODE)
+ return node;
+
+ guard(mmap_read_lock)(mm);
+ return __futex_key_to_node(mm, addr);
+}
+
+#else /* !CONFIG_FUTEX_MPOL */
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ return FUTEX_NO_NODE;
+}
+
+#endif /* CONFIG_FUTEX_MPOL */
+
+/**
+ * __futex_hash - Return the hash bucket
+ * @key: Pointer to the futex key for which the hash is calculated
+ * @fph: Pointer to private hash if known
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket.
+ * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
+ * private hash) is returned if existing. Otherwise a hash bucket from the
+ * global hash is returned.
+ */
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+{
+ int node = key->both.node;
+ u32 hash;
+
+ if (node == FUTEX_NO_NODE) {
+ struct futex_hash_bucket *hb;
+
+ hb = __futex_hash_private(key, fph);
+ if (hb)
+ return hb;
+ }
+
+ hash = jhash2((u32 *)key,
+ offsetof(typeof(*key), both.offset) / sizeof(u32),
+ key->both.offset);
+
+ if (node == FUTEX_NO_NODE) {
+ /*
+ * In case of !FLAGS_NUMA, use some unused hash bits to pick a
+ * node -- this ensures regular futexes are interleaved across
+ * the nodes and avoids having to allocate multiple
+ * hash-tables.
+ *
+ * NOTE: this isn't perfectly uniform, but it is fast and
+ * handles sparse node masks.
+ */
+ node = (hash >> futex_hashshift) % nr_node_ids;
+ if (!node_possible(node)) {
+ node = find_next_bit_wrap(node_possible_map.bits,
+ nr_node_ids, node);
+ }
+ }
+
+ return &futex_queues[node][hash & futex_hashmask];
+}
/**
* futex_setup_timer - set up the sleeping hrtimer.
@@ -138,9 +466,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
if (!time)
return NULL;
- hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
+ hrtimer_setup_sleeper_on_stack(timeout,
+ (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
/*
* If range_ns is 0, calling hrtimer_set_expires_range_ns() is
* effectively the same as calling hrtimer_set_expires().
@@ -179,12 +507,12 @@ static u64 get_inode_sequence_number(struct inode *inode)
return old;
for (;;) {
- u64 new = atomic64_add_return(1, &i_seq);
+ u64 new = atomic64_inc_return(&i_seq);
if (WARN_ON_ONCE(!new))
continue;
- old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
- if (old)
+ old = 0;
+ if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
return old;
return new;
}
@@ -193,7 +521,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
- * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
+ * @flags: FLAGS_*
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: FUTEX_READ,
* FUTEX_WRITE)
@@ -204,7 +532,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
*
* For shared mappings (when @fshared), the key is:
*
- * ( inode->i_sequence, page->index, offset_within_page )
+ * ( inode->i_sequence, page offset within mapping, offset_within_page )
*
* [ also see get_inode_sequence_number() ]
*
@@ -217,29 +545,68 @@ static u64 get_inode_sequence_number(struct inode *inode)
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
-int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *tail;
+ struct page *page;
+ struct folio *folio;
struct address_space *mapping;
- int err, ro = 0;
+ int node, err, size, ro = 0;
+ bool node_updated = false;
+ bool fshared;
+
+ fshared = flags & FLAGS_SHARED;
+ size = futex_size(flags);
+ if (flags & FLAGS_NUMA)
+ size *= 2;
/*
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
+ if (unlikely((address % size) != 0))
return -EINVAL;
address -= key->both.offset;
- if (unlikely(!access_ok(uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(uaddr, size)))
return -EFAULT;
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
+ node = FUTEX_NO_NODE;
+
+ if (flags & FLAGS_NUMA) {
+ u32 __user *naddr = (void *)uaddr + size / 2;
+
+ if (get_user_inline(node, naddr))
+ return -EFAULT;
+
+ if ((node != FUTEX_NO_NODE) &&
+ ((unsigned int)node >= MAX_NUMNODES || !node_possible(node)))
+ return -EINVAL;
+ }
+
+ if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) {
+ node = futex_mpol(mm, address);
+ node_updated = true;
+ }
+
+ if (flags & FLAGS_NUMA) {
+ u32 __user *naddr = (void *)uaddr + size / 2;
+
+ if (node == FUTEX_NO_NODE) {
+ node = numa_node_id();
+ node_updated = true;
+ }
+ if (node_updated && put_user_inline(node, naddr))
+ return -EFAULT;
+ }
+
+ key->both.node = node;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -248,7 +615,17 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- key->private.mm = mm;
+ /*
+ * On no-MMU, shared futexes are treated as private, therefore
+ * we must not include the current process in the key. Since
+ * there is only one address space, the address is a unique key
+ * on its own.
+ */
+ if (IS_ENABLED(CONFIG_MMU))
+ key->private.mm = mm;
+ else
+ key->private.mm = NULL;
+
key->private.address = address;
return 0;
}
@@ -273,54 +650,52 @@ again:
err = 0;
/*
- * The treatment of mapping from this point on is critical. The page
- * lock protects many things but in this context the page lock
+ * The treatment of mapping from this point on is critical. The folio
+ * lock protects many things but in this context the folio lock
* stabilizes mapping, prevents inode freeing in the shared
* file-backed region case and guards against movement to swap cache.
*
- * Strictly speaking the page lock is not needed in all cases being
- * considered here and page lock forces unnecessarily serialization
+ * Strictly speaking the folio lock is not needed in all cases being
+ * considered here and folio lock forces unnecessarily serialization.
* From this point on, mapping will be re-verified if necessary and
- * page lock will be acquired only if it is unavoidable
+ * folio lock will be acquired only if it is unavoidable
*
- * Mapping checks require the head page for any compound page so the
- * head page and mapping is looked up now. For anonymous pages, it
- * does not matter if the page splits in the future as the key is
- * based on the address. For filesystem-backed pages, the tail is
- * required as the index of the page determines the key. For
- * base pages, there is no tail page and tail == page.
+ * Mapping checks require the folio so it is looked up now. For
+ * anonymous pages, it does not matter if the folio is split
+ * in the future as the key is based on the address. For
+ * filesystem-backed pages, the precise page is required as the
+ * index of the page determines the key.
*/
- tail = page;
- page = compound_head(page);
- mapping = READ_ONCE(page->mapping);
+ folio = page_folio(page);
+ mapping = READ_ONCE(folio->mapping);
/*
- * If page->mapping is NULL, then it cannot be a PageAnon
+ * If folio->mapping is NULL, then it cannot be an anonymous
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
* found it, but truncated or holepunched or subjected to
- * invalidate_complete_page2 before we got the page lock (also
+ * invalidate_complete_page2 before we got the folio lock (also
* cases which we are happy to fail). And we hold a reference,
* so refcount care in invalidate_inode_page's remove_mapping
* prevents drop_caches from setting mapping to NULL beneath us.
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page->mapping.
+ * an unlikely race, but we do need to retry for folio->mapping.
*/
if (unlikely(!mapping)) {
int shmem_swizzled;
/*
- * Page lock is required to identify which special case above
- * applies. If this is really a shmem page then the page lock
+ * Folio lock is required to identify which special case above
+ * applies. If this is really a shmem page then the folio lock
* will prevent unexpected transitions.
*/
- lock_page(page);
- shmem_swizzled = PageSwapCache(page) || page->mapping;
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
+ folio_unlock(folio);
+ folio_put(folio);
if (shmem_swizzled)
goto again;
@@ -331,14 +706,14 @@ again:
/*
* Private mappings are handled in a simple way.
*
- * If the futex key is stored on an anonymous page, then the associated
+ * If the futex key is stored in anonymous memory, then the associated
* object is the mm which is implicitly pinned by the calling process.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
@@ -357,10 +732,10 @@ again:
/*
* The associated futex object in this case is the inode and
- * the page->mapping must be traversed. Ordinarily this should
- * be stabilised under page lock but it's not strictly
+ * the folio->mapping must be traversed. Ordinarily this should
+ * be stabilised under folio lock but it's not strictly
* necessary in this case as we just want to pin the inode, not
- * update the radix tree or anything like that.
+ * update i_pages or anything like that.
*
* The RCU read lock is taken as the inode is finally freed
* under RCU. If the mapping still matches expectations then the
@@ -368,9 +743,9 @@ again:
*/
rcu_read_lock();
- if (READ_ONCE(page->mapping) != mapping) {
+ if (READ_ONCE(folio->mapping) != mapping) {
rcu_read_unlock();
- put_page(page);
+ folio_put(folio);
goto again;
}
@@ -378,19 +753,19 @@ again:
inode = READ_ONCE(mapping->host);
if (!inode) {
rcu_read_unlock();
- put_page(page);
+ folio_put(folio);
goto again;
}
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = page_to_pgoff(tail);
+ key->shared.pgoff = page_pgoff(folio, page);
rcu_read_unlock();
}
out:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -437,28 +812,6 @@ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *
return NULL;
}
-int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
-{
- int ret;
-
- pagefault_disable();
- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
- pagefault_enable();
-
- return ret;
-}
-
-int futex_get_value_locked(u32 *dest, u32 __user *from)
-{
- int ret;
-
- pagefault_disable();
- ret = __get_user(*dest, from);
- pagefault_enable();
-
- return ret ? -EFAULT : 0;
-}
-
/**
* wait_for_owner_exiting - Block until the owner has exited
* @ret: owner's current futex lock status
@@ -510,13 +863,9 @@ void __futex_unqueue(struct futex_q *q)
}
/* The key must be already stored in q->key. */
-struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
+void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
__acquires(&hb->lock)
{
- struct futex_hash_bucket *hb;
-
- hb = futex_hash(&q->key);
-
/*
* Increment the counter before taking the lock so that
* a potential waker won't miss a to-be-slept task that is
@@ -530,17 +879,17 @@ struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
q->lock_ptr = &hb->lock;
spin_lock(&hb->lock);
- return hb;
}
void futex_q_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock)
{
- spin_unlock(&hb->lock);
futex_hb_waiters_dec(hb);
+ spin_unlock(&hb->lock);
}
-void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
{
int prio;
@@ -556,7 +905,7 @@ void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
- q->task = current;
+ q->task = task;
}
/**
@@ -575,6 +924,8 @@ int futex_unqueue(struct futex_q *q)
spinlock_t *lock_ptr;
int ret = 0;
+ /* RCU so lock_ptr is not going away during locking. */
+ guard(rcu)();
/* In the common case we don't take the spinlock, which is nice. */
retry:
/*
@@ -613,13 +964,40 @@ retry:
return ret;
}
+void futex_q_lockptr_lock(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+
+ /*
+ * See futex_unqueue() why lock_ptr can change.
+ */
+ guard(rcu)();
+retry:
+ lock_ptr = READ_ONCE(q->lock_ptr);
+ spin_lock(lock_ptr);
+
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+}
+
/*
- * PI futexes can not be requeued and must remove themselves from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
+ * PI futexes can not be requeued and must remove themselves from the hash
+ * bucket. The hash bucket lock (i.e. lock_ptr) is held.
*/
void futex_unqueue_pi(struct futex_q *q)
{
- __futex_unqueue(q);
+ /*
+ * If the lock was not acquired (due to timeout or signal) then the
+ * rt_waiter is removed before futex_q is. If this is observed by
+ * an unlocker after dropping the rtmutex wait lock and before
+ * acquiring the hash bucket lock, then the unlocker dequeues the
+ * futex_q from the hash bucket list to guarantee consistent state
+ * vs. userspace. Therefore the dequeue here must be conditional.
+ */
+ if (!plist_node_empty(&q->list))
+ __futex_unqueue(q);
BUG_ON(!q->pi_state);
put_pi_state(q->pi_state);
@@ -688,7 +1066,8 @@ retry:
owner = uval & FUTEX_TID_MASK;
if (pending_op && !pi && !owner) {
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
return 0;
}
@@ -740,8 +1119,10 @@ retry:
* Wake robust non-PI futexes here. The wakeup of
* PI futexes happens in exit_pi_state():
*/
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ if (!pi && (uval & FUTEX_WAITERS)) {
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
+ }
return 0;
}
@@ -944,10 +1325,20 @@ static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
- struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;
/*
+ * The mutex mm_struct::futex_hash_lock might be acquired.
+ */
+ might_sleep();
+ /*
+ * Ensure the hash remains stable (no resize) during the while loop
+ * below. The hb pointer is acquired under the pi_lock so we can't block
+ * on the mutex.
+ */
+ WARN_ON(curr != current);
+ guard(private_hash)();
+ /*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
* versus waiters unqueueing themselves:
@@ -957,50 +1348,52 @@ static void exit_pi_state_list(struct task_struct *curr)
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
- hb = futex_hash(&key);
-
- /*
- * We can race against put_pi_state() removing itself from the
- * list (a waiter going away). put_pi_state() will first
- * decrement the reference count and then modify the list, so
- * its possible to see the list entry but fail this reference
- * acquire.
- *
- * In that case; drop the locks to let put_pi_state() make
- * progress and retry the loop.
- */
- if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ if (1) {
+ CLASS(hb, hb)(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * its possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
raw_spin_unlock_irq(&curr->pi_lock);
- cpu_relax();
- raw_spin_lock_irq(&curr->pi_lock);
- continue;
- }
- raw_spin_unlock_irq(&curr->pi_lock);
- spin_lock(&hb->lock);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- raw_spin_lock(&curr->pi_lock);
- /*
- * We dropped the pi-lock, so re-check whether this
- * task still owns the PI-state:
- */
- if (head->next != next) {
- /* retain curr->pi_lock for the loop invariant */
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_lock(&hb->lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
- put_pi_state(pi_state);
- continue;
}
- WARN_ON(pi_state->owner != curr);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- pi_state->owner = NULL;
-
- raw_spin_unlock(&curr->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
-
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);
@@ -1120,30 +1513,500 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
+ struct futex_private_hash *fph)
+{
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ fhb->priv = fph;
+#endif
+ atomic_set(&fhb->waiters, 0);
+ plist_head_init(&fhb->chain);
+ spin_lock_init(&fhb->lock);
+}
+
+#define FH_CUSTOM 0x01
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+/*
+ * futex-ref
+ *
+ * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
+ * code because it just doesn't fit right.
+ *
+ * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
+ * re-initializes the state automatically, such that the fph swizzle is also a
+ * transition back to per-cpu.
+ */
+
+static void futex_ref_rcu(struct rcu_head *head);
+
+static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * The counter we're about to switch to must have fully switched;
+ * otherwise it would be impossible for it to have reported success
+ * from futex_ref_is_dead().
+ */
+ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+
+ /*
+ * Set the atomic to the bias value such that futex_ref_{get,put}()
+ * will never observe 0. Will be fixed up in __futex_ref_atomic_end()
+ * when folding in the percpu count.
+ */
+ atomic_long_set(&mm->futex_atomic, LONG_MAX);
+ smp_store_release(&fph->state, FR_ATOMIC);
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static void __futex_ref_atomic_end(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+ unsigned int count = 0;
+ long ret;
+ int cpu;
+
+ /*
+ * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
+ * and per this RCU callback, everybody must now observe this state and
+ * use the atomic variable.
+ */
+ WARN_ON_ONCE(fph->state != FR_ATOMIC);
+
+ /*
+ * Therefore the per-cpu counter is now stable, sum and reset.
+ */
+ for_each_possible_cpu(cpu) {
+ unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+ count += *ptr;
+ *ptr = 0;
+ }
+
+ /*
+ * Re-init for the next cycle.
+ */
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+
+ /*
+ * Add actual count, subtract bias and initial refcount.
+ *
+ * The moment this atomic operation happens, futex_ref_is_dead() can
+ * become true.
+ */
+ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+ if (!ret)
+ wake_up_var(mm);
+
+ WARN_ON_ONCE(ret < 0);
+ mmput_async(mm);
+}
+
+static void futex_ref_rcu(struct rcu_head *head)
+{
+ struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
+ struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+
+ if (fph->state == FR_PERCPU) {
+ /*
+ * Per this extra grace-period, everybody must now observe
+ * fph as the current fph and no previously observed fph's
+ * are in-flight.
+ *
+ * Notably, nobody will now rely on the atomic
+ * futex_ref_is_dead() state anymore so we can begin the
+ * migration of the per-cpu counter into the atomic.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ __futex_ref_atomic_end(fph);
+}
+
+/*
+ * Drop the initial refcount and transition to atomics.
+ */
+static void futex_ref_drop(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * Can only transition the current fph;
+ */
+ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+ /*
+ * We enqueue at least one RCU callback. Ensure mm stays if the task
+ * exits before the transition is completed.
+ */
+ mmget(mm);
+
+ /*
+ * In order to avoid the following scenario:
+ *
+ * futex_hash() __futex_pivot_hash()
+ * guard(rcu); guard(mm->futex_hash_lock);
+ * fph = mm->futex_phash;
+ * rcu_assign_pointer(&mm->futex_phash, new);
+ * futex_hash_allocate()
+ * futex_ref_drop()
+ * fph->state = FR_ATOMIC;
+ * atomic_set(, BIAS);
+ *
+ * futex_private_hash_get(fph); // OOPS
+ *
+ * Where an old fph (which is FR_ATOMIC) and should fail on
+ * inc_not_zero, will succeed because a new transition is started and
+ * the atomic is bias'ed away from 0.
+ *
+ * There must be at least one full grace-period between publishing a
+ * new fph and trying to replace it.
+ */
+ if (poll_state_synchronize_rcu(mm->futex_batches)) {
+ /*
+ * There was a grace-period, we can begin now.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static bool futex_ref_get(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(preempt)();
+
+ if (READ_ONCE(fph->state) == FR_PERCPU) {
+ __this_cpu_inc(*mm->futex_ref);
+ return true;
+ }
+
+ return atomic_long_inc_not_zero(&mm->futex_atomic);
+}
+
+static bool futex_ref_put(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(preempt)();
+
+ if (READ_ONCE(fph->state) == FR_PERCPU) {
+ __this_cpu_dec(*mm->futex_ref);
+ return false;
+ }
+
+ return atomic_long_dec_and_test(&mm->futex_atomic);
+}
+
+static bool futex_ref_is_dead(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU)
+ return false;
+
+ return atomic_long_read(&mm->futex_atomic) == 0;
+}
+
+int futex_mm_init(struct mm_struct *mm)
+{
+ mutex_init(&mm->futex_hash_lock);
+ RCU_INIT_POINTER(mm->futex_phash, NULL);
+ mm->futex_phash_new = NULL;
+ /* futex-ref */
+ mm->futex_ref = NULL;
+ atomic_long_set(&mm->futex_atomic, 0);
+ mm->futex_batches = get_state_synchronize_rcu();
+ return 0;
+}
+
+void futex_hash_free(struct mm_struct *mm)
+{
+ struct futex_private_hash *fph;
+
+ free_percpu(mm->futex_ref);
+ kvfree(mm->futex_phash_new);
+ fph = rcu_dereference_raw(mm->futex_phash);
+ if (fph)
+ kvfree(fph);
+}
+
+static bool futex_pivot_pending(struct mm_struct *mm)
+{
+ struct futex_private_hash *fph;
+
+ guard(rcu)();
+
+ if (!mm->futex_phash_new)
+ return true;
+
+ fph = rcu_dereference(mm->futex_phash);
+ return futex_ref_is_dead(fph);
+}
+
+static bool futex_hash_less(struct futex_private_hash *a,
+ struct futex_private_hash *b)
+{
+ /* user provided always wins */
+ if (!a->custom && b->custom)
+ return true;
+ if (a->custom && !b->custom)
+ return false;
+
+ /* zero-sized hash wins */
+ if (!b->hash_mask)
+ return true;
+ if (!a->hash_mask)
+ return false;
+
+ /* keep the biggest */
+ if (a->hash_mask < b->hash_mask)
+ return true;
+ if (a->hash_mask > b->hash_mask)
+ return false;
+
+ return false; /* equal */
+}
+
+static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct futex_private_hash *fph;
+ bool custom = flags & FH_CUSTOM;
+ int i;
+
+ if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
+ return -EINVAL;
+
+ /*
+ * Once we've disabled the global hash there is no way back.
+ */
+ scoped_guard(rcu) {
+ fph = rcu_dereference(mm->futex_phash);
+ if (fph && !fph->hash_mask) {
+ if (custom)
+ return -EBUSY;
+ return 0;
+ }
+ }
+
+ if (!mm->futex_ref) {
+ /*
+ * This will always be allocated by the first thread and
+ * therefore requires no locking.
+ */
+ mm->futex_ref = alloc_percpu(unsigned int);
+ if (!mm->futex_ref)
+ return -ENOMEM;
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ }
+
+ fph = kvzalloc(struct_size(fph, queues, hash_slots),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!fph)
+ return -ENOMEM;
+
+ fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
+ fph->custom = custom;
+ fph->mm = mm;
+
+ for (i = 0; i < hash_slots; i++)
+ futex_hash_bucket_init(&fph->queues[i], fph);
+
+ if (custom) {
+ /*
+ * Only let prctl() wait / retry; don't unduly delay clone().
+ */
+again:
+ wait_var_event(mm, futex_pivot_pending(mm));
+ }
+
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ struct futex_private_hash *free __free(kvfree) = NULL;
+ struct futex_private_hash *cur, *new;
+
+ cur = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
+ new = mm->futex_phash_new;
+ mm->futex_phash_new = NULL;
+
+ if (fph) {
+ if (cur && !cur->hash_mask) {
+ /*
+ * If two threads simultaneously request the global
+ * hash then the first one performs the switch,
+ * the second one returns here.
+ */
+ free = fph;
+ mm->futex_phash_new = new;
+ return -EBUSY;
+ }
+ if (cur && !new) {
+ /*
+ * If we have an existing hash, but do not yet have
+ * allocated a replacement hash, drop the initial
+ * reference on the existing hash.
+ */
+ futex_ref_drop(cur);
+ }
+
+ if (new) {
+ /*
+ * Two updates raced; throw out the lesser one.
+ */
+ if (futex_hash_less(new, fph)) {
+ free = new;
+ new = fph;
+ } else {
+ free = fph;
+ }
+ } else {
+ new = fph;
+ }
+ fph = NULL;
+ }
+
+ if (new) {
+ /*
+ * Will set mm->futex_phash_new on failure;
+ * futex_private_hash_get() will try again.
+ */
+ if (!__futex_pivot_hash(mm, new) && custom)
+ goto again;
+ }
+ }
+ return 0;
+}
+
+int futex_hash_allocate_default(void)
+{
+ unsigned int threads, buckets, current_buckets = 0;
+ struct futex_private_hash *fph;
+
+ if (!current->mm)
+ return 0;
+
+ scoped_guard(rcu) {
+ threads = min_t(unsigned int,
+ get_nr_threads(current),
+ num_online_cpus());
+
+ fph = rcu_dereference(current->mm->futex_phash);
+ if (fph) {
+ if (fph->custom)
+ return 0;
+
+ current_buckets = fph->hash_mask + 1;
+ }
+ }
+
+ /*
+ * The default allocation will remain within
+ * 16 <= threads * 4 <= global hash size
+ */
+ buckets = roundup_pow_of_two(4 * threads);
+ buckets = clamp(buckets, 16, futex_hashmask + 1);
+
+ if (current_buckets >= buckets)
+ return 0;
+
+ return futex_hash_allocate(buckets, 0);
+}
+
+static int futex_hash_get_slots(void)
+{
+ struct futex_private_hash *fph;
+
+ guard(rcu)();
+ fph = rcu_dereference(current->mm->futex_phash);
+ if (fph && fph->hash_mask)
+ return fph->hash_mask + 1;
+ return 0;
+}
+
+#else
+
+static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
+{
+ return -EINVAL;
+}
+
+static int futex_hash_get_slots(void)
+{
+ return 0;
+}
+
+#endif
+
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
+{
+ unsigned int flags = FH_CUSTOM;
+ int ret;
+
+ switch (arg2) {
+ case PR_FUTEX_HASH_SET_SLOTS:
+ if (arg4)
+ return -EINVAL;
+ ret = futex_hash_allocate(arg3, flags);
+ break;
+
+ case PR_FUTEX_HASH_GET_SLOTS:
+ ret = futex_hash_get_slots();
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
static int __init futex_init(void)
{
- unsigned int futex_shift;
- unsigned long i;
+ unsigned long hashsize, i;
+ unsigned int order, n;
+ unsigned long size;
-#if CONFIG_BASE_SMALL
- futex_hashsize = 16;
+#ifdef CONFIG_BASE_SMALL
+ hashsize = 16;
#else
- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+ hashsize = 256 * num_possible_cpus();
+ hashsize /= num_possible_nodes();
+ hashsize = max(4, hashsize);
+ hashsize = roundup_pow_of_two(hashsize);
#endif
+ futex_hashshift = ilog2(hashsize);
+ size = sizeof(struct futex_hash_bucket) * hashsize;
+ order = get_order(size);
+
+ for_each_node(n) {
+ struct futex_hash_bucket *table;
+
+ if (order > MAX_PAGE_ORDER)
+ table = vmalloc_huge_node(size, GFP_KERNEL, n);
+ else
+ table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
+
+ BUG_ON(!table);
+
+ for (i = 0; i < hashsize; i++)
+ futex_hash_bucket_init(&table[i], NULL);
- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
- &futex_shift, NULL,
- futex_hashsize, futex_hashsize);
- futex_hashsize = 1UL << futex_shift;
-
- for (i = 0; i < futex_hashsize; i++) {
- atomic_set(&futex_queues[i].waiters, 0);
- plist_head_init(&futex_queues[i].chain);
- spin_lock_init(&futex_queues[i].lock);
+ futex_queues[n] = table;
}
+ futex_hashmask = hashsize - 1;
+ pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
+ hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
+ order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
return 0;
}
core_initcall(futex_init);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index b5379c0e6d6d..30c2afa03889 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -5,6 +5,9 @@
#include <linux/futex.h>
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+#include <linux/cleanup.h>
#ifdef CONFIG_PREEMPT_RT
#include <linux/rcuwait.h>
@@ -16,17 +19,103 @@
* Futex flags used to encode options to functions and preserve them across
* restarts.
*/
+#define FLAGS_SIZE_8 0x0000
+#define FLAGS_SIZE_16 0x0001
+#define FLAGS_SIZE_32 0x0002
+#define FLAGS_SIZE_64 0x0003
+
+#define FLAGS_SIZE_MASK 0x0003
+
#ifdef CONFIG_MMU
-# define FLAGS_SHARED 0x01
+# define FLAGS_SHARED 0x0010
#else
/*
* NOMMU does not have per process address space. Let the compiler optimize
* code away.
*/
-# define FLAGS_SHARED 0x00
+# define FLAGS_SHARED 0x0000
#endif
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
+#define FLAGS_CLOCKRT 0x0020
+#define FLAGS_HAS_TIMEOUT 0x0040
+#define FLAGS_NUMA 0x0080
+#define FLAGS_STRICT 0x0100
+#define FLAGS_MPOL 0x0200
+
+/* FUTEX_ to FLAGS_ */
+static inline unsigned int futex_to_flags(unsigned int op)
+{
+ unsigned int flags = FLAGS_SIZE_32;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ flags |= FLAGS_SHARED;
+
+ if (op & FUTEX_CLOCK_REALTIME)
+ flags |= FLAGS_CLOCKRT;
+
+ return flags;
+}
+
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE)
+
+/* FUTEX2_ to FLAGS_ */
+static inline unsigned int futex2_to_flags(unsigned int flags2)
+{
+ unsigned int flags = flags2 & FUTEX2_SIZE_MASK;
+
+ if (!(flags2 & FUTEX2_PRIVATE))
+ flags |= FLAGS_SHARED;
+
+ if (flags2 & FUTEX2_NUMA)
+ flags |= FLAGS_NUMA;
+
+ if (flags2 & FUTEX2_MPOL)
+ flags |= FLAGS_MPOL;
+
+ return flags;
+}
+
+static inline unsigned int futex_size(unsigned int flags)
+{
+ return 1 << (flags & FLAGS_SIZE_MASK);
+}
+
+static inline bool futex_flags_valid(unsigned int flags)
+{
+ /* Only 64bit futexes for 64bit code */
+ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) {
+ if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64)
+ return false;
+ }
+
+ /* Only 32bit futexes are implemented -- for now */
+ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
+ return false;
+
+ /*
+ * Must be able to represent both FUTEX_NO_NODE and every valid nodeid
+ * in a futex word.
+ */
+ if (flags & FLAGS_NUMA) {
+ int bits = 8 * futex_size(flags);
+ u64 max = ~0ULL;
+
+ max >>= 64 - bits;
+ if (nr_node_ids >= max)
+ return false;
+ }
+
+ return true;
+}
+
+static inline bool futex_validate_input(unsigned int flags, u64 val)
+{
+ int bits = 8 * futex_size(flags);
+
+ if (bits < 64 && (val >> bits))
+ return false;
+
+ return true;
+}
#ifdef CONFIG_FAIL_FUTEX
extern bool should_fail_futex(bool fshared);
@@ -46,6 +135,7 @@ struct futex_hash_bucket {
atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
+ struct futex_private_hash *priv;
} ____cacheline_aligned_in_smp;
/*
@@ -69,17 +159,23 @@ struct futex_pi_state {
union futex_key key;
} __randomize_layout;
+struct futex_q;
+typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
+
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
* @list: priority-sorted list of tasks waiting on this futex
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
+ * @wake: the wake handler for this queue
+ * @wake_data: data associated with the wake handler
* @key: the key the futex is hashed on
* @pi_state: optional priority inheritance state
* @rt_waiter: rt_waiter storage for use with requeue_pi
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
* @requeue_state: State field for futex_requeue_pi()
+ * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true
* @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
*
* We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
@@ -98,12 +194,15 @@ struct futex_q {
struct task_struct *task;
spinlock_t *lock_ptr;
+ futex_wake_fn *wake;
+ void *wake_data;
union futex_key key;
struct futex_pi_state *pi_state;
struct rt_mutex_waiter *rt_waiter;
union futex_key *requeue_pi_key;
u32 bitset;
atomic_t requeue_state;
+ bool drop_hb_ref;
#ifdef CONFIG_PREEMPT_RT
struct rcuwait requeue_wait;
#endif
@@ -116,14 +215,35 @@ enum futex_access {
FUTEX_WRITE
};
-extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
enum futex_access rw);
-
+extern void futex_q_lockptr_lock(struct futex_q *q);
extern struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
int flags, u64 range_ns);
extern struct futex_hash_bucket *futex_hash(union futex_key *key);
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+extern void futex_hash_get(struct futex_hash_bucket *hb);
+extern void futex_hash_put(struct futex_hash_bucket *hb);
+
+extern struct futex_private_hash *futex_private_hash(void);
+extern void futex_private_hash_put(struct futex_private_hash *fph);
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
+static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
+static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
+static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
+#endif
+
+DEFINE_CLASS(hb, struct futex_hash_bucket *,
+ if (_T) futex_hash_put(_T),
+ futex_hash(key), union futex_key *key);
+
+DEFINE_CLASS(private_hash, struct futex_private_hash *,
+ if (_T) futex_private_hash_put(_T),
+ futex_private_hash(), void);
/**
* futex_match - Check whether two futex keys are equal
@@ -141,24 +261,43 @@ static inline int futex_match(union futex_key *key1, union futex_key *key2)
}
extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb);
-extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout);
+ struct futex_q *q, union futex_key *key2,
+ struct task_struct *task);
+extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout);
+extern bool __futex_wake_mark(struct futex_q *q);
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
extern int fault_in_user_writeable(u32 __user *uaddr);
-extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval);
-extern int futex_get_value_locked(u32 *dest, u32 __user *from);
extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);
+static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+ pagefault_enable();
+
+ return ret;
+}
+
+/* Read from user memory with pagefaults disabled */
+static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
+{
+ guard(pagefault)();
+ return get_user_inline(*dest, from);
+}
+
extern void __futex_unqueue(struct futex_q *q);
-extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb);
+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task);
extern int futex_unqueue(struct futex_q *q);
/**
* futex_queue() - Enqueue the futex_q on the futex_hash_bucket
* @q: The futex_q to enqueue
* @hb: The destination hash bucket
+ * @task: Task queueing this futex
*
* The hb->lock must be held by the caller, and is released here. A call to
* futex_queue() is typically paired with exactly one call to futex_unqueue(). The
@@ -166,11 +305,14 @@ extern int futex_unqueue(struct futex_q *q);
* or nothing if the unqueue is done as part of the wake process and the unqueue
* state is implicit in the state of woken task (see futex_wait_requeue_pi() for
* an example).
+ *
+ * Note that @task may be NULL, for async usage of futexes.
*/
-static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
__releases(&hb->lock)
{
- __futex_queue(q, hb);
+ __futex_queue(q, hb, task);
spin_unlock(&hb->lock);
}
@@ -216,7 +358,7 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
#endif
}
-extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q);
+extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb);
extern void futex_q_unlock(struct futex_hash_bucket *hb);
@@ -260,10 +402,14 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
val, ktime_t *abs_time, u32 bitset, u32 __user
*uaddr2);
-extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
- u32 __user *uaddr2, int nr_wake, int nr_requeue,
+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
+ int nr_wake, int nr_requeue,
u32 *cmpval, int requeue_pi);
+extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset);
+
extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset);
@@ -279,6 +425,16 @@ struct futex_vector {
struct futex_q q;
};
+extern int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data);
+
+extern int futex_wait_multiple_setup(struct futex_vector *vs, int count,
+ int *woken);
+
+extern int futex_unqueue_multiple(struct futex_vector *v, int count);
+
extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
struct hrtimer_sleeper *to);
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index ce2889f12375..dacb2330f1fb 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/slab.h>
+#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include "futex.h"
@@ -610,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
/*
* Caller must hold a reference on @pi_state.
*/
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
+ struct rt_mutex_waiter *top_waiter)
{
- struct rt_mutex_waiter *top_waiter;
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_RT_WAKE_Q(wqh);
u32 curval, newval;
int ret = 0;
- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
- if (WARN_ON_ONCE(!top_waiter)) {
- /*
- * As per the comment in futex_unlock_pi() this should not happen.
- *
- * When this happens, give up our locks and try again, giving
- * the futex_lock_pi() instance time to complete, either by
- * waiting on the rtmutex or removing itself from the futex
- * queue.
- */
- ret = -EAGAIN;
- goto out_unlock;
- }
-
new_owner = top_waiter->task;
/*
@@ -818,7 +806,7 @@ handle_err:
break;
}
- spin_lock(q->lock_ptr);
+ futex_q_lockptr_lock(q);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
@@ -932,8 +920,8 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
struct hrtimer_sleeper timeout, *to;
struct task_struct *exiting = NULL;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
+ DEFINE_WAKE_Q(wake_q);
int res, ret;
if (!IS_ENABLED(CONFIG_FUTEX_PI))
@@ -945,132 +933,188 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
to = futex_setup_timer(time, &timeout, flags, 0);
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+ ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
retry_private:
- hb = futex_q_lock(&q);
+ if (1) {
+ CLASS(hb, hb)(&q.key);
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
- &exiting, 0);
- if (unlikely(ret)) {
- /*
- * Atomic work succeeded and we got the lock,
- * or failed. Either way, we do _not_ block.
- */
- switch (ret) {
- case 1:
- /* We got the lock. */
- ret = 0;
- goto out_unlock_put_key;
- case -EFAULT:
- goto uaddr_faulted;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Task is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- futex_q_unlock(hb);
+ futex_q_lock(&q, hb);
+
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
+ if (unlikely(ret)) {
/*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
*/
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock_put_key;
+ switch (ret) {
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Task is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_q_unlock(hb);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock_put_key;
+ }
}
- }
- WARN_ON(!q.pi_state);
+ WARN_ON(!q.pi_state);
- /*
- * Only actually queue now that the atomic ops are done:
- */
- __futex_queue(&q, hb);
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __futex_queue(&q, hb, current);
- if (trylock) {
- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
- /* Fixup the trylock return value: */
- ret = ret ? 0 : -EWOULDBLOCK;
- goto no_block;
- }
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
+ }
- rt_mutex_init_waiter(&rt_waiter);
+ /*
+ * Caution; releasing @hb in-scope. The hb->lock is still locked
+ * while the reference is dropped. The reference can not be dropped
+ * after the unlock because if a user initiated resize is in progress
+ * then we might need to wake him. This can not be done after the
+ * rt_mutex_pre_schedule() invocation. The hb will remain valid because
+ * the thread, performing resize, will block on hb->lock during
+ * the requeue.
+ */
+ futex_hash_put(no_free_ptr(hb));
+ /*
+ * Must be done before we enqueue the waiter, here is unfortunately
+ * under the hb lock, but that *should* work because it does nothing.
+ */
+ rt_mutex_pre_schedule();
- /*
- * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
- * hold it while doing rt_mutex_start_proxy(), because then it will
- * include hb->lock in the blocking chain, even through we'll not in
- * fact hold it while blocking. This will lead it to report -EDEADLK
- * and BUG when futex_unlock_pi() interleaves with this.
- *
- * Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling __rt_mutex_start_proxy_lock(). This
- * interleaves with futex_unlock_pi() -- which does a similar lock
- * handoff -- such that the latter can observe the futex_q::pi_state
- * before __rt_mutex_start_proxy_lock() is done.
- */
- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
- spin_unlock(q.lock_ptr);
- /*
- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
- * such that futex_unlock_pi() is guaranteed to observe the waiter when
- * it sees the futex_q::pi_state.
- */
- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+ rt_mutex_init_waiter(&rt_waiter);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto cleanup;
- }
+ /*
+ * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
+ raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
- if (unlikely(to))
- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto cleanup;
+ }
+
+ if (unlikely(to))
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
cleanup:
- spin_lock(q.lock_ptr);
- /*
- * If we failed to acquire the lock (deadlock/signal/timeout), we must
- * first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
- * lists consistent.
- *
- * In particular; it is important that futex_unlock_pi() can not
- * observe this inconsistency.
- */
- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
- ret = 0;
+ /*
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
+ * unwind the above, however we canont lock hb->lock because
+ * rt_mutex already has a waiter enqueued and hb->lock can itself try
+ * and enqueue an rt_waiter through rtlock.
+ *
+ * Doing the cleanup without holding hb->lock can cause inconsistent
+ * state between hb and pi_state, but only in the direction of not
+ * seeing a waiter that is leaving.
+ *
+ * See futex_unlock_pi(), it deals with this inconsistency.
+ *
+ * There be dragons here, since we must deal with the inconsistency on
+ * the way out (here), it is impossible to detect/warn about the race
+ * the other way around (missing an incoming waiter).
+ *
+ * What could possibly go wrong...
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+ /*
+ * Now that the rt_waiter has been dequeued, it is safe to use
+ * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
+ * the
+ */
+ futex_q_lockptr_lock(&q);
+ /*
+ * Waiter is unqueued.
+ */
+ rt_mutex_post_schedule();
no_block:
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_pi_owner(uaddr, &q, !ret);
- /*
- * If fixup_pi_owner() returned an error, propagate that. If it acquired
- * the lock, clear our -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- futex_unqueue_pi(&q);
- spin_unlock(q.lock_ptr);
- goto out;
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+ if (q.drop_hb_ref) {
+ CLASS(hb, hb)(&q.key);
+ /* Additional reference from futex_unlock_pi() */
+ futex_hash_put(hb);
+ }
+ goto out;
out_unlock_put_key:
- futex_q_unlock(hb);
+ futex_q_unlock(hb);
+ goto out;
+
+uaddr_faulted:
+ futex_q_unlock(hb);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
out:
if (to) {
@@ -1078,18 +1122,6 @@ out:
destroy_hrtimer_on_stack(&to->timer);
}
return ret != -EINTR ? ret : -ERESTARTNOINTR;
-
-uaddr_faulted:
- futex_q_unlock(hb);
-
- ret = fault_in_user_writeable(uaddr);
- if (ret)
- goto out;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- goto retry;
}
/*
@@ -1101,7 +1133,6 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
u32 curval, uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb;
struct futex_q *top_waiter;
int ret;
@@ -1117,12 +1148,13 @@ retry:
if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
if (ret)
return ret;
- hb = futex_hash(&key);
+ CLASS(hb, hb)(&key);
spin_lock(&hb->lock);
+retry_hb:
/*
* Check waiters first. We do not trust user space values at
@@ -1132,6 +1164,7 @@ retry:
top_waiter = futex_top_waiter(hb, &key);
if (top_waiter) {
struct futex_pi_state *pi_state = top_waiter->pi_state;
+ struct rt_mutex_waiter *rt_waiter;
ret = -EINVAL;
if (!pi_state)
@@ -1144,22 +1177,50 @@ retry:
if (pi_state->owner != current)
goto out_unlock;
- get_pi_state(pi_state);
/*
* By taking wait_lock while still holding hb->lock, we ensure
- * there is no point where we hold neither; and therefore
- * wake_futex_p() must observe a state consistent with what we
- * observed.
+ * there is no point where we hold neither; and thereby
+ * wake_futex_pi() must observe any new waiters.
+ *
+ * Since the cleanup: case in futex_lock_pi() removes the
+ * rt_waiter without holding hb->lock, it is possible for
+ * wake_futex_pi() to not find a waiter while the above does,
+ * in this case the waiter is on the way out and it can be
+ * ignored.
*
* In particular; this forces __rt_mutex_start_proxy() to
* complete such that we're guaranteed to observe the
- * rt_waiter. Also see the WARN in wake_futex_pi().
+ * rt_waiter.
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Futex vs rt_mutex waiter state -- if there are no rt_mutex
+ * waiters even though futex thinks there are, then the waiter
+ * is leaving. The entry needs to be removed from the list so a
+ * new futex_lock_pi() is not using this stale PI-state while
+ * the futex is available in user space again.
+ * There can be more than one task on its way out so it needs
+ * to retry.
+ */
+ rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+ if (!rt_waiter) {
+ /*
+ * Acquire a reference for the leaving waiter to ensure
+ * valid futex_q::lock_ptr.
+ */
+ futex_hash_get(hb);
+ top_waiter->drop_hb_ref = true;
+ __futex_unqueue(top_waiter);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ goto retry_hb;
+ }
+
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
/* drops pi_state->pi_mutex.wait_lock */
- ret = wake_futex_pi(uaddr, uval, pi_state);
+ ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
put_pi_state(pi_state);
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index cba8b1a6a4cc..d818b4d47f1b 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/plist.h>
#include <linux/sched/signal.h>
#include "futex.h"
@@ -58,6 +59,7 @@ enum {
const struct futex_q futex_q_init = {
/* list gets initialized in futex_queue()*/
+ .wake = futex_wake_mark,
.key = FUTEX_KEY_INIT,
.bitset = FUTEX_BITSET_MATCH_ANY,
.requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
@@ -85,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
futex_hb_waiters_inc(hb2);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
+ /*
+ * hb1 and hb2 belong to the same futex_hash_bucket_private
+ * because if we managed get a reference on hb1 then it can't be
+ * replaced. Therefore we avoid put(hb1)+get(hb2) here.
+ */
}
q->key = *key2;
}
@@ -223,18 +230,25 @@ static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
struct futex_hash_bucket *hb)
{
- q->key = *key;
+ struct task_struct *task;
+ q->key = *key;
__futex_unqueue(q);
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
-
+ /*
+ * Acquire a reference for the waiter to ensure valid
+ * futex_q::lock_ptr.
+ */
+ futex_hash_get(hb);
+ q->drop_hb_ref = true;
q->lock_ptr = &hb->lock;
+ task = READ_ONCE(q->task);
/* Signal locked state to the waiter */
futex_requeue_pi_complete(q, 1);
- wake_up_state(q->task, TASK_NORMAL);
+ wake_up_state(task, TASK_NORMAL);
}
/**
@@ -269,7 +283,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
union futex_key *key2, struct futex_pi_state **ps,
struct task_struct **exiting, int set_waiters)
{
- struct futex_q *top_waiter = NULL;
+ struct futex_q *top_waiter;
u32 curval;
int ret;
@@ -346,8 +360,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
/**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
* @uaddr1: source futex user address
- * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @flags1: futex flags (FLAGS_SHARED, etc.)
* @uaddr2: target futex user address
+ * @flags2: futex flags (FLAGS_SHARED, etc.)
* @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
* @nr_requeue: number of waiters to requeue (0-INT_MAX)
* @cmpval: @uaddr1 expected value (or %NULL)
@@ -361,13 +376,13 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
* - >=0 - on success, the number of tasks requeued or woken;
* - <0 - on error
*/
-int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int task_count = 0, ret;
struct futex_pi_state *pi_state = NULL;
- struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
@@ -424,10 +439,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
}
retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ ret = get_futex_key(uaddr2, flags2, &key2,
requeue_pi ? FUTEX_WRITE : FUTEX_READ);
if (unlikely(ret != 0))
return ret;
@@ -439,240 +454,242 @@ retry:
if (requeue_pi && futex_match(&key1, &key2))
return -EINVAL;
- hb1 = futex_hash(&key1);
- hb2 = futex_hash(&key2);
-
retry_private:
- futex_hb_waiters_inc(hb2);
- double_lock_hb(hb1, hb2);
+ if (1) {
+ CLASS(hb, hb1)(&key1);
+ CLASS(hb, hb2)(&key2);
- if (likely(cmpval != NULL)) {
- u32 curval;
+ futex_hb_waiters_inc(hb2);
+ double_lock_hb(hb1, hb2);
- ret = futex_get_value_locked(&curval, uaddr1);
+ if (likely(cmpval != NULL)) {
+ u32 curval;
- if (unlikely(ret)) {
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
+ ret = futex_get_value_locked(&curval, uaddr1);
- ret = get_user(curval, uaddr1);
- if (ret)
- return ret;
+ if (unlikely(ret)) {
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
+ ret = get_user(curval, uaddr1);
+ if (ret)
+ return ret;
- goto retry;
- }
- if (curval != *cmpval) {
- ret = -EAGAIN;
- goto out_unlock;
- }
- }
+ if (!(flags1 & FLAGS_SHARED))
+ goto retry_private;
- if (requeue_pi) {
- struct task_struct *exiting = NULL;
+ goto retry;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
- /*
- * Attempt to acquire uaddr2 and wake the top waiter. If we
- * intend to requeue waiters, force setting the FUTEX_WAITERS
- * bit. We force this here where we are able to easily handle
- * faults rather in the requeue loop below.
- *
- * Updates topwaiter::requeue_state if a top waiter exists.
- */
- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state,
- &exiting, nr_requeue);
+ if (requeue_pi) {
+ struct task_struct *exiting = NULL;
- /*
- * At this point the top_waiter has either taken uaddr2 or
- * is waiting on it. In both cases pi_state has been
- * established and an initial refcount on it. In case of an
- * error there's nothing.
- *
- * The top waiter's requeue_state is up to date:
- *
- * - If the lock was acquired atomically (ret == 1), then
- * the state is Q_REQUEUE_PI_LOCKED.
- *
- * The top waiter has been dequeued and woken up and can
- * return to user space immediately. The kernel/user
- * space state is consistent. In case that there must be
- * more waiters requeued the WAITERS bit in the user
- * space futex is set so the top waiter task has to go
- * into the syscall slowpath to unlock the futex. This
- * will block until this requeue operation has been
- * completed and the hash bucket locks have been
- * dropped.
- *
- * - If the trylock failed with an error (ret < 0) then
- * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
- * happened", or Q_REQUEUE_PI_IGNORE when there was an
- * interleaved early wakeup.
- *
- * - If the trylock did not succeed (ret == 0) then the
- * state is either Q_REQUEUE_PI_IN_PROGRESS or
- * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
- * This will be cleaned up in the loop below, which
- * cannot fail because futex_proxy_trylock_atomic() did
- * the same sanity checks for requeue_pi as the loop
- * below does.
- */
- switch (ret) {
- case 0:
- /* We hold a reference on the pi state. */
- break;
-
- case 1:
/*
- * futex_proxy_trylock_atomic() acquired the user space
- * futex. Adjust task_count.
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
*/
- task_count++;
- ret = 0;
- break;
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state,
+ &exiting, nr_requeue);
- /*
- * If the above failed, then pi_state is NULL and
- * waiter::requeue_state is correct.
- */
- case -EFAULT:
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
- ret = fault_in_user_writeable(uaddr2);
- if (!ret)
- goto retry;
- return ret;
- case -EBUSY:
- case -EAGAIN:
/*
- * Two reasons for this:
- * - EBUSY: Owner is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+ * error there's nothing.
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret == 1), then
+ * the state is Q_REQUEUE_PI_LOCKED.
+ *
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+ * space state is consistent. In case that there must be
+ * more waiters requeued the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
*/
- double_unlock_hb(hb1, hb2);
- futex_hb_waiters_dec(hb2);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock;
- }
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (task_count - nr_wake >= nr_requeue)
- break;
-
- if (!futex_match(&this->key, &key1))
- continue;
-
- /*
- * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
- * be paired with each other and no other futex ops.
- *
- * We should never be requeueing a futex_q with a pi_state,
- * which is awaiting a futex_unlock_pi().
- */
- if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter) ||
- this->pi_state) {
- ret = -EINVAL;
- break;
- }
-
- /* Plain futexes just wake or requeue and are done */
- if (!requeue_pi) {
- if (++task_count <= nr_wake)
- futex_wake_mark(&wake_q, this);
- else
- requeue_futex(this, hb1, hb2, &key2);
- continue;
+ switch (ret) {
+ case 0:
+ /* We hold a reference on the pi state. */
+ break;
+
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
+ case -EFAULT:
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ return ret;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Owner is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
}
- /* Ensure we requeue to the expected futex for requeue_pi. */
- if (!futex_match(this->requeue_pi_key, &key2)) {
- ret = -EINVAL;
- break;
- }
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (task_count - nr_wake >= nr_requeue)
+ break;
- /*
- * Requeue nr_requeue waiters and possibly one more in the case
- * of requeue_pi if we couldn't acquire the lock atomically.
- *
- * Prepare the waiter to take the rt_mutex. Take a refcount
- * on the pi_state and store the pointer in the futex_q
- * object of the waiter.
- */
- get_pi_state(pi_state);
+ if (!futex_match(&this->key, &key1))
+ continue;
- /* Don't requeue when the waiter is already on the way out. */
- if (!futex_requeue_pi_prepare(this, pi_state)) {
/*
- * Early woken waiter signaled that it is on the
- * way out. Drop the pi_state reference and try the
- * next waiter. @this->pi_state is still NULL.
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
*/
- put_pi_state(pi_state);
- continue;
- }
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ this->wake(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
+ continue;
+ }
+
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (!futex_match(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
-
- if (ret == 1) {
- /*
- * We got the lock. We do neither drop the refcount
- * on pi_state nor clear this->pi_state because the
- * waiter needs the pi_state for cleaning up the
- * user space value. It will drop the refcount
- * after doing so. this::requeue_state is updated
- * in the wakeup as well.
- */
- requeue_pi_wake_futex(this, &key2, hb2);
- task_count++;
- } else if (!ret) {
- /* Waiter is queued, move it to hb2 */
- requeue_futex(this, hb1, hb2, &key2);
- futex_requeue_pi_complete(this, 0);
- task_count++;
- } else {
- /*
- * rt_mutex_start_proxy_lock() detected a potential
- * deadlock when we tried to queue that waiter.
- * Drop the pi_state reference which we took above
- * and remove the pointer to the state from the
- * waiters futex_q object.
- */
- this->pi_state = NULL;
- put_pi_state(pi_state);
- futex_requeue_pi_complete(this, ret);
/*
- * We stop queueing more waiters and let user space
- * deal with the mess.
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
*/
- break;
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
+ /*
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
+ */
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
+ requeue_futex(this, hb1, hb2, &key2);
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
+ }
}
- }
- /*
- * We took an extra initial reference to the pi_state in
- * futex_proxy_trylock_atomic(). We need to drop it here again.
- */
- put_pi_state(pi_state);
+ /*
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
+ */
+ put_pi_state(pi_state);
out_unlock:
- double_unlock_hb(hb1, hb2);
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ }
wake_up_q(&wake_q);
- futex_hb_waiters_dec(hb2);
return ret ? ret : task_count;
}
@@ -765,7 +782,6 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
struct rt_mutex_base *pi_mutex;
@@ -789,7 +805,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
rt_mutex_init_waiter(&rt_waiter);
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
@@ -801,35 +817,28 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* Prepare to wait on uaddr. On success, it holds hb->lock and q
* is initialized.
*/
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current);
if (ret)
goto out;
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (futex_match(&q.key, &key2)) {
- futex_q_unlock(hb);
- ret = -EINVAL;
- goto out;
- }
-
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue(hb, &q, to);
+ futex_do_wait(&q, to);
switch (futex_requeue_pi_wakeup_sync(&q)) {
case Q_REQUEUE_PI_IGNORE:
- /* The waiter is still on uaddr1 */
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, to);
- spin_unlock(&hb->lock);
+ {
+ CLASS(hb, hb)(&q.key);
+ /* The waiter is still on uaddr1 */
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
+ spin_unlock(&hb->lock);
+ }
break;
case Q_REQUEUE_PI_LOCKED:
/* The requeue acquired the lock */
if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
+ futex_q_lockptr_lock(&q);
ret = fixup_pi_owner(uaddr2, &q, true);
/*
* Drop the reference to the pi state which the
@@ -850,11 +859,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
- /* Current is not longer pi_blocked_on */
- spin_lock(q.lock_ptr);
+ /*
+ * See futex_unlock_pi()'s cleanup: comment.
+ */
if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
ret = 0;
+ futex_q_lockptr_lock(&q);
debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
@@ -886,6 +897,11 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
default:
BUG();
}
+ if (q.drop_hb_ref) {
+ CLASS(hb, hb)(&q.key);
+ /* Additional reference from requeue_pi_wake_futex() */
+ futex_hash_put(hb);
+ }
out:
if (to) {
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index a8074079b09e..880c9bf2f315 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/time_namespace.h>
@@ -40,6 +39,56 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
return 0;
}
+static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat)
+{
+#ifdef CONFIG_COMPAT
+ if (compat)
+ return p->compat_robust_list;
+#endif
+ return p->robust_list;
+}
+
+static void __user *futex_get_robust_list_common(int pid, bool compat)
+{
+ struct task_struct *p = current;
+ void __user *head;
+ int ret;
+
+ scoped_guard(rcu) {
+ if (pid) {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ return (void __user *)ERR_PTR(-ESRCH);
+ }
+ get_task_struct(p);
+ }
+
+ /*
+ * Hold exec_update_lock to serialize with concurrent exec()
+ * so ptrace_may_access() is checked against stable credentials
+ */
+ ret = down_read_killable(&p->signal->exec_update_lock);
+ if (ret)
+ goto err_put;
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = futex_task_robust_list(p, compat);
+
+ up_read(&p->signal->exec_update_lock);
+ put_task_struct(p);
+
+ return head;
+
+err_unlock:
+ up_read(&p->signal->exec_update_lock);
+err_put:
+ put_task_struct(p);
+ return (void __user *)ERR_PTR(ret);
+}
+
/**
* sys_get_robust_list() - Get the robust-futex list head of a task
* @pid: pid of the process [zero for current task]
@@ -50,50 +99,25 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
struct robust_list_head __user * __user *, head_ptr,
size_t __user *, len_ptr)
{
- struct robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
+ struct robust_list_head __user *head = futex_get_robust_list_common(pid, false);
- head = p->robust_list;
- rcu_read_unlock();
+ if (IS_ERR(head))
+ return PTR_ERR(head);
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
+ unsigned int flags = futex_to_flags(op);
int cmd = op & FUTEX_CMD_MASK;
- unsigned int flags = 0;
-
- if (!(op & FUTEX_PRIVATE_FLAG))
- flags |= FLAGS_SHARED;
- if (op & FUTEX_CLOCK_REALTIME) {
- flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
+ if (flags & FLAGS_CLOCKRT) {
+ if (cmd != FUTEX_WAIT_BITSET &&
+ cmd != FUTEX_WAIT_REQUEUE_PI &&
cmd != FUTEX_LOCK_PI2)
return -ENOSYS;
}
@@ -110,9 +134,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_WAKE_BITSET:
return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
case FUTEX_CMP_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
case FUTEX_WAKE_OP:
return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
case FUTEX_LOCK_PI:
@@ -129,7 +153,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
uaddr2);
case FUTEX_CMP_REQUEUE_PI:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
}
return -ENOSYS;
}
@@ -183,43 +207,91 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
-/* Mask of available flags for each futex in futex_waitv list */
-#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
-
/**
* futex_parse_waitv - Parse a waitv array from userspace
* @futexv: Kernel side list of waiters to be filled
* @uwaitv: Userspace list to be parsed
* @nr_futexes: Length of futexv
+ * @wake: Wake to call when futex is woken
+ * @wake_data: Data for the wake handler
*
* Return: Error code on failure, 0 on success
*/
-static int futex_parse_waitv(struct futex_vector *futexv,
- struct futex_waitv __user *uwaitv,
- unsigned int nr_futexes)
+int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data)
{
struct futex_waitv aux;
unsigned int i;
for (i = 0; i < nr_futexes; i++) {
+ unsigned int flags;
+
if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
return -EFAULT;
- if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
+ if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
+ return -EINVAL;
+
+ flags = futex2_to_flags(aux.flags);
+ if (!futex_flags_valid(flags))
return -EINVAL;
- if (!(aux.flags & FUTEX_32))
+ if (!futex_validate_input(flags, aux.val))
return -EINVAL;
- futexv[i].w.flags = aux.flags;
+ futexv[i].w.flags = flags;
futexv[i].w.val = aux.val;
futexv[i].w.uaddr = aux.uaddr;
futexv[i].q = futex_q_init;
+ futexv[i].q.wake = wake;
+ futexv[i].q.wake_data = wake_data;
}
return 0;
}
+static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
+ clockid_t clockid, struct hrtimer_sleeper *to)
+{
+ int flag_clkid = 0, flag_init = 0;
+ struct timespec64 ts;
+ ktime_t time;
+ int ret;
+
+ if (!timeout)
+ return 0;
+
+ if (clockid == CLOCK_REALTIME) {
+ flag_clkid = FLAGS_CLOCKRT;
+ flag_init = FUTEX_CLOCK_REALTIME;
+ }
+
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+ return -EINVAL;
+
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+
+ /*
+ * Since there's no opcode for futex_waitv, use
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
+ */
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+ if (ret)
+ return ret;
+
+ futex_setup_timer(&time, to, flag_clkid, 0);
+ return 0;
+}
+
+static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
+{
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+}
+
/**
* sys_futex_waitv - Wait on a list of futexes
* @waiters: List of futexes to wait on
@@ -249,8 +321,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
{
struct hrtimer_sleeper to;
struct futex_vector *futexv;
- struct timespec64 ts;
- ktime_t time;
int ret;
/* This syscall supports no flags for now */
@@ -260,30 +330,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
return -EINVAL;
- if (timeout) {
- int flag_clkid = 0, flag_init = 0;
-
- if (clockid == CLOCK_REALTIME) {
- flag_clkid = FLAGS_CLOCKRT;
- flag_init = FUTEX_CLOCK_REALTIME;
- }
-
- if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
- return -EINVAL;
-
- if (get_timespec64(&ts, timeout))
- return -EFAULT;
-
- /*
- * Since there's no opcode for futex_waitv, use
- * FUTEX_WAIT_BITSET that uses absolute timeout as well
- */
- ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
- if (ret)
- return ret;
-
- futex_setup_timer(&time, &to, flag_clkid, 0);
- }
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
if (!futexv) {
@@ -291,20 +339,133 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
goto destroy_timer;
}
- ret = futex_parse_waitv(futexv, waiters, nr_futexes);
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
+ NULL);
if (!ret)
ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
kfree(futexv);
destroy_timer:
- if (timeout) {
- hrtimer_cancel(&to.timer);
- destroy_hrtimer_on_stack(&to.timer);
- }
+ if (timeout)
+ futex2_destroy_timeout(&to);
return ret;
}
+/*
+ * sys_futex_wake - Wake a number of futexes
+ * @uaddr: Address of the futex(es) to wake
+ * @mask: bitmask
+ * @nr: Number of the futexes to wake
+ * @flags: FUTEX2 flags
+ *
+ * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_wake,
+ void __user *, uaddr,
+ unsigned long, mask,
+ int, nr,
+ unsigned int, flags)
+{
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
+}
+
+/*
+ * sys_futex_wait - Wait on a futex
+ * @uaddr: Address of the futex to wait on
+ * @val: Value of @uaddr
+ * @mask: bitmask
+ * @flags: FUTEX2 flags
+ * @timeout: Optional absolute timeout
+ * @clockid: Clock to be used for the timeout, realtime or monotonic
+ *
+ * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
+ * futex2 familiy of calls.
+ */
+
+SYSCALL_DEFINE6(futex_wait,
+ void __user *, uaddr,
+ unsigned long, val,
+ unsigned long, mask,
+ unsigned int, flags,
+ struct __kernel_timespec __user *, timeout,
+ clockid_t, clockid)
+{
+ struct hrtimer_sleeper to;
+ int ret;
+
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, val) ||
+ !futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
+
+ ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
+
+ if (timeout)
+ futex2_destroy_timeout(&to);
+
+ return ret;
+}
+
+/*
+ * sys_futex_requeue - Requeue a waiter from one futex to another
+ * @waiters: array describing the source and destination futex
+ * @flags: unused
+ * @nr_wake: number of futexes to wake
+ * @nr_requeue: number of futexes to requeue
+ *
+ * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_requeue,
+ struct futex_waitv __user *, waiters,
+ unsigned int, flags,
+ int, nr_wake,
+ int, nr_requeue)
+{
+ struct futex_vector futexes[2];
+ u32 cmpval;
+ int ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!waiters)
+ return -EINVAL;
+
+ ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
+ if (ret)
+ return ret;
+
+ cmpval = futexes[0].w.val;
+
+ return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
+ u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
+ nr_wake, nr_requeue, &cmpval, 0);
+}
+
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(set_robust_list,
struct compat_robust_list_head __user *, head,
@@ -322,36 +483,14 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
compat_uptr_t __user *, head_ptr,
compat_size_t __user *, len_ptr)
{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
+ struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true);
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->compat_robust_list;
- rcu_read_unlock();
+ if (IS_ERR(head))
+ return PTR_ERR(head);
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
}
#endif /* CONFIG_COMPAT */
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ba01b9408203..e2bbe5509ec2 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/plist.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/freezer.h>
@@ -106,20 +107,11 @@
* double_lock_hb() and double_unlock_hb(), respectively.
*/
-/*
- * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed. Callers
- * must ensure to later call wake_up_q() for the actual
- * wakeups to occur.
- */
-void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+bool __futex_wake_mark(struct futex_q *q)
{
- struct task_struct *p = q->task;
-
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
- return;
+ return false;
- get_task_struct(p);
__futex_unqueue(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@@ -130,6 +122,26 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
*/
smp_store_release(&q->lock_ptr, NULL);
+ return true;
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
+ */
+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct task_struct *p = q->task;
+
+ get_task_struct(p);
+
+ if (!__futex_wake_mark(q)) {
+ put_task_struct(p);
+ return;
+ }
+
/*
* Queue the task for later wakeup for after we've released
* the hb->lock.
@@ -142,20 +154,22 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
*/
int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
- struct futex_hash_bucket *hb;
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
- int ret;
DEFINE_WAKE_Q(wake_q);
+ int ret;
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
- hb = futex_hash(&key);
+ if ((flags & FLAGS_STRICT) && !nr_wake)
+ return 0;
+
+ CLASS(hb, hb)(&key);
/* Make sure we really have tasks to wakeup */
if (!futex_hb_waiters_pending(hb))
@@ -174,7 +188,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
- futex_wake_mark(&wake_q, this);
+ this->wake(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@@ -195,13 +209,12 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
if (oparg < 0 || oparg > 31) {
- char comm[sizeof(current->comm)];
/*
* kill this print and return -EINVAL when userspace
* is sane again
*/
pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
- get_task_comm(comm, current), oparg);
+ current->comm, oparg);
oparg &= 31;
}
oparg = 1 << oparg;
@@ -239,80 +252,81 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
DEFINE_WAKE_Q(wake_q);
retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+ ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
if (unlikely(ret != 0))
return ret;
- hb1 = futex_hash(&key1);
- hb2 = futex_hash(&key2);
-
retry_private:
- double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, uaddr2);
- if (unlikely(op_ret < 0)) {
- double_unlock_hb(hb1, hb2);
-
- if (!IS_ENABLED(CONFIG_MMU) ||
- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
- /*
- * we don't get EFAULT from MMU faults if we don't have
- * an MMU, but we might get them from range checking
- */
- ret = op_ret;
- return ret;
- }
-
- if (op_ret == -EFAULT) {
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
+ if (1) {
+ CLASS(hb, hb1)(&key1);
+ CLASS(hb, hb2)(&key2);
+
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+ double_unlock_hb(hb1, hb2);
+
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
return ret;
- }
-
- cond_resched();
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
- goto retry;
- }
+ }
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (futex_match (&this->key, &key1)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ return ret;
}
- futex_wake_mark(&wake_q, this);
- if (++ret >= nr_wake)
- break;
+
+ cond_resched();
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+ goto retry;
}
- }
- if (op_ret > 0) {
- op_ret = 0;
- plist_for_each_entry_safe(this, next, &hb2->chain, list) {
- if (futex_match (&this->key, &key2)) {
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (futex_match(&this->key, &key1)) {
if (this->pi_state || this->rt_waiter) {
ret = -EINVAL;
goto out_unlock;
}
- futex_wake_mark(&wake_q, this);
- if (++op_ret >= nr_wake2)
+ this->wake(&wake_q, this);
+ if (++ret >= nr_wake)
break;
}
}
- ret += op_ret;
- }
+
+ if (op_ret > 0) {
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
+ if (futex_match(&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ this->wake(&wake_q, this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
out_unlock:
- double_unlock_hb(hb1, hb2);
+ double_unlock_hb(hb1, hb2);
+ }
wake_up_q(&wake_q);
return ret;
}
@@ -320,23 +334,12 @@ out_unlock:
static long futex_wait_restart(struct restart_block *restart);
/**
- * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal
- * @hb: the futex hash bucket, must be locked by the caller
+ * futex_do_wait() - wait for wakeup, timeout, or signal
* @q: the futex_q to queue up on
* @timeout: the prepared hrtimer_sleeper, or null for no timeout
*/
-void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout)
+void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout)
{
- /*
- * The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using smp_store_mb() and
- * futex_queue() calls spin_unlock() upon completion, both serializing
- * access to the hash list and forcing another memory barrier.
- */
- set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
- futex_queue(q, hb);
-
/* Arm the timer */
if (timeout)
hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
@@ -358,7 +361,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
}
/**
- * unqueue_multiple - Remove various futexes from their hash bucket
+ * futex_unqueue_multiple - Remove various futexes from their hash bucket
* @v: The list of futexes to unqueue
* @count: Number of futexes in the list
*
@@ -368,7 +371,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* - >=0 - Index of the last futex that was awoken;
* - -1 - No futex was awoken
*/
-static int unqueue_multiple(struct futex_vector *v, int count)
+int futex_unqueue_multiple(struct futex_vector *v, int count)
{
int ret = -1, i;
@@ -396,14 +399,19 @@ static int unqueue_multiple(struct futex_vector *v, int count)
* - 0 - Success
* - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
*/
-static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
+int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
- struct futex_hash_bucket *hb;
bool retry = false;
int ret, i;
u32 uval;
/*
+ * Make sure to have a reference on the private_hash such that we
+ * don't block on rehash after changing the task state below.
+ */
+ guard(private_hash)();
+
+ /*
* Enqueuing multiple futexes is tricky, because we need to enqueue
* each futex on the list before dealing with the next one to avoid
* deadlocking on the hash bucket. But, before enqueuing, we need to
@@ -419,11 +427,11 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo
*/
retry:
for (i = 0; i < count; i++) {
- if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
+ if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
continue;
ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
- !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
+ vs[i].w.flags,
&vs[i].q.key, FUTEX_READ);
if (unlikely(ret))
@@ -435,22 +443,26 @@ retry:
for (i = 0; i < count; i++) {
u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
struct futex_q *q = &vs[i].q;
- u32 val = (u32)vs[i].w.val;
+ u32 val = vs[i].w.val;
- hb = futex_q_lock(q);
- ret = futex_get_value_locked(&uval, uaddr);
+ if (1) {
+ CLASS(hb, hb)(&q->key);
- if (!ret && uval == val) {
- /*
- * The bucket lock can't be held while dealing with the
- * next futex. Queue each futex at this moment so hb can
- * be unlocked.
- */
- futex_queue(q, hb);
- continue;
- }
+ futex_q_lock(q, hb);
+ ret = futex_get_value_locked(&uval, uaddr);
- futex_q_unlock(hb);
+ if (!ret && uval == val) {
+ /*
+ * The bucket lock can't be held while dealing with the
+ * next futex. Queue each futex at this moment so hb can
+ * be unlocked.
+ */
+ futex_queue(q, hb, current);
+ continue;
+ }
+
+ futex_q_unlock(hb);
+ }
__set_current_state(TASK_RUNNING);
/*
@@ -458,7 +470,7 @@ retry:
* was woken, we don't return error and return this index to
* userspace
*/
- *woken = unqueue_multiple(vs, i);
+ *woken = futex_unqueue_multiple(vs, i);
if (*woken >= 0)
return 1;
@@ -543,7 +555,7 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
__set_current_state(TASK_RUNNING);
- ret = unqueue_multiple(vs, count);
+ ret = futex_unqueue_multiple(vs, count);
if (ret >= 0)
return ret;
@@ -564,7 +576,8 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
* @val: the expected value
* @flags: futex flags (FLAGS_SHARED, etc.)
* @q: the associated futex_q
- * @hb: storage for hash_bucket pointer to be returned to caller
+ * @key2: the second futex_key if used for requeue PI
+ * @task: Task queueing this futex
*
* Setup the futex_q and locate the hash_bucket. Get the futex value and
* compare it with the expected value. Handle atomic faults internally.
@@ -572,10 +585,12 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
*
* Return:
* - 0 - uaddr contains val and hb has been locked;
- * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not
+ * be read, does not contain the expected value or is not properly aligned.
*/
int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb)
+ struct futex_q *q, union futex_key *key2,
+ struct task_struct *task)
{
u32 uval;
int ret;
@@ -599,69 +614,84 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
* while the syscall executes.
*/
retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+ ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
if (unlikely(ret != 0))
return ret;
retry_private:
- *hb = futex_q_lock(q);
+ if (1) {
+ CLASS(hb, hb)(&q->key);
- ret = futex_get_value_locked(&uval, uaddr);
+ futex_q_lock(q, hb);
- if (ret) {
- futex_q_unlock(*hb);
+ ret = futex_get_value_locked(&uval, uaddr);
- ret = get_user(uval, uaddr);
- if (ret)
- return ret;
+ if (ret) {
+ futex_q_unlock(hb);
+
+ ret = get_user(uval, uaddr);
+ if (ret)
+ return ret;
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
- goto retry;
- }
+ goto retry;
+ }
- if (uval != val) {
- futex_q_unlock(*hb);
- ret = -EWOULDBLOCK;
+ if (uval != val) {
+ futex_q_unlock(hb);
+ return -EWOULDBLOCK;
+ }
+
+ if (key2 && futex_match(&q->key, key2)) {
+ futex_q_unlock(hb);
+ return -EINVAL;
+ }
+
+ /*
+ * The task state is guaranteed to be set before another task can
+ * wake it. set_current_state() is implemented using smp_store_mb() and
+ * futex_queue() calls spin_unlock() upon completion, both serializing
+ * access to the hash list and forcing another memory barrier.
+ */
+ if (task == current)
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ futex_queue(q, hb, task);
}
return ret;
}
-int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset)
{
- struct hrtimer_sleeper timeout, *to;
- struct restart_block *restart;
- struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int ret;
if (!bitset)
return -EINVAL;
+
q.bitset = bitset;
- to = futex_setup_timer(abs_time, &timeout, flags,
- current->timer_slack_ns);
retry:
/*
* Prepare to wait on uaddr. On success, it holds hb->lock and q
* is initialized.
*/
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current);
if (ret)
- goto out;
+ return ret;
/* futex_queue and wait for wakeup, timeout, or a signal. */
- futex_wait_queue(hb, &q, to);
+ futex_do_wait(&q, to);
/* If we were woken (and unqueued), we succeeded, whatever. */
- ret = 0;
if (!futex_unqueue(&q))
- goto out;
- ret = -ETIMEDOUT;
+ return 0;
+
if (to && !to->task)
- goto out;
+ return -ETIMEDOUT;
/*
* We expect signal_pending(current), but we might be the
@@ -670,24 +700,38 @@ retry:
if (!signal_pending(current))
goto retry;
- ret = -ERESTARTSYS;
- if (!abs_time)
- goto out;
+ return -ERESTARTSYS;
+}
+
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct restart_block *restart;
+ int ret;
- restart = &current->restart_block;
- restart->futex.uaddr = uaddr;
- restart->futex.val = val;
- restart->futex.time = *abs_time;
- restart->futex.bitset = bitset;
- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
- ret = set_restart_fn(restart, futex_wait_restart);
+ ret = __futex_wait(uaddr, flags, val, to, bitset);
+
+ /* No timeout, nothing to clean up. */
+ if (!to)
+ return ret;
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+
+ if (ret == -ERESTARTSYS) {
+ restart = &current->restart_block;
+ restart->futex.uaddr = uaddr;
+ restart->futex.val = val;
+ restart->futex.time = *abs_time;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+
+ return set_restart_fn(restart, futex_wait_restart);
}
+
return ret;
}