summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/async.c20
-rw-r--r--kernel/bpf/core.c32
-rw-r--r--kernel/cgroup/cpuset.c4
-rw-r--r--kernel/compat.c2
-rw-r--r--kernel/configs/tiny.config4
-rw-r--r--kernel/fork.c60
-rw-r--r--kernel/futex.c20
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/kcov.c4
-rw-r--r--kernel/memremap.c174
-rw-r--r--kernel/module.c7
-rw-r--r--kernel/pid.c13
-rw-r--r--kernel/ptrace.c27
-rw-r--r--kernel/relay.c1
-rw-r--r--kernel/resource.c39
-rw-r--r--kernel/sched/core.c76
-rw-r--r--kernel/sched/fair.c101
-rw-r--r--kernel/sched/membarrier.c177
-rw-r--r--kernel/sched/rt.c29
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/sched/stats.h6
-rw-r--r--kernel/sched/topology.c13
-rw-r--r--kernel/sysctl.c33
-rw-r--r--kernel/taskstats.c6
-rw-r--r--kernel/time/hrtimer.c1
25 files changed, 508 insertions, 344 deletions
diff --git a/kernel/async.c b/kernel/async.c
index 2cbd3dd5940d..a893d6170944 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -84,20 +84,24 @@ static atomic_t entry_count;
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- struct list_head *pending;
+ struct async_entry *first = NULL;
async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
spin_lock_irqsave(&async_lock, flags);
- if (domain)
- pending = &domain->pending;
- else
- pending = &async_global_pending;
+ if (domain) {
+ if (!list_empty(&domain->pending))
+ first = list_first_entry(&domain->pending,
+ struct async_entry, domain_list);
+ } else {
+ if (!list_empty(&async_global_pending))
+ first = list_first_entry(&async_global_pending,
+ struct async_entry, global_list);
+ }
- if (!list_empty(pending))
- ret = list_first_entry(pending, struct async_entry,
- domain_list)->cookie;
+ if (first)
+ ret = first->cookie;
spin_unlock_irqrestore(&async_lock, flags);
return ret;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5f35f93dcab2..29ca9208dcfa 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1576,25 +1576,41 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
__u32 __user *prog_ids, u32 cnt)
{
struct bpf_prog **prog;
- u32 i = 0, id;
-
+ unsigned long err = 0;
+ u32 i = 0, *ids;
+ bool nospc;
+
+ /* users of this function are doing:
+ * cnt = bpf_prog_array_length();
+ * if (cnt > 0)
+ * bpf_prog_array_copy_to_user(..., cnt);
+ * so below kcalloc doesn't need extra cnt > 0 check, but
+ * bpf_prog_array_length() releases rcu lock and
+ * prog array could have been swapped with empty or larger array,
+ * so always copy 'cnt' prog_ids to the user.
+ * In a rare race the user will see zero prog_ids
+ */
+ ids = kcalloc(cnt, sizeof(u32), GFP_USER);
+ if (!ids)
+ return -ENOMEM;
rcu_read_lock();
prog = rcu_dereference(progs)->progs;
for (; *prog; prog++) {
if (*prog == &dummy_bpf_prog.prog)
continue;
- id = (*prog)->aux->id;
- if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
- rcu_read_unlock();
- return -EFAULT;
- }
+ ids[i] = (*prog)->aux->id;
if (++i == cnt) {
prog++;
break;
}
}
+ nospc = !!(*prog);
rcu_read_unlock();
- if (*prog)
+ err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
+ kfree(ids);
+ if (err)
+ return -EFAULT;
+ if (nospc)
return -ENOSPC;
return 0;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f7efa7b4d825..b42037e6e81d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1254,9 +1254,9 @@ done:
return retval;
}
-int current_cpuset_is_being_rebound(void)
+bool current_cpuset_is_being_rebound(void)
{
- int ret;
+ bool ret;
rcu_read_lock();
ret = task_cs(current) == cpuset_being_rebound;
diff --git a/kernel/compat.c b/kernel/compat.c
index d1cee656a7ed..3247fe761f60 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -355,7 +355,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- size_t retlen = min_t(size_t, len, cpumask_size());
+ unsigned int retlen = min(len, cpumask_size());
if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
ret = -EFAULT;
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 7fa0c4ae6394..9bfdffc100da 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -10,3 +10,7 @@ CONFIG_OPTIMIZE_INLINING=y
# CONFIG_SLAB is not set
# CONFIG_SLUB is not set
CONFIG_SLOB=y
+CONFIG_CC_STACKPROTECTOR_NONE=y
+# CONFIG_CC_STACKPROTECTOR_REGULAR is not set
+# CONFIG_CC_STACKPROTECTOR_STRONG is not set
+# CONFIG_CC_STACKPROTECTOR_AUTO is not set
diff --git a/kernel/fork.c b/kernel/fork.c
index 5c372c954f3b..be8aa5b98666 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -606,6 +606,11 @@ static void __mmdrop(struct mm_struct *mm)
void mmdrop(struct mm_struct *mm)
{
+ /*
+ * The implicit full barrier implied by atomic_dec_and_test() is
+ * required by the membarrier system call before returning to
+ * user-space, after storing to rq->curr.
+ */
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
__mmdrop(mm);
}
@@ -1587,6 +1592,10 @@ static __latent_entropy struct task_struct *copy_process(
int retval;
struct task_struct *p;
+ /*
+ * Don't allow sharing the root directory with processes in a different
+ * namespace
+ */
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -2062,6 +2071,8 @@ long _do_fork(unsigned long clone_flags,
int __user *child_tidptr,
unsigned long tls)
{
+ struct completion vfork;
+ struct pid *pid;
struct task_struct *p;
int trace = 0;
long nr;
@@ -2087,43 +2098,40 @@ long _do_fork(unsigned long clone_flags,
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
add_latent_entropy();
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
- if (!IS_ERR(p)) {
- struct completion vfork;
- struct pid *pid;
-
- trace_sched_process_fork(current, p);
+ trace_sched_process_fork(current, p);
- pid = get_task_pid(p, PIDTYPE_PID);
- nr = pid_vnr(pid);
+ pid = get_task_pid(p, PIDTYPE_PID);
+ nr = pid_vnr(pid);
- if (clone_flags & CLONE_PARENT_SETTID)
- put_user(nr, parent_tidptr);
+ if (clone_flags & CLONE_PARENT_SETTID)
+ put_user(nr, parent_tidptr);
- if (clone_flags & CLONE_VFORK) {
- p->vfork_done = &vfork;
- init_completion(&vfork);
- get_task_struct(p);
- }
-
- wake_up_new_task(p);
+ if (clone_flags & CLONE_VFORK) {
+ p->vfork_done = &vfork;
+ init_completion(&vfork);
+ get_task_struct(p);
+ }
- /* forking complete and child started to run, tell ptracer */
- if (unlikely(trace))
- ptrace_event_pid(trace, pid);
+ wake_up_new_task(p);
- if (clone_flags & CLONE_VFORK) {
- if (!wait_for_vfork_done(p, &vfork))
- ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
- }
+ /* forking complete and child started to run, tell ptracer */
+ if (unlikely(trace))
+ ptrace_event_pid(trace, pid);
- put_pid(pid);
- } else {
- nr = PTR_ERR(p);
+ if (clone_flags & CLONE_VFORK) {
+ if (!wait_for_vfork_done(p, &vfork))
+ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
+
+ put_pid(pid);
return nr;
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 7f719d110908..1f450e092c74 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -862,24 +862,6 @@ static void put_pi_state(struct futex_pi_state *pi_state)
}
}
-/*
- * Look up the task based on what TID userspace gave us.
- * We dont trust it.
- */
-static struct task_struct *futex_find_get_task(pid_t pid)
-{
- struct task_struct *p;
-
- rcu_read_lock();
- p = find_task_by_vpid(pid);
- if (p)
- get_task_struct(p);
-
- rcu_read_unlock();
-
- return p;
-}
-
#ifdef CONFIG_FUTEX_PI
/*
@@ -1183,7 +1165,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
*/
if (!pid)
return -ESRCH;
- p = futex_find_get_task(pid);
+ p = find_get_task_by_vpid(pid);
if (!p)
return -ESRCH;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index ef2a47e0eab6..6cdecc6f4c53 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -10,7 +10,6 @@
#include <linux/jiffies.h>
#include <linux/irq.h>
#include <linux/module.h>
-#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/moduleparam.h>
#include <linux/timer.h>
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 7594c033d98a..2c16f1ab5e10 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -358,7 +358,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
*/
if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
return -EINVAL;
- if (kcov->t != NULL)
+ t = current;
+ if (kcov->t != NULL || t->kcov != NULL)
return -EBUSY;
if (arg == KCOV_TRACE_PC)
kcov->mode = KCOV_MODE_TRACE_PC;
@@ -370,7 +371,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
#endif
else
return -EINVAL;
- t = current;
/* Cache in task struct for performance. */
t->kcov_size = kcov->size;
t->kcov_area = kcov->area;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 403ab9cdb949..4849be5f9b3c 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -188,13 +188,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-struct page_map {
- struct resource res;
- struct percpu_ref *ref;
- struct dev_pagemap pgmap;
- struct vmem_altmap altmap;
-};
-
static unsigned long order_at(struct resource *res, unsigned long pgoff)
{
unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
@@ -248,34 +241,36 @@ int device_private_entry_fault(struct vm_area_struct *vma,
EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */
-static void pgmap_radix_release(struct resource *res)
+static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff)
{
unsigned long pgoff, order;
mutex_lock(&pgmap_lock);
- foreach_order_pgoff(res, order, pgoff)
+ foreach_order_pgoff(res, order, pgoff) {
+ if (pgoff >= end_pgoff)
+ break;
radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
+ }
mutex_unlock(&pgmap_lock);
synchronize_rcu();
}
-static unsigned long pfn_first(struct page_map *page_map)
+static unsigned long pfn_first(struct dev_pagemap *pgmap)
{
- struct dev_pagemap *pgmap = &page_map->pgmap;
- const struct resource *res = &page_map->res;
- struct vmem_altmap *altmap = pgmap->altmap;
+ const struct resource *res = &pgmap->res;
+ struct vmem_altmap *altmap = &pgmap->altmap;
unsigned long pfn;
pfn = res->start >> PAGE_SHIFT;
- if (altmap)
+ if (pgmap->altmap_valid)
pfn += vmem_altmap_offset(altmap);
return pfn;
}
-static unsigned long pfn_end(struct page_map *page_map)
+static unsigned long pfn_end(struct dev_pagemap *pgmap)
{
- const struct resource *res = &page_map->res;
+ const struct resource *res = &pgmap->res;
return (res->start + resource_size(res)) >> PAGE_SHIFT;
}
@@ -283,15 +278,15 @@ static unsigned long pfn_end(struct page_map *page_map)
#define for_each_device_pfn(pfn, map) \
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
-static void devm_memremap_pages_release(struct device *dev, void *data)
+static void devm_memremap_pages_release(void *data)
{
- struct page_map *page_map = data;
- struct resource *res = &page_map->res;
+ struct dev_pagemap *pgmap = data;
+ struct device *dev = pgmap->dev;
+ struct resource *res = &pgmap->res;
resource_size_t align_start, align_size;
- struct dev_pagemap *pgmap = &page_map->pgmap;
unsigned long pfn;
- for_each_device_pfn(pfn, page_map)
+ for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
if (percpu_ref_tryget_live(pgmap->ref)) {
@@ -301,56 +296,51 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
+ - align_start;
mem_hotplug_begin();
- arch_remove_memory(align_start, align_size);
+ arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
+ &pgmap->altmap : NULL);
mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
- pgmap_radix_release(res);
- dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
- "%s: failed to free all reserved pages\n", __func__);
-}
-
-/* assumes rcu_read_lock() held at entry */
-struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
-{
- struct page_map *page_map;
-
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
- return page_map ? &page_map->pgmap : NULL;
+ pgmap_radix_release(res, -1);
+ dev_WARN_ONCE(dev, pgmap->altmap.alloc,
+ "%s: failed to free all reserved pages\n", __func__);
}
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @dev: hosting device for @res
- * @res: "host memory" address range
- * @ref: a live per-cpu reference count
- * @altmap: optional descriptor for allocating the memmap from @res
+ * @pgmap: pointer to a struct dev_pgmap
*
* Notes:
- * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
- * (or devm release event). The expected order of events is that @ref has
+ * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
+ * by the caller before passing it to this function
+ *
+ * 2/ The altmap field may optionally be initialized, in which case altmap_valid
+ * must be set to true
+ *
+ * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
+ * time (or devm release event). The expected order of events is that ref has
* been through percpu_ref_kill() before devm_memremap_pages_release(). The
* wait for the completion of all references being dropped and
* percpu_ref_exit() must occur after devm_memremap_pages_release().
*
- * 2/ @res is expected to be a host memory range that could feasibly be
+ * 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
* this is not enforced.
*/
-void *devm_memremap_pages(struct device *dev, struct resource *res,
- struct percpu_ref *ref, struct vmem_altmap *altmap)
+void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
resource_size_t align_start, align_size, align_end;
+ struct vmem_altmap *altmap = pgmap->altmap_valid ?
+ &pgmap->altmap : NULL;
unsigned long pfn, pgoff, order;
pgprot_t pgprot = PAGE_KERNEL;
- struct dev_pagemap *pgmap;
- struct page_map *page_map;
int error, nid, is_ram, i = 0;
+ struct resource *res = &pgmap->res;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -367,47 +357,18 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
- if (!ref)
+ if (!pgmap->ref)
return ERR_PTR(-EINVAL);
- page_map = devres_alloc_node(devm_memremap_pages_release,
- sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
- if (!page_map)
- return ERR_PTR(-ENOMEM);
- pgmap = &page_map->pgmap;
-
- memcpy(&page_map->res, res, sizeof(*res));
-
pgmap->dev = dev;
- if (altmap) {
- memcpy(&page_map->altmap, altmap, sizeof(*altmap));
- pgmap->altmap = &page_map->altmap;
- }
- pgmap->ref = ref;
- pgmap->res = &page_map->res;
- pgmap->type = MEMORY_DEVICE_HOST;
- pgmap->page_fault = NULL;
- pgmap->page_free = NULL;
- pgmap->data = NULL;
mutex_lock(&pgmap_lock);
error = 0;
align_end = align_start + align_size - 1;
foreach_order_pgoff(res, order, pgoff) {
- struct dev_pagemap *dup;
-
- rcu_read_lock();
- dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
- rcu_read_unlock();
- if (dup) {
- dev_err(dev, "%s: %pr collides with mapping for %s\n",
- __func__, res, dev_name(dup->dev));
- error = -EBUSY;
- break;
- }
error = __radix_tree_insert(&pgmap_radix,
- PHYS_PFN(res->start) + pgoff, order, page_map);
+ PHYS_PFN(res->start) + pgoff, order, pgmap);
if (error) {
dev_err(dev, "%s: failed: %d\n", __func__, error);
break;
@@ -427,16 +388,16 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
goto err_pfn_remap;
mem_hotplug_begin();
- error = arch_add_memory(nid, align_start, align_size, false);
+ error = arch_add_memory(nid, align_start, align_size, altmap, false);
if (!error)
move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT);
+ align_size >> PAGE_SHIFT, altmap);
mem_hotplug_done();
if (error)
goto err_add_memory;
- for_each_device_pfn(pfn, page_map) {
+ for_each_device_pfn(pfn, pgmap) {
struct page *page = pfn_to_page(pfn);
/*
@@ -447,19 +408,21 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
*/
list_del(&page->lru);
page->pgmap = pgmap;
- percpu_ref_get(ref);
+ percpu_ref_get(pgmap->ref);
if (!(++i % 1024))
cond_resched();
}
- devres_add(dev, page_map);
+
+ devm_add_action(dev, devm_memremap_pages_release, pgmap);
+
return __va(res->start);
err_add_memory:
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
err_pfn_remap:
err_radix:
- pgmap_radix_release(res);
- devres_free(page_map);
+ pgmap_radix_release(res, pgoff);
+ devres_free(pgmap);
return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
@@ -475,34 +438,39 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
altmap->alloc -= nr_pfns;
}
-struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
+/**
+ * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+ * @pfn: page frame number to lookup page_map
+ * @pgmap: optional known pgmap that already has a reference
+ *
+ * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
+ * is non-NULL but does not cover @pfn the reference to it will be released.
+ */
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+ struct dev_pagemap *pgmap)
{
- /*
- * 'memmap_start' is the virtual address for the first "struct
- * page" in this range of the vmemmap array. In the case of
- * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
- * pointer arithmetic, so we can perform this to_vmem_altmap()
- * conversion without concern for the initialization state of
- * the struct page fields.
- */
- struct page *page = (struct page *) memmap_start;
- struct dev_pagemap *pgmap;
+ resource_size_t phys = PFN_PHYS(pfn);
/*
- * Unconditionally retrieve a dev_pagemap associated with the
- * given physical address, this is only for use in the
- * arch_{add|remove}_memory() for setting up and tearing down
- * the memmap.
+ * In the cached case we're already holding a live reference.
*/
+ if (pgmap) {
+ if (phys >= pgmap->res.start && phys <= pgmap->res.end)
+ return pgmap;
+ put_dev_pagemap(pgmap);
+ }
+
+ /* fall back to slow path lookup */
rcu_read_lock();
- pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page)));
+ pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
+ if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+ pgmap = NULL;
rcu_read_unlock();
- return pgmap ? pgmap->altmap : NULL;
+ return pgmap;
}
#endif /* CONFIG_ZONE_DEVICE */
-
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
void put_zone_device_private_or_public_page(struct page *page)
{
diff --git a/kernel/module.c b/kernel/module.c
index ccdf24c4949e..ad2d420024f6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3804,6 +3804,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_disable_nx(mod);
ddebug_cleanup:
+ ftrace_release_mod(mod);
dynamic_debug_remove(mod, info->debug);
synchronize_sched();
kfree(mod->args);
@@ -3823,12 +3824,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
synchronize_sched();
mutex_unlock(&module_mutex);
free_module:
- /*
- * Ftrace needs to clean up what it initialized.
- * This does nothing if ftrace_module_init() wasn't called,
- * but it must be called outside of module_mutex.
- */
- ftrace_release_mod(mod);
/* Free lock-classes; relies on the preceding sync_rcu() */
lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
diff --git a/kernel/pid.c b/kernel/pid.c
index 5d30c87e3c42..ed6c343fe50d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -343,6 +343,19 @@ struct task_struct *find_task_by_vpid(pid_t vnr)
return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
+struct task_struct *find_get_task_by_vpid(pid_t nr)
+{
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = find_task_by_vpid(nr);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ return task;
+}
+
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5e1d713c8e61..21fec73d45d4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1103,21 +1103,6 @@ int ptrace_request(struct task_struct *child, long request,
return ret;
}
-static struct task_struct *ptrace_get_task_struct(pid_t pid)
-{
- struct task_struct *child;
-
- rcu_read_lock();
- child = find_task_by_vpid(pid);
- if (child)
- get_task_struct(child);
- rcu_read_unlock();
-
- if (!child)
- return ERR_PTR(-ESRCH);
- return child;
-}
-
#ifndef arch_ptrace_attach
#define arch_ptrace_attach(child) do { } while (0)
#endif
@@ -1135,9 +1120,9 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out;
}
- child = ptrace_get_task_struct(pid);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
+ child = find_get_task_by_vpid(pid);
+ if (!child) {
+ ret = -ESRCH;
goto out;
}
@@ -1281,9 +1266,9 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
goto out;
}
- child = ptrace_get_task_struct(pid);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
+ child = find_get_task_by_vpid(pid);
+ if (!child) {
+ ret = -ESRCH;
goto out;
}
diff --git a/kernel/relay.c b/kernel/relay.c
index 41280033a4c5..f7f40a6e6352 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -611,7 +611,6 @@ free_bufs:
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
- kfree(chan);
return NULL;
}
EXPORT_SYMBOL_GPL(relay_open);
diff --git a/kernel/resource.c b/kernel/resource.c
index 54ba6de3757c..e270b5048988 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1022,6 +1022,7 @@ static void __init __reserve_region_with_split(struct resource *root,
struct resource *conflict;
struct resource *res = alloc_resource(GFP_ATOMIC);
struct resource *next_res = NULL;
+ int type = resource_type(root);
if (!res)
return;
@@ -1029,7 +1030,7 @@ static void __init __reserve_region_with_split(struct resource *root,
res->name = name;
res->start = start;
res->end = end;
- res->flags = IORESOURCE_BUSY;
+ res->flags = type | IORESOURCE_BUSY;
res->desc = IORES_DESC_NONE;
while (1) {
@@ -1064,7 +1065,7 @@ static void __init __reserve_region_with_split(struct resource *root,
next_res->name = name;
next_res->start = conflict->end + 1;
next_res->end = end;
- next_res->flags = IORESOURCE_BUSY;
+ next_res->flags = type | IORESOURCE_BUSY;
next_res->desc = IORES_DESC_NONE;
}
} else {
@@ -1478,7 +1479,7 @@ void __devm_release_region(struct device *dev, struct resource *parent,
EXPORT_SYMBOL(__devm_release_region);
/*
- * Called from init/main.c to reserve IO ports.
+ * Reserve I/O ports or memory based on "reserve=" kernel parameter.
*/
#define MAXRESERVE 4
static int __init reserve_setup(char *str)
@@ -1489,26 +1490,38 @@ static int __init reserve_setup(char *str)
for (;;) {
unsigned int io_start, io_num;
int x = reserved;
+ struct resource *parent;
- if (get_option (&str, &io_start) != 2)
+ if (get_option(&str, &io_start) != 2)
break;
- if (get_option (&str, &io_num) == 0)
+ if (get_option(&str, &io_num) == 0)
break;
if (x < MAXRESERVE) {
struct resource *res = reserve + x;
+
+ /*
+ * If the region starts below 0x10000, we assume it's
+ * I/O port space; otherwise assume it's memory.
+ */
+ if (io_start < 0x10000) {
+ res->flags = IORESOURCE_IO;
+ parent = &ioport_resource;
+ } else {
+ res->flags = IORESOURCE_MEM;
+ parent = &iomem_resource;
+ }
res->name = "reserved";
res->start = io_start;
res->end = io_start + io_num - 1;
- res->flags = IORESOURCE_BUSY;
+ res->flags |= IORESOURCE_BUSY;
res->desc = IORES_DESC_NONE;
res->child = NULL;
- if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
+ if (request_resource(parent, res) == 0)
reserved = x+1;
}
}
return 1;
}
-
__setup("reserve=", reserve_setup);
/*
@@ -1563,17 +1576,17 @@ static int strict_iomem_checks;
/*
* check if an address is reserved in the iomem resource tree
- * returns 1 if reserved, 0 if not reserved.
+ * returns true if reserved, false if not reserved.
*/
-int iomem_is_exclusive(u64 addr)
+bool iomem_is_exclusive(u64 addr)
{
struct resource *p = &iomem_resource;
- int err = 0;
+ bool err = false;
loff_t l;
int size = PAGE_SIZE;
if (!strict_iomem_checks)
- return 0;
+ return false;
addr = addr & PAGE_MASK;
@@ -1596,7 +1609,7 @@ int iomem_is_exclusive(u64 addr)
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
- err = 1;
+ err = true;
break;
}
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3da7a2444a91..bf724c1952ea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1630,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
- schedstat_inc(rq->ttwu_local);
- schedstat_inc(p->se.statistics.nr_wakeups_local);
+ __schedstat_inc(rq->ttwu_local);
+ __schedstat_inc(p->se.statistics.nr_wakeups_local);
} else {
struct sched_domain *sd;
- schedstat_inc(p->se.statistics.nr_wakeups_remote);
+ __schedstat_inc(p->se.statistics.nr_wakeups_remote);
rcu_read_lock();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd->ttwu_wake_remote);
+ __schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
@@ -1647,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
if (wake_flags & WF_MIGRATED)
- schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+ __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
- schedstat_inc(rq->ttwu_count);
- schedstat_inc(p->se.statistics.nr_wakeups);
+ __schedstat_inc(rq->ttwu_count);
+ __schedstat_inc(p->se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p->se.statistics.nr_wakeups_sync);
+ __schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -2461,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p)
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
+ p->recent_used_cpu = task_cpu(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
@@ -2698,23 +2699,27 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
- /*
- * The membarrier system call requires a full memory barrier
- * after storing to rq->curr, before going back to user-space.
- *
- * TODO: This smp_mb__after_unlock_lock can go away if PPC end
- * up adding a full barrier to switch_mm(), or we should figure
- * out if a smp_mb__after_unlock_lock is really the proper API
- * to use.
- */
- smp_mb__after_unlock_lock();
finish_task(prev);
finish_lock_switch(rq);
finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
- if (mm)
+ /*
+ * When switching through a kernel thread, the loop in
+ * membarrier_{private,global}_expedited() may have observed that
+ * kernel thread and not issued an IPI. It is therefore possible to
+ * schedule between user->kernel->user threads without passing though
+ * switch_mm(). Membarrier requires a barrier after storing to
+ * rq->curr, before returning to userspace, so provide them here:
+ *
+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+ * provided by mmdrop(),
+ * - a sync_core for SYNC_CORE.
+ */
+ if (mm) {
+ membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
+ }
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -2818,6 +2823,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
arch_start_context_switch(prev);
+ /*
+ * If mm is non-NULL, we pass through switch_mm(). If mm is
+ * NULL, we will pass through mmdrop() in finish_task_switch().
+ * Both of these contain the full memory barrier required by
+ * membarrier after storing to rq->curr, before returning to
+ * user-space.
+ */
if (!mm) {
next->active_mm = oldmm;
mmgrab(oldmm);
@@ -3354,6 +3366,9 @@ static void __sched notrace __schedule(bool preempt)
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
+ *
+ * The membarrier system call requires a full memory barrier
+ * after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@@ -3401,17 +3416,16 @@ static void __sched notrace __schedule(bool preempt)
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
- * rq->curr, before returning to user-space. For TSO
- * (e.g. x86), the architecture must provide its own
- * barrier in switch_mm(). For weakly ordered machines
- * for which spin_unlock() acts as a full memory
- * barrier, finish_lock_switch() in common code takes
- * care of this barrier. For weakly ordered machines for
- * which spin_unlock() acts as a RELEASE barrier (only
- * arm64 and PowerPC), arm64 has a full barrier in
- * switch_to(), and PowerPC has
- * smp_mb__after_unlock_lock() before
- * finish_lock_switch().
+ * rq->curr, before returning to user-space.
+ *
+ * Here are the schemes providing that barrier on the
+ * various architectures:
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+ * - finish_lock_switch() for weakly-ordered
+ * architectures where spin_unlock is a full barrier,
+ * - switch_to() for arm64 (weakly-ordered, spin_unlock
+ * is a RELEASE barrier),
*/
++*switch_count;
@@ -4853,7 +4867,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- size_t retlen = min_t(size_t, len, cpumask_size());
+ unsigned int retlen = min(len, cpumask_size());
if (copy_to_user(user_mask_ptr, mask, retlen))
ret = -EFAULT;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7b6535987500..5eb3ffc9be84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
likely(wait_start > prev_wait_start))
wait_start -= prev_wait_start;
- schedstat_set(se->statistics.wait_start, wait_start);
+ __schedstat_set(se->statistics.wait_start, wait_start);
}
static inline void
@@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
- schedstat_set(se->statistics.wait_start, delta);
+ __schedstat_set(se->statistics.wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}
- schedstat_set(se->statistics.wait_max,
+ __schedstat_set(se->statistics.wait_max,
max(schedstat_val(se->statistics.wait_max), delta));
- schedstat_inc(se->statistics.wait_count);
- schedstat_add(se->statistics.wait_sum, delta);
- schedstat_set(se->statistics.wait_start, 0);
+ __schedstat_inc(se->statistics.wait_count);
+ __schedstat_add(se->statistics.wait_sum, delta);
+ __schedstat_set(se->statistics.wait_start, 0);
}
static inline void
@@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
- schedstat_set(se->statistics.sleep_max, delta);
+ __schedstat_set(se->statistics.sleep_max, delta);
- schedstat_set(se->statistics.sleep_start, 0);
- schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ __schedstat_set(se->statistics.sleep_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
@@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.block_max)))
- schedstat_set(se->statistics.block_max, delta);
+ __schedstat_set(se->statistics.block_max, delta);
- schedstat_set(se->statistics.block_start, 0);
- schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ __schedstat_set(se->statistics.block_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
if (tsk->in_iowait) {
- schedstat_add(se->statistics.iowait_sum, delta);
- schedstat_inc(se->statistics.iowait_count);
+ __schedstat_add(se->statistics.iowait_sum, delta);
+ __schedstat_inc(se->statistics.iowait_count);
trace_sched_stat_iowait(tsk, delta);
}
@@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- schedstat_set(se->statistics.sleep_start,
+ __schedstat_set(se->statistics.sleep_start,
rq_clock(rq_of(cfs_rq)));
if (tsk->state & TASK_UNINTERRUPTIBLE)
- schedstat_set(se->statistics.block_start,
+ __schedstat_set(se->statistics.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
@@ -5692,27 +5692,31 @@ static int wake_wide(struct task_struct *p)
* scheduling latency of the CPUs. This seems to work
* for the overloaded case.
*/
-
-static bool
-wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int prev_cpu, int sync)
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
/*
* If this_cpu is idle, it implies the wakeup is from interrupt
* context. Only allow the move if cache is shared. Otherwise an
* interrupt intensive workload could force all tasks onto one
* node depending on the IO topology or IRQ affinity settings.
+ *
+ * If the prev_cpu is idle and cache affine then avoid a migration.
+ * There is no guarantee that the cache hot data from an interrupt
+ * is more important than cache hot data on the prev_cpu and from
+ * a cpufreq perspective, it's better to have higher utilisation
+ * on one CPU.
*/
if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
- return true;
+ return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
- return true;
+ return this_cpu;
- return false;
+ return nr_cpumask_bits;
}
-static bool
+static int
wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
@@ -5726,7 +5730,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
unsigned long current_load = task_h_load(current);
if (current_load > this_eff_load)
- return true;
+ return this_cpu;
this_eff_load -= current_load;
}
@@ -5743,28 +5747,28 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
- return this_eff_load <= prev_eff_load;
+ return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine = false;
+ int target = nr_cpumask_bits;
- if (sched_feat(WA_IDLE) && !affine)
- affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_IDLE))
+ target = wake_affine_idle(this_cpu, prev_cpu, sync);
- if (sched_feat(WA_WEIGHT) && !affine)
- affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+ if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
+ target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
- if (affine) {
- schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
- }
+ if (target == nr_cpumask_bits)
+ return prev_cpu;
- return affine;
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ return target;
}
static inline unsigned long task_util(struct task_struct *p);
@@ -6193,7 +6197,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
- int i;
+ int i, recent_used_cpu;
if (idle_cpu(target))
return target;
@@ -6204,6 +6208,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
return prev;
+ /* Check a recently used CPU as a potential idle candidate */
+ recent_used_cpu = p->recent_used_cpu;
+ if (recent_used_cpu != prev &&
+ recent_used_cpu != target &&
+ cpus_share_cache(recent_used_cpu, target) &&
+ idle_cpu(recent_used_cpu) &&
+ cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+ /*
+ * Replace recent_used_cpu with prev as it is a potential
+ * candidate for the next wake.
+ */
+ p->recent_used_cpu = prev;
+ return recent_used_cpu;
+ }
+
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -6357,8 +6376,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (cpu == prev_cpu)
goto pick_cpu;
- if (wake_affine(affine_sd, p, prev_cpu, sync))
- new_cpu = cpu;
+ new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
}
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6372,9 +6390,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (!sd) {
pick_cpu:
- if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+ if (want_affine)
+ current->recent_used_cpu = cpu;
+ }
} else {
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 9bcbacba82a8..5d0762633639 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -26,24 +26,110 @@
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
* except MEMBARRIER_CMD_QUERY.
*/
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
+#endif
+
#define MEMBARRIER_CMD_BITMASK \
- (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
- | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
+ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
}
-static int membarrier_private_expedited(void)
+static int membarrier_global_expedited(void)
{
int cpu;
bool fallback = false;
cpumask_var_t tmpmask;
- if (!(atomic_read(&current->mm->membarrier_state)
- & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
- return -EPERM;
+ if (num_online_cpus() == 1)
+ return 0;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+ return 0;
+}
+
+static int membarrier_private_expedited(int flags)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ if (!(atomic_read(&current->mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+ return -EPERM;
+ } else {
+ if (!(atomic_read(&current->mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+ return -EPERM;
+ }
if (num_online_cpus() == 1)
return 0;
@@ -105,21 +191,69 @@ static int membarrier_private_expedited(void)
return 0;
}
-static void membarrier_register_private_expedited(void)
+static int membarrier_register_global_expedited(void)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ if (atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
+ return 0;
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+ if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
+ /*
+ * For single mm user, single threaded process, we can
+ * simply issue a memory barrier after setting
+ * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
+ * no memory access following registration is reordered
+ * before registration.
+ */
+ smp_mb();
+ } else {
+ /*
+ * For multi-mm user threads, we need to ensure all
+ * future scheduler executions will observe the new
+ * thread flag state for this mm.
+ */
+ synchronize_sched();
+ }
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ &mm->membarrier_state);
+ return 0;
+}
+
+static int membarrier_register_private_expedited(int flags)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ }
+
/*
* We need to consider threads belonging to different thread
* groups, which use the same mm. (CLONE_VM but not
* CLONE_THREAD).
*/
- if (atomic_read(&mm->membarrier_state)
- & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
- return;
- atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
- &mm->membarrier_state);
+ if (atomic_read(&mm->membarrier_state) & state)
+ return 0;
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+ atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
+ &mm->membarrier_state);
+ if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
+ /*
+ * Ensure all future scheduler executions will observe the
+ * new thread flag state for this process.
+ */
+ synchronize_sched();
+ }
+ atomic_or(state, &mm->membarrier_state);
+ return 0;
}
/**
@@ -159,21 +293,28 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
int cmd_mask = MEMBARRIER_CMD_BITMASK;
if (tick_nohz_full_enabled())
- cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+ cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
return cmd_mask;
}
- case MEMBARRIER_CMD_SHARED:
- /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ case MEMBARRIER_CMD_GLOBAL:
+ /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
if (tick_nohz_full_enabled())
return -EINVAL;
if (num_online_cpus() > 1)
synchronize_sched();
return 0;
+ case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ return membarrier_global_expedited();
+ case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- return membarrier_private_expedited();
+ return membarrier_private_expedited(0);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
- membarrier_register_private_expedited();
- return 0;
+ return membarrier_register_private_expedited(0);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
default:
return -EINVAL;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 862a513adca3..663b2355a3aa 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
+ u64 now = rq_clock_task(rq);
u64 delta_exec;
if (curr->sched_class != &rt_sched_class)
return;
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ delta_exec = now - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
@@ -968,7 +969,7 @@ static void update_curr_rt(struct rq *rq)
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq_clock_task(rq);
+ curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -1907,9 +1908,8 @@ static void push_rt_tasks(struct rq *rq)
* the rt_loop_next will cause the iterator to perform another scan.
*
*/
-static int rto_next_cpu(struct rq *rq)
+static int rto_next_cpu(struct root_domain *rd)
{
- struct root_domain *rd = rq->rd;
int next;
int cpu;
@@ -1985,19 +1985,24 @@ static void tell_cpu_to_push(struct rq *rq)
* Otherwise it is finishing up and an ipi needs to be sent.
*/
if (rq->rd->rto_cpu < 0)
- cpu = rto_next_cpu(rq);
+ cpu = rto_next_cpu(rq->rd);
raw_spin_unlock(&rq->rd->rto_lock);
rto_start_unlock(&rq->rd->rto_loop_start);
- if (cpu >= 0)
+ if (cpu >= 0) {
+ /* Make sure the rd does not get freed while pushing */
+ sched_get_rd(rq->rd);
irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ }
}
/* Called from hardirq context */
void rto_push_irq_work_func(struct irq_work *work)
{
+ struct root_domain *rd =
+ container_of(work, struct root_domain, rto_push_work);
struct rq *rq;
int cpu;
@@ -2013,18 +2018,20 @@ void rto_push_irq_work_func(struct irq_work *work)
raw_spin_unlock(&rq->lock);
}
- raw_spin_lock(&rq->rd->rto_lock);
+ raw_spin_lock(&rd->rto_lock);
/* Pass the IPI to the next rt overloaded queue */
- cpu = rto_next_cpu(rq);
+ cpu = rto_next_cpu(rd);
- raw_spin_unlock(&rq->rd->rto_lock);
+ raw_spin_unlock(&rd->rto_lock);
- if (cpu < 0)
+ if (cpu < 0) {
+ sched_put_rd(rd);
return;
+ }
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2e95505e23c6..fb5fc458547f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -691,6 +691,8 @@ extern struct mutex sched_domains_mutex;
extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+extern void sched_get_rd(struct root_domain *rd);
+extern void sched_put_rd(struct root_domain *rd);
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index baf500d12b7c..8e7b58de61e7 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -31,8 +31,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
rq->rq_sched_info.run_delay += delta;
}
#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+#define __schedstat_inc(var) do { var++; } while (0)
#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
+#define __schedstat_add(var, amt) do { var += (amt); } while (0)
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val) do { var = (val); } while (0)
#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
@@ -48,8 +51,11 @@ static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
#define schedstat_enabled() 0
+#define __schedstat_inc(var) do { } while (0)
#define schedstat_inc(var) do { } while (0)
+#define __schedstat_add(var, amt) do { } while (0)
#define schedstat_add(var, amt) do { } while (0)
+#define __schedstat_set(var, val) do { } while (0)
#define schedstat_set(var, val) do { } while (0)
#define schedstat_val(var) 0
#define schedstat_val_or_zero(var) 0
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 034cbed7f88b..519b024f4e94 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -259,6 +259,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
+void sched_get_rd(struct root_domain *rd)
+{
+ atomic_inc(&rd->refcount);
+}
+
+void sched_put_rd(struct root_domain *rd)
+{
+ if (!atomic_dec_and_test(&rd->refcount))
+ return;
+
+ call_rcu_sched(&rd->rcu, free_rootdomain);
+}
+
static int init_rootdomain(struct root_domain *rd)
{
if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2fb4e27c636a..f98f28c12020 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -218,6 +218,8 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
static int proc_dostring_coredump(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses it's own private copy */
@@ -1812,8 +1814,7 @@ static struct ctl_table fs_table[] = {
.data = &pipe_max_size,
.maxlen = sizeof(pipe_max_size),
.mode = 0644,
- .proc_handler = &pipe_proc_fn,
- .extra1 = &pipe_min_size,
+ .proc_handler = proc_dopipe_max_size,
},
{
.procname = "pipe-user-pages-hard",
@@ -2615,29 +2616,17 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
do_proc_douintvec_minmax_conv, &param);
}
-struct do_proc_dopipe_max_size_conv_param {
- unsigned int *min;
-};
-
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
unsigned int *valp,
int write, void *data)
{
- struct do_proc_dopipe_max_size_conv_param *param = data;
-
if (write) {
unsigned int val;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
-
val = round_pipe_size(*lvalp);
if (val == 0)
return -EINVAL;
- if (param->min && *param->min > val)
- return -ERANGE;
-
*valp = val;
} else {
unsigned int val = *valp;
@@ -2647,14 +2636,11 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
return 0;
}
-int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- struct do_proc_dopipe_max_size_conv_param param = {
- .min = (unsigned int *) table->extra1,
- };
return do_proc_douintvec(table, write, buffer, lenp, ppos,
- do_proc_dopipe_max_size_conv, &param);
+ do_proc_dopipe_max_size_conv, NULL);
}
static void validate_coredump_safety(void)
@@ -3160,12 +3146,6 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
-int proc_dopipe_max_size(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- return -ENOSYS;
-}
-
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -3209,7 +3189,6 @@ EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
-EXPORT_SYMBOL_GPL(proc_dopipe_max_size);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4559e914452b..4e62a4a8fa91 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -194,11 +194,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
{
struct task_struct *tsk;
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk)
- get_task_struct(tsk);
- rcu_read_unlock();
+ tsk = find_get_task_by_vpid(pid);
if (!tsk)
return -ESRCH;
fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ae0c8a411fe7..23788100e214 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -37,7 +37,6 @@
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
-#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
#include <linux/seq_file.h>