-rw-r--r--  fs/exec.c                 4
-rw-r--r--  include/linux/mm.h       25
-rw-r--r--  include/linux/mm_types.h 43
-rw-r--r--  include/linux/sched.h     5
-rw-r--r--  init/Kconfig              4
-rw-r--r--  kernel/fork.c             8
-rw-r--r--  kernel/sched/core.c      51
-rw-r--r--  kernel/sched/sched.h     58
-rw-r--r--  kernel/signal.c           2
9 files changed, 198 insertions, 2 deletions
diff --git a/fs/exec.c b/fs/exec.c
index ab913243a367..58f16312b983 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1010,6 +1010,7 @@ static int exec_mmap(struct mm_struct *mm)
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
+ mm_init_cid(mm);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
@@ -1822,6 +1823,7 @@ static int bprm_execve(struct linux_binprm *bprm,
*/
check_unsafe_exec(bprm);
current->in_execve = 1;
+ sched_mm_cid_before_execve(current);
file = do_open_execat(fd, filename, flags);
retval = PTR_ERR(file);
@@ -1852,6 +1854,7 @@ static int bprm_execve(struct linux_binprm *bprm,
if (retval < 0)
goto out;
+ sched_mm_cid_after_execve(current);
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
@@ -1871,6 +1874,7 @@ out:
force_fatal_sig(SIGSEGV);
out_unmark:
+ sched_mm_cid_after_execve(current);
current->fs->in_exec = 0;
current->in_execve = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f3f196e4d66d..cf008c26a883 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1976,6 +1976,31 @@ struct zap_details {
/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit_signals(struct task_struct *t);
+static inline int task_mm_cid(struct task_struct *t)
+{
+ return t->mm_cid;
+}
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline int task_mm_cid(struct task_struct *t)
+{
+ /*
+ * Use the processor id as a fall-back when the mm cid feature is
+ * disabled. This provides functional per-cpu data structure accesses
+ * in user-space, although it won't provide the memory usage benefits.
+ */
+ return raw_smp_processor_id();
+}
+#endif
+
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
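
The task_mm_cid() fallback above mirrors what user-space can do on kernels
without this feature: index per-"CPU" data by the CPU number instead of a
concurrency ID, which keeps accesses functional at the cost of the memory
usage benefits. A minimal user-space sketch of that pattern follows; it is
an illustration only, and my_slot_id() is a hypothetical accessor, since the
ABI that exposes the concurrency ID to user-space is not part of this diff.

/*
 * Hypothetical user-space analog of the task_mm_cid() fallback: when no
 * concurrency ID is available, fall back to the CPU number, exactly like
 * the !CONFIG_SCHED_MM_CID stub above.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define MAX_SLOTS 1024

static atomic_long counters[MAX_SLOTS];	/* one counter per slot */

static int my_slot_id(void)
{
	/* Assumption: no concurrency-ID ABI here, so use the CPU number. */
	int cpu = sched_getcpu();

	return cpu < 0 ? 0 : cpu % MAX_SLOTS;
}

int main(void)
{
	int slot = my_slot_id();

	/* Per-slot update: mostly contention-free while slot IDs are stable. */
	atomic_fetch_add(&counters[slot], 1);
	printf("updated slot %d, value %ld\n", slot,
	       atomic_load(&counters[slot]));
	return 0;
}
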
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3b8475007734..1c3bf76063d2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -645,7 +645,18 @@ struct mm_struct {
* &struct mm_struct is freed.
*/
atomic_t mm_count;
-
+#ifdef CONFIG_SCHED_MM_CID
+ /**
+ * @cid_lock: Protect cid bitmap updates vs lookups.
+ *
+ * Prevent situations where updates to the cid bitmap happen
+ * concurrently with lookups. Those can lead to situations
+ * where a lookup cannot find a free bit simply because it was
+ * unlucky enough to load, non-atomically, bitmap words as they
+ * were being concurrently updated by the updaters.
+ */
+ raw_spinlock_t cid_lock;
+#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* PTE page table pages */
#endif
@@ -909,6 +920,36 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
vmi->mas.node = MAS_START;
}
+#ifdef CONFIG_SCHED_MM_CID
+/* Accessor for struct mm_struct's cidmask. */
+static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+{
+ unsigned long cid_bitmap = (unsigned long)mm;
+
+ cid_bitmap += offsetof(struct mm_struct, cpu_bitmap);
+ /* Skip cpu_bitmap */
+ cid_bitmap += cpumask_size();
+ return (struct cpumask *)cid_bitmap;
+}
+
+static inline void mm_init_cid(struct mm_struct *mm)
+{
+ raw_spin_lock_init(&mm->cid_lock);
+ cpumask_clear(mm_cidmask(mm));
+}
+
+static inline unsigned int mm_cid_size(void)
+{
+ return cpumask_size();
+}
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_init_cid(struct mm_struct *mm) { }
+static inline unsigned int mm_cid_size(void)
+{
+ return 0;
+}
+#endif /* CONFIG_SCHED_MM_CID */
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
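
The cid mask does not get its own allocation: it lives in the same tail
storage as cpu_bitmap, and mm_cidmask() recovers its address by adding
offsetof(struct mm_struct, cpu_bitmap) plus cpumask_size(), while
mm_cache_init() in kernel/fork.c below grows the slab object by
mm_cid_size() to make room. The following standalone user-space sketch of
that trailing-bitmap layout uses hypothetical names throughout; it only
illustrates the pointer arithmetic.

/*
 * Standalone sketch of the "two bitmaps in one tail allocation" layout
 * used by mm_cidmask() above. struct demo, demo_cpumask() and
 * demo_cidmask() are made-up names; the arithmetic mirrors
 * offsetof(struct mm_struct, cpu_bitmap) + cpumask_size().
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS		64
#define MASK_BYTES	(NR_CPUS / 8)	/* stand-in for cpumask_size() */

struct demo {
	int some_field;
	/* Tail storage: cpu bitmap followed by cid bitmap. */
	unsigned long cpu_bitmap[];
};

static unsigned char *demo_cpumask(struct demo *d)
{
	return (unsigned char *)d + offsetof(struct demo, cpu_bitmap);
}

static unsigned char *demo_cidmask(struct demo *d)
{
	/* Skip the cpu bitmap, exactly like mm_cidmask(). */
	return demo_cpumask(d) + MASK_BYTES;
}

int main(void)
{
	/* Mirror of mm_cache_init(): base size plus both tail bitmaps. */
	size_t size = sizeof(struct demo) + 2 * MASK_BYTES;
	struct demo *d = calloc(1, size);

	if (!d)
		return 1;
	memset(demo_cidmask(d), 0, MASK_BYTES);	/* mm_init_cid() analog */
	printf("cid mask lives %zu bytes into the object\n",
	       (size_t)(demo_cidmask(d) - (unsigned char *)d));
	free(d);
	return 0;
}
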
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e0bc020a63a9..4df2b3e76b30 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1311,6 +1311,11 @@ struct task_struct {
unsigned long rseq_event_mask;
#endif
+#ifdef CONFIG_SCHED_MM_CID
+ int mm_cid; /* Current cid in mm */
+ int mm_cid_active; /* Whether cid bitmap is active */
+#endif
+
struct tlbflush_unmap_batch tlb_ubc;
union {
diff --git a/init/Kconfig b/init/Kconfig
index 7e5c3ddc341d..1ce960aa453e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1041,6 +1041,10 @@ config RT_GROUP_SCHED
endif #CGROUP_SCHED
+config SCHED_MM_CID
+ def_bool y
+ depends on SMP && RSEQ
+
config UCLAMP_TASK_GROUP
bool "Utilization clamping per group of tasks"
depends on CGROUP_SCHED
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..82b2b5846aae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1060,6 +1060,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->reported_split_lock = 0;
#endif
+#ifdef CONFIG_SCHED_MM_CID
+ tsk->mm_cid = -1;
+ tsk->mm_cid_active = 0;
+#endif
return tsk;
free_stack:
@@ -1169,6 +1173,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
+ mm_init_cid(mm);
return mm;
fail_pcpu:
@@ -1601,6 +1606,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
tsk->mm = mm;
tsk->active_mm = mm;
+ sched_mm_cid_fork(tsk);
return 0;
}
@@ -3034,7 +3040,7 @@ void __init mm_cache_init(void)
* dynamically sized based on the maximum CPU number this system
* can have, taking hotplug into account (nr_cpu_ids).
*/
- mm_size = sizeof(struct mm_struct) + cpumask_size();
+ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
mm_cachep = kmem_cache_create_usercopy("mm_struct",
mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25b582b6ee5f..75830b7dee8f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5052,6 +5052,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
+ switch_mm_cid(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -11305,3 +11306,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
trace_sched_update_nr_running_tp(rq, count);
}
+
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm);
+
+ local_irq_save(flags);
+ t->mm_cid = mm_cid_get(mm);
+ t->mm_cid_active = 1;
+ local_irq_restore(flags);
+ rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm || t->mm_cid != -1);
+ t->mm_cid_active = 1;
+}
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3d6e819127c..c2d7467fdde1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3269,4 +3269,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
cgroup_account_cputime(curr, delta_exec);
}
+#ifdef CONFIG_SCHED_MM_CID
+static inline int __mm_cid_get(struct mm_struct *mm)
+{
+ struct cpumask *cpumask;
+ int cid;
+
+ cpumask = mm_cidmask(mm);
+ cid = cpumask_first_zero(cpumask);
+ if (cid >= nr_cpu_ids)
+ return -1;
+ __cpumask_set_cpu(cid, cpumask);
+ return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm, int cid)
+{
+ lockdep_assert_irqs_disabled();
+ if (cid < 0)
+ return;
+ raw_spin_lock(&mm->cid_lock);
+ __cpumask_clear_cpu(cid, mm_cidmask(mm));
+ raw_spin_unlock(&mm->cid_lock);
+}
+
+static inline int mm_cid_get(struct mm_struct *mm)
+{
+ int ret;
+
+ lockdep_assert_irqs_disabled();
+ raw_spin_lock(&mm->cid_lock);
+ ret = __mm_cid_get(mm);
+ raw_spin_unlock(&mm->cid_lock);
+ return ret;
+}
+
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+{
+ if (prev->mm_cid_active) {
+ if (next->mm_cid_active && next->mm == prev->mm) {
+ /*
+ * Context switch between threads in same mm, hand over
+ * the mm_cid from prev to next.
+ */
+ next->mm_cid = prev->mm_cid;
+ prev->mm_cid = -1;
+ return;
+ }
+ mm_cid_put(prev->mm, prev->mm_cid);
+ prev->mm_cid = -1;
+ }
+ if (next->mm_cid_active)
+ next->mm_cid = mm_cid_get(next->mm);
+}
+
+#else
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+#endif
+
#endif /* _KERNEL_SCHED_SCHED_H */
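
The helpers above form a small ID allocator: __mm_cid_get() hands out the
lowest free bit in the per-mm mask, mm_cid_put() returns it, and
switch_mm_cid() transfers the ID directly from prev to next when both tasks
share an mm, so the common same-process context switch touches no bitmap and
freed IDs are reused from the low end. The single-threaded user-space
simulation below walks through that protocol; all names are hypothetical and
the locking/irq handling is deliberately omitted since nothing here runs
concurrently.

/*
 * Simulation of the cid protocol, with the per-mm bitmap replaced by a
 * plain array of flags. sim_cid_get(), sim_cid_put() and
 * sim_switch_same_mm() loosely follow __mm_cid_get(), mm_cid_put() and
 * the same-mm branch of switch_mm_cid().
 */
#include <stdio.h>

#define NR_IDS	8

static int used[NR_IDS];		/* stand-in for mm_cidmask() */

static int sim_cid_get(void)
{
	int cid;

	for (cid = 0; cid < NR_IDS; cid++) {	/* cpumask_first_zero() */
		if (!used[cid]) {
			used[cid] = 1;		/* __cpumask_set_cpu() */
			return cid;
		}
	}
	return -1;
}

static void sim_cid_put(int cid)
{
	if (cid < 0)
		return;
	used[cid] = 0;				/* __cpumask_clear_cpu() */
}

/* Same-mm branch of switch_mm_cid(): hand the ID over, no bitmap traffic. */
static void sim_switch_same_mm(int *prev_cid, int *next_cid)
{
	*next_cid = *prev_cid;
	*prev_cid = -1;
}

int main(void)
{
	int a = sim_cid_get();	/* thread A runs: gets cid 0 */
	int b = sim_cid_get();	/* thread B runs concurrently: gets cid 1 */

	printf("A=%d B=%d\n", a, b);

	/* A is preempted by sibling thread C: C inherits A's cid. */
	int c = -1;
	sim_switch_same_mm(&a, &c);
	printf("after switch: A=%d C=%d\n", a, c);

	/* B blocks, then runs again: its cid returns to and leaves the pool. */
	sim_cid_put(b);
	b = sim_cid_get();
	printf("B re-runs and gets cid %d\n", b);
	return 0;
}
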
diff --git a/kernel/signal.c b/kernel/signal.c
index ae26da61c4d9..8cb28f1df294 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2951,6 +2951,7 @@ void exit_signals(struct task_struct *tsk)
cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
@@ -2961,6 +2962,7 @@ void exit_signals(struct task_struct *tsk)
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
+ sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);