-rw-r--r--  include/linux/rseq.h        |   8
-rw-r--r--  include/linux/rseq_types.h  |  18
-rw-r--r--  kernel/sched/core.c         |   2
-rw-r--r--  kernel/sched/sched.h        | 150
4 files changed, 168 insertions(+), 10 deletions(-)
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index bf8a6bf315f3..4c0e8bdd2dd9 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -73,13 +73,13 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t)
}
/*
- * Invoked from __set_task_cpu() when a task migrates to enforce an IDs
- * update.
+ * Invoked from __set_task_cpu() when a task migrates or from
+ * mm_cid_schedin() when the CID changes to enforce an IDs update.
*
* This does not raise TIF_NOTIFY_RESUME as that happens in
* rseq_sched_switch_event().
*/
-static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
+static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
t->rseq.event.ids_changed = true;
}
@@ -168,7 +168,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
-static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
+static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 87854effe1ad..66b1482e1146 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -119,23 +119,31 @@ struct mm_cid_pcpu {
/**
* struct mm_mm_cid - Storage for per MM CID data
* @pcpu: Per CPU storage for CIDs associated to a CPU
+ * @percpu: Set when CIDs are in per CPU mode
+ * @transit: Set to MM_CID_TRANSIT during a mode change transition phase
* @max_cids: The exclusive maximum CID value for allocation and convergence
+ * @lock: Spinlock to protect all fields except @pcpu. It also protects
+ * the MM cid cpumask and the MM cidmask bitmap.
+ * @mutex: Mutex to serialize forks and exits related to this mm
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
* is growth only.
* @users: The number of tasks sharing this MM. Separate from mm::mm_users
* as that is modified by mmget()/mm_put() by other entities which
* do not actually share the MM.
- * @lock: Spinlock to protect all fields except @pcpu. It also protects
- * the MM cid cpumask and the MM cidmask bitmap.
- * @mutex: Mutex to serialize forks and exits related to this mm
*/
struct mm_mm_cid {
+ /* Hotpath read mostly members */
struct mm_cid_pcpu __percpu *pcpu;
+ unsigned int percpu;
+ unsigned int transit;
unsigned int max_cids;
- unsigned int nr_cpus_allowed;
- unsigned int users;
+
raw_spinlock_t lock;
struct mutex mutex;
+
+ /* Infrequently modified members */
+ unsigned int nr_cpus_allowed;
+ unsigned int users;
} ____cacheline_aligned_in_smp;
#else /* CONFIG_SCHED_MM_CID */
struct mm_mm_cid { };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 55bb9c9ae32c..659ae56b459f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10495,6 +10495,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
mm->mm_cid.max_cids = 0;
+ mm->mm_cid.percpu = 0;
+ mm->mm_cid.transit = 0;
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
mm->mm_cid.users = 0;
raw_spin_lock_init(&mm->mm_cid.lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4b49284504fb..82c7978d548e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2209,7 +2209,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
smp_wmb();
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
- rseq_sched_set_task_cpu(p, cpu);
+ rseq_sched_set_ids_changed(p);
#endif /* CONFIG_SMP */
}
@@ -3598,6 +3598,153 @@ static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_c
mm_drop_cid(mm, pcp->cid);
}
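+
+/*
+ * Try to allocate a CID below @max_cids from the MM cidmask bitmap.
+ * Returns MM_CID_UNSET when no CID is free in that range or when the
+ * selected bit was set concurrently.
+ */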
+static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
+{
+ unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids);
+
+ if (cid >= max_cids)
+ return MM_CID_UNSET;
+ if (test_and_set_bit(cid, mm_cidmask(mm)))
+ return MM_CID_UNSET;
+ return cid;
+}
+
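+/*
+ * Allocate a CID for the MM. Prefer the optimal space below
+ * mm_cid.max_cids and fall back to the full space bounded by
+ * num_possible_cpus(), retrying until a CID becomes available.
+ */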
+static inline unsigned int mm_get_cid(struct mm_struct *mm)
+{
+ unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids));
+
+ while (cid == MM_CID_UNSET) {
+ cpu_relax();
+ cid = __mm_get_cid(mm, num_possible_cpus());
+ }
+ return cid;
+}
+
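+/*
+ * Try to move @orig_cid into the optimal CID space below @max_cids. On
+ * success the original CID is dropped and its ONCPU mode bit is preserved
+ * in the replacement, otherwise @orig_cid is returned unmodified.
+ */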
+static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid,
+ unsigned int max_cids)
+{
+ unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid);
+
+ /* Is it in the optimal CID space? */
+ if (likely(cid < max_cids))
+ return orig_cid;
+
+ /* Try to find one in the optimal space. Otherwise keep the provided. */
+ new_cid = __mm_get_cid(mm, max_cids);
+ if (new_cid != MM_CID_UNSET) {
+ mm_drop_cid(mm, cid);
+ /* Preserve the ONCPU mode of the original CID */
+ return new_cid | (orig_cid & MM_CID_ONCPU);
+ }
+ return orig_cid;
+}
+
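+/* Update the task local CID copy and flag an rseq IDs update on change */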
+static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid)
+{
+ if (t->mm_cid.cid != cid) {
+ t->mm_cid.cid = cid;
+ rseq_sched_set_ids_changed(t);
+ }
+}
+
+static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid)
+{
+ __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
+}
+
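+/*
+ * CID selection for per CPU ownership mode: prefer the CID owned by the
+ * CPU and try to converge it into the optimal space. A CID still owned by
+ * the task is either handed over to the CPU or dropped. If neither owns a
+ * CID, a new one is allocated.
+ */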
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
+{
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case where both have the ONCPU bit set */
+ if (likely(cid_on_cpu(cpu_cid & tcid))) {
+ if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) {
+ mm_cid_update_task_cid(t, cpu_cid);
+ return;
+ }
+ /* Try to converge into the optimal CID space */
+ cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids);
+ } else {
+ /* Hand over or drop the task owned CID */
+ if (cid_on_task(tcid)) {
+ if (cid_on_cpu(cpu_cid))
+ mm_unset_cid_on_task(t);
+ else
+ cpu_cid = cid_to_cpu_cid(tcid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_cpu(cpu_cid))
+ cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
+ }
+ mm_cid_update_pcpu_cid(mm, cpu_cid);
+ mm_cid_update_task_cid(t, cpu_cid);
+}
+
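+/*
+ * CID selection for per task ownership mode: prefer the CID owned by the
+ * task and try to converge it into the optimal space. A CID still owned
+ * by the CPU is either handed over to the task or dropped. If neither
+ * owns a CID, a new one is allocated. CIDs selected on the slow path are
+ * tagged with the transit flag while a mode change is in progress.
+ */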
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
+{
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case, where both have the ONCPU bit clear */
+ if (likely(cid_on_task(tcid | cpu_cid))) {
+ if (likely(tcid < max_cids)) {
+ mm_cid_update_pcpu_cid(mm, tcid);
+ return;
+ }
+ /* Try to converge into the optimal CID space */
+ tcid = mm_cid_converge(mm, tcid, max_cids);
+ } else {
+ /* Hand over or drop the CPU owned CID */
+ if (cid_on_cpu(cpu_cid)) {
+ if (cid_on_task(tcid))
+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+ else
+ tcid = cpu_cid_to_cid(cpu_cid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_task(tcid))
+ tcid = mm_get_cid(mm);
+ /* Set the transition mode flag if required */
+ tcid |= READ_ONCE(mm->mm_cid.transit);
+ }
+ mm_cid_update_pcpu_cid(mm, tcid);
+ mm_cid_update_task_cid(t, tcid);
+}
+
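+/*
+ * Invoked when @next is scheduled in. Selects the CID according to the
+ * current ownership mode of the MM, unless CID handling is inactive for
+ * the task.
+ */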
+static __always_inline void mm_cid_schedin(struct task_struct *next)
+{
+ struct mm_struct *mm = next->mm;
+ unsigned int cpu_cid;
+
+ if (!next->mm_cid.active)
+ return;
+
+ cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
+ if (likely(!READ_ONCE(mm->mm_cid.percpu)))
+ mm_cid_from_task(next, cpu_cid);
+ else
+ mm_cid_from_cpu(next, cpu_cid);
+}
+
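+/* Invoked when @prev is scheduled out to drop transit mode CIDs */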
+static __always_inline void mm_cid_schedout(struct task_struct *prev)
+{
+ /* During mode transitions CIDs are temporary and need to be dropped */
+ if (likely(!cid_in_transit(prev->mm_cid.cid)))
+ return;
+
+ mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
+ prev->mm_cid.cid = MM_CID_UNSET;
+}
+
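+/* Invoked on context switch to update the CID state of @prev and @next */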
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
+{
+ mm_cid_schedout(prev);
+ mm_cid_schedin(next);
+}
+
/* Active implementation */
static inline void init_sched_mm_cid(struct task_struct *t)
{
@@ -3675,6 +3822,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n
#else /* !CONFIG_SCHED_MM_CID: */
static inline void mm_cid_select(struct task_struct *t) { }
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);