Applies to 23-rt1 + Steve's latest push_rt patch Changes since v3: 1) Rebased to Steve's latest 2) Added a "highest_prio" feature to eliminate a race w.r.t. activating a task and the time it takes to actually reschedule the RQ. 3) Dropped the PI patch, because the highest_prio patch obsoletes it. 4) Few small tweaks 5) Few small fixes Regards, -Greg -
From: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
kernel/sched.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++---
kernel/sched_rt.c | 44 +++++++++++++++++
2 files changed, 178 insertions(+), 7 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 3e75c62..0dabf89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -304,6 +304,7 @@ struct rq {
#ifdef CONFIG_PREEMPT_RT
unsigned long rt_nr_running;
unsigned long rt_nr_uninterruptible;
+ int curr_prio;
#endif
unsigned long switch_timestamp;
@@ -1484,6 +1485,123 @@ next_in_queue:
static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+/* Only try this algorithm three times */
+#define RT_PUSH_MAX_TRIES 3
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
+ struct task_struct *task,
+ struct rq *this_rq)
+{
+ struct rq *lowest_rq = NULL;
+ int dst_cpu = -1;
+ int cpu;
+ int tries;
+
+ for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
+ /*
+ * Scan each rq for the lowest prio.
+ */
+ for_each_cpu_mask(cpu, *cpu_mask) {
+ struct rq *rq = &per_cpu(runqueues, cpu);
+
+ if (cpu == smp_processor_id())
+ continue;
+
+ /* We look for lowest RT prio or non-rt CPU */
+ if (rq->curr_prio >= MAX_RT_PRIO) {
+ lowest_rq = rq;
+ dst_cpu = cpu;
+ break;
+ }
+
+ /* no locking for now */
+ if (rq->curr_prio > task->prio &&
+ (!lowest_rq || rq->curr_prio < lowest_rq->curr_prio)) {
+ lowest_rq = rq;
+ dst_cpu = cpu;
+ }
+ }
+
+ if (!lowest_rq)
+ break;
+
+ /* if the prio of this runqueue changed, try again */
+ if (double_lock_balance(this_rq, lowest_rq)) {
+ /*
+ * We had to unlock the run queue. In
+ * the mean time, task could have
+ * migrated already or had its affinity changed.
+ */
+ if (unlikely(task_rq(task) != this_rq ||
+ !cpu_isset(dst_cpu, ...The system currently evaluates all online CPUs whenever one or more enters
an rt_overload condition. This suffers from scalability limitations as
the # of online CPUs increases. So we introduce a cpumask to track
exactly which CPUs need RT balancing.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Peter W. Morreale <pmorreale@novell.com>
---
kernel/sched.c | 12 +++++++++---
1 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 0dabf89..0da8c30 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -632,6 +632,7 @@ static inline struct rq *this_rq_lock(void)
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
static __cacheline_aligned_in_smp atomic_t rt_overload;
+static cpumask_t rto_cpus;
#endif
static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
@@ -640,8 +641,11 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
if (rt_task(p)) {
rq->rt_nr_running++;
# ifdef CONFIG_SMP
- if (rq->rt_nr_running == 2)
+ if (rq->rt_nr_running == 2) {
+ cpu_set(rq->cpu, rto_cpus);
+ smp_wmb();
atomic_inc(&rt_overload);
+ }
# endif
}
#endif
@@ -654,8 +658,10 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
WARN_ON(!rq->rt_nr_running);
rq->rt_nr_running--;
# ifdef CONFIG_SMP
- if (rq->rt_nr_running == 1)
+ if (rq->rt_nr_running == 1) {
atomic_dec(&rt_overload);
+ cpu_clear(rq->cpu, rto_cpus);
+ }
# endif
}
#endif
@@ -1621,7 +1627,7 @@ static void balance_rt_tasks(struct rq *this_rq, int this_cpu)
*/
next = pick_next_task(this_rq, this_rq->curr);
- for_each_online_cpu(cpu) {
+ for_each_cpu_mask(cpu, rto_cpus) {
if (cpu == this_cpu)
continue;
src_rq = cpu_rq(cpu);
-
A little cleanup to avoid #ifdef proliferation later in the series
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 16 +++++++++++++---
1 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 0da8c30..131f618 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -365,6 +365,16 @@ struct rq {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static DEFINE_MUTEX(sched_hotcpu_mutex);
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+static inline void set_rq_prio(struct rq *rq, int prio)
+{
+ rq->curr_prio = prio;
+}
+
+#else
+#define set_rq_prio(rq, prio) do { } while(0)
+#endif
+
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
rq->curr->sched_class->check_preempt_curr(rq, p);
@@ -2328,9 +2338,9 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
*/
prev_state = prev->state;
_finish_arch_switch(prev);
-#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
- rq->curr_prio = current->prio;
-#endif
+
+ set_rq_prio(rq, current->prio);
+
finish_lock_switch(rq, prev);
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
/*
-
We should init the base value of the current RQ priority to "IDLE" Signed-off-by: Gregory Haskins <ghaskins@novell.com> --- kernel/sched.c | 2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 131f618..d68f600 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7385,6 +7385,8 @@ void __init sched_init(void) highest_cpu = i; /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); + + set_rq_prio(rq, MAX_PRIO); } set_load_weight(&init_task); -
This is an implementation of Steve's idea where we should update the RQ
concept of priority to show the highest-task, even if that task is not (yet)
running. This prevents us from pushing multiple tasks to the RQ before it
gets a chance to reschedule.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 37 ++++++++++++++++++++++++++++---------
1 files changed, 28 insertions(+), 9 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index d68f600..67034aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -304,7 +304,7 @@ struct rq {
#ifdef CONFIG_PREEMPT_RT
unsigned long rt_nr_running;
unsigned long rt_nr_uninterruptible;
- int curr_prio;
+ int highest_prio;
#endif
unsigned long switch_timestamp;
@@ -368,11 +368,23 @@ static DEFINE_MUTEX(sched_hotcpu_mutex);
#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
static inline void set_rq_prio(struct rq *rq, int prio)
{
- rq->curr_prio = prio;
+ rq->highest_prio = prio;
+}
+
+static inline void update_rq_prio(struct rq *rq)
+{
+ struct rt_prio_array *array = &rq->rt.active;
+ int prio = MAX_PRIO;
+
+ if (rq->nr_running)
+ prio = sched_find_first_bit(array->bitmap);
+
+ set_rq_prio(rq, prio);
}
#else
#define set_rq_prio(rq, prio) do { } while(0)
+#define update_rq_prio(rq) do { } while(0)
#endif
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
@@ -1023,12 +1035,14 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, wakeup);
p->se.on_rq = 1;
+ update_rq_prio(rq);
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
p->sched_class->dequeue_task(rq, p, sleep);
p->se.on_rq = 0;
+ update_rq_prio(rq);
}
/*
@@ -1525,15 +1539,15 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
continue;
/* We look for lowest RT prio or non-rt CPU */
- if (rq->curr_prio >= ...There are three events that require consideration for redistributing RT
tasks:
1) When one or more higher-priority tasks preempts a lower-one from a
RQ
2) When a lower-priority task is woken up on a RQ
3) When a RQ downgrades its current priority
Steve Rostedt's push_rt patch addresses (1). It hooks in right after
a new task has been switched-in. If this was the result of an RT
preemption, or if more than one task was awoken at the same time, we
can try to push some of those other tasks away.
This patch addresses (2). When we wake up a task, we check to see
if it would preempt the current task on the queue. If it will not, we
attempt to find a better suited CPU (e.g. one running something lower
priority than the task being woken) and try to activate the task there.
Finally, we have (3). In theory, we only need to balance_rt_tasks() if
the following conditions are met:
1) One or more CPUs are in overload, AND
2) We are about to switch to a task that lowers our priority.
(3) will be addressed in a later patch.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 88 ++++++++++++++++++++++++++------------------------------
1 files changed, 41 insertions(+), 47 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index d604484..0ee1e21 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1628,6 +1628,13 @@ out:
return ret;
}
+/* Push all tasks that we can to other CPUs */
+static void push_rt_tasks(struct rq *this_rq)
+{
+ while (push_rt_task(this_rq))
+ ;
+}
+
/*
* Pull RT tasks from other CPUs in the RT-overload
* case. Interrupts are disabled, local rq is locked.
@@ -1988,6 +1995,25 @@ out_set_cpu:
this_cpu = smp_processor_id();
cpu = task_cpu(p);
}
+
+#if defined(CONFIG_PREEMPT_RT)
+ /*
+ * If a newly woken up RT task cannot preempt the
+ * current (RT) task (on a target runqueue) then try
+ * to find another CPU it can preempt:
+ */
+ if (rt_task(p) && !TASK_PREEMPTS_CURR(p, ...Get rid of the superfluous dst_cpu, and move the cpu_mask inside the search
function.
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 18 +++++++-----------
1 files changed, 7 insertions(+), 11 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 67034aa..d604484 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1519,20 +1519,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
#define RT_PUSH_MAX_TRIES 3
/* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
- struct task_struct *task,
+static struct rq *find_lock_lowest_rq(struct task_struct *task,
struct rq *this_rq)
{
struct rq *lowest_rq = NULL;
- int dst_cpu = -1;
int cpu;
int tries;
+ cpumask_t cpu_mask;
+
+ cpus_and(cpu_mask, cpu_online_map, task->cpus_allowed);
for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
/*
* Scan each rq for the lowest prio.
*/
- for_each_cpu_mask(cpu, *cpu_mask) {
+ for_each_cpu_mask(cpu, cpu_mask) {
struct rq *rq = &per_cpu(runqueues, cpu);
if (cpu == smp_processor_id())
@@ -1541,7 +1542,6 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
/* We look for lowest RT prio or non-rt CPU */
if (rq->highest_prio >= MAX_RT_PRIO) {
lowest_rq = rq;
- dst_cpu = cpu;
break;
}
@@ -1549,7 +1549,6 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
if (rq->highest_prio > task->prio &&
(!lowest_rq || rq->highest_prio < lowest_rq->highest_prio)) {
lowest_rq = rq;
- dst_cpu = cpu;
}
}
@@ -1564,7 +1563,7 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
* migrated already or had its affinity changed.
*/
if (unlikely(task_rq(task) != this_rq ||
- !cpu_isset(dst_cpu, task->cpus_allowed))) {
+ !cpu_isset(lowest_rq->cpu, task->cpus_allowed))) {
spin_unlock(&lowest_rq->lock);
lowest_rq = ...From: Steven Rostedt <rostedt@goodmis.org>
Steve found these errors in the original patch
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
kernel/sched.c | 2 +-
kernel/sched_rt.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 0ee1e21..8c916de 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1536,7 +1536,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
for_each_cpu_mask(cpu, cpu_mask) {
struct rq *rq = &per_cpu(runqueues, cpu);
- if (cpu == smp_processor_id())
+ if (cpu == this_rq->cpu)
continue;
/* We look for lowest RT prio or non-rt CPU */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8d59e62..04959fe 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -115,7 +115,7 @@ static struct task_struct *rt_next_highest_task(struct rq *rq)
queue = array->queue + idx;
next = list_entry(queue->next, struct task_struct, run_list);
- if (unlikely(next != current))
+ if (unlikely(next != rq->curr))
return next;
if (queue->next->next != queue) {
-
We can avoid dirtying a rq related cacheline with a simple check, so why not. Signed-off-by: Gregory Haskins <ghaskins@novell.com> --- 0 files changed, 0 insertions(+), 0 deletions(-) -
-- But it is here. Gregory is a Zen master, and this patch does exactly what he wanted it to do. -- Steve -
