Re: [PATCH 9/9] RT: Only dirty a cacheline if the priority is actually changing

Previous thread: [RFC][PATCH] Fix hang in posix_locks_deadlock() by George G. Davis on Wednesday, October 17, 2007 - 11:51 am. (35 messages)

Next thread: [PATCH] RT: Only dirty a cacheline if the priority is actually changing by Gregory Haskins on Wednesday, October 17, 2007 - 11:54 am. (1 message)
From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:50 am

Applies to 23-rt1 + Steve's latest push_rt patch

Changes since v3:

1) Rebased to Steve's latest
2) Added a "highest_prio" feature to eliminate a race w.r.t. activating a task
   and the time it takes to actually reschedule the RQ.
3) Dropped the PI patch, because the highest_prio patch obsoletes it.
4) Few small tweaks
5) Few small fixes

Regards,
-Greg
-

From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:50 am

From: Steven Rostedt <rostedt@goodmis.org>

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---

 kernel/sched.c    |  141 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched_rt.c |   44 +++++++++++++++++
 2 files changed, 178 insertions(+), 7 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 3e75c62..0dabf89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -304,6 +304,7 @@ struct rq {
 #ifdef CONFIG_PREEMPT_RT
 	unsigned long rt_nr_running;
 	unsigned long rt_nr_uninterruptible;
+	int curr_prio;
 #endif
 
 	unsigned long switch_timestamp;
@@ -1484,6 +1485,123 @@ next_in_queue:
 
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 
+/* Only try this algorithm three times */
+#define RT_PUSH_MAX_TRIES 3
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
+				      struct task_struct *task,
+				      struct rq *this_rq)
+{
+	struct rq *lowest_rq = NULL;
+	int dst_cpu = -1;
+	int cpu;
+	int tries;
+
+	for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
+		/*
+		 * Scan each rq for the lowest prio.
+		 */
+		for_each_cpu_mask(cpu, *cpu_mask) {
+			struct rq *rq = &per_cpu(runqueues, cpu);
+
+			if (cpu == smp_processor_id())
+				continue;
+
+			/* We look for lowest RT prio or non-rt CPU */
+			if (rq->curr_prio >= MAX_RT_PRIO) {
+				lowest_rq = rq;
+				dst_cpu = cpu;
+				break;
+			}
+
+			/* no locking for now */
+			if (rq->curr_prio > task->prio &&
+			    (!lowest_rq || rq->curr_prio < lowest_rq->curr_prio)) {
+				lowest_rq = rq;
+				dst_cpu = cpu;
+			}
+		}
+
+		if (!lowest_rq)
+			break;
+
+		/* if the prio of this runqueue changed, try again */
+		if (double_lock_balance(this_rq, lowest_rq)) {
+			/*
+			 * We had to unlock the run queue. In
+			 * the mean time, task could have
+			 * migrated already or had its affinity changed.
+			 */
+			if (unlikely(task_rq(task) != this_rq ||
+				     !cpu_isset(dst_cpu, ...
From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:50 am

The system currently evaluates all online CPUs whenever one or more enters
an rt_overload condition.  This suffers from scalability limitations as
the # of online CPUs increases.  So we introduce a cpumask to track
exactly which CPUs need RT balancing.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Peter W. Morreale <pmorreale@novell.com>
---

 kernel/sched.c |   12 +++++++++---
 1 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0dabf89..0da8c30 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -632,6 +632,7 @@ static inline struct rq *this_rq_lock(void)
 
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 static __cacheline_aligned_in_smp atomic_t rt_overload;
+static cpumask_t rto_cpus;
 #endif
 
 static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
@@ -640,8 +641,11 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
 	if (rt_task(p)) {
 		rq->rt_nr_running++;
 # ifdef CONFIG_SMP
-		if (rq->rt_nr_running == 2)
+		if (rq->rt_nr_running == 2) {
+			cpu_set(rq->cpu, rto_cpus);
+			smp_wmb();
 			atomic_inc(&rt_overload);
+		}
 # endif
 	}
 #endif
@@ -654,8 +658,10 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
 		WARN_ON(!rq->rt_nr_running);
 		rq->rt_nr_running--;
 # ifdef CONFIG_SMP
-		if (rq->rt_nr_running == 1)
+		if (rq->rt_nr_running == 1) {
 			atomic_dec(&rt_overload);
+			cpu_clear(rq->cpu, rto_cpus);
+		}
 # endif
 	}
 #endif
@@ -1621,7 +1627,7 @@ static void balance_rt_tasks(struct rq *this_rq, int this_cpu)
 	 */
 	next = pick_next_task(this_rq, this_rq->curr);
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu_mask(cpu, rto_cpus) {
 		if (cpu == this_cpu)
 			continue;
 		src_rq = cpu_rq(cpu);

-

From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:50 am

A little cleanup to avoid #ifdef proliferation later in the series

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |   16 +++++++++++++---
 1 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0da8c30..131f618 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -365,6 +365,16 @@ struct rq {
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 static DEFINE_MUTEX(sched_hotcpu_mutex);
 
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+static inline void set_rq_prio(struct rq *rq, int prio)
+{
+	rq->curr_prio = prio;
+}
+
+#else
+#define set_rq_prio(rq, prio) do { } while(0)
+#endif
+
 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
 {
 	rq->curr->sched_class->check_preempt_curr(rq, p);
@@ -2328,9 +2338,9 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	_finish_arch_switch(prev);
-#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
-	rq->curr_prio = current->prio;
-#endif
+
+	set_rq_prio(rq, current->prio);
+
 	finish_lock_switch(rq, prev);
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 	/*

-

From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:50 am

We should init the base value of the current RQ priority to "IDLE"

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 131f618..d68f600 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7385,6 +7385,8 @@ void __init sched_init(void)
 		highest_cpu = i;
 		/* delimiter for bitsearch: */
 		__set_bit(MAX_RT_PRIO, array->bitmap);
+
+		set_rq_prio(rq, MAX_PRIO);
 	}
 
 	set_load_weight(&init_task);

-

From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:50 am

This is an implementation of Steve's idea where we should update the RQ
concept of priority to show the highest-task, even if that task is not (yet)
running.  This prevents us from pushing multiple tasks to the RQ before it
gets a chance to reschedule.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |   37 ++++++++++++++++++++++++++++---------
 1 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index d68f600..67034aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -304,7 +304,7 @@ struct rq {
 #ifdef CONFIG_PREEMPT_RT
 	unsigned long rt_nr_running;
 	unsigned long rt_nr_uninterruptible;
-	int curr_prio;
+	int highest_prio;
 #endif
 
 	unsigned long switch_timestamp;
@@ -368,11 +368,23 @@ static DEFINE_MUTEX(sched_hotcpu_mutex);
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 static inline void set_rq_prio(struct rq *rq, int prio)
 {
-	rq->curr_prio = prio;
+	rq->highest_prio = prio;
+}
+
+static inline void update_rq_prio(struct rq *rq)
+{
+	struct rt_prio_array *array = &rq->rt.active;
+	int prio = MAX_PRIO;
+
+	if (rq->nr_running)
+		prio = sched_find_first_bit(array->bitmap);
+
+	set_rq_prio(rq, prio);
 }
 
 #else
 #define set_rq_prio(rq, prio) do { } while(0)
+#define update_rq_prio(rq)    do { } while(0)
 #endif
 
 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
@@ -1023,12 +1035,14 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, wakeup);
 	p->se.on_rq = 1;
+	update_rq_prio(rq);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
+	update_rq_prio(rq);
 }
 
 /*
@@ -1525,15 +1539,15 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
 				continue;
 
 			/* We look for lowest RT prio or non-rt CPU */
-			if (rq->curr_prio >= ...
From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:51 am

There are three events that require consideration for redistributing RT
tasks:

1) When one or more higher-priority tasks preempts a lower-one from a
   RQ
2) When a lower-priority task is woken up on a RQ
3) When a RQ downgrades its current priority

Steve Rostedt's push_rt patch addresses (1).  It hooks in right after
a new task has been switched-in.  If this was the result of an RT
preemption, or if more than one task was awoken at the same time, we
can try to push some of those other tasks away.

This patch addresses (2).  When we wake up a task, we check to see
if it would preempt the current task on the queue.  If it will not, we
attempt to find a better suited CPU (e.g. one running something lower
priority than the task being woken) and try to activate the task there.

Finally, we have (3).  In theory, we only need to balance_rt_tasks() if
the following conditions are met:
   1) One or more CPUs are in overload, AND
   2) We are about to switch to a task that lowers our priority.

(3) will be addressed in a later patch.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |   88 ++++++++++++++++++++++++++------------------------------
 1 files changed, 41 insertions(+), 47 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index d604484..0ee1e21 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1628,6 +1628,13 @@ out:
 	return ret;
 }
 
+/* Push all tasks that we can to other CPUs */
+static void push_rt_tasks(struct rq *this_rq)
+{
+	while (push_rt_task(this_rq))
+		;
+}
+
 /*
  * Pull RT tasks from other CPUs in the RT-overload
  * case. Interrupts are disabled, local rq is locked.
@@ -1988,6 +1995,25 @@ out_set_cpu:
 		this_cpu = smp_processor_id();
 		cpu = task_cpu(p);
 	}
+	
+#if defined(CONFIG_PREEMPT_RT)
+	/*
+	 * If a newly woken up RT task cannot preempt the
+	 * current (RT) task (on a target runqueue) then try
+	 * to find another CPU it can preempt:
+	 */
+	if (rt_task(p) && !TASK_PREEMPTS_CURR(p, ...
From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:50 am

Get rid of the superfluous dst_cpu, and move the cpu_mask inside the search
function.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |   18 +++++++-----------
 1 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 67034aa..d604484 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1519,20 +1519,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 #define RT_PUSH_MAX_TRIES 3
 
 /* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
-				      struct task_struct *task,
+static struct rq *find_lock_lowest_rq(struct task_struct *task,
 				      struct rq *this_rq)
 {
 	struct rq *lowest_rq = NULL;
-	int dst_cpu = -1;
 	int cpu;
 	int tries;
+	cpumask_t cpu_mask;
+
+	cpus_and(cpu_mask, cpu_online_map, task->cpus_allowed);
 
 	for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
 		/*
 		 * Scan each rq for the lowest prio.
 		 */
-		for_each_cpu_mask(cpu, *cpu_mask) {
+		for_each_cpu_mask(cpu, cpu_mask) {
 			struct rq *rq = &per_cpu(runqueues, cpu);
 
 			if (cpu == smp_processor_id())
@@ -1541,7 +1542,6 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
 			/* We look for lowest RT prio or non-rt CPU */
 			if (rq->highest_prio >= MAX_RT_PRIO) {
 				lowest_rq = rq;
-				dst_cpu = cpu;
 				break;
 			}
 
@@ -1549,7 +1549,6 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
 			if (rq->highest_prio > task->prio &&
 			    (!lowest_rq || rq->highest_prio < lowest_rq->highest_prio)) {
 				lowest_rq = rq;
-				dst_cpu = cpu;
 			}
 		}
 
@@ -1564,7 +1563,7 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
 			 * migrated already or had its affinity changed.
 			 */
 			if (unlikely(task_rq(task) != this_rq ||
-				     !cpu_isset(dst_cpu, task->cpus_allowed))) {
+				     !cpu_isset(lowest_rq->cpu, task->cpus_allowed))) {
 				spin_unlock(&lowest_rq->lock);
 				lowest_rq = ...
From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:51 am

From: Steven Rostedt <rostedt@goodmis.org>

Steve found these errors in the original patch

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c    |    2 +-
 kernel/sched_rt.c |    2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0ee1e21..8c916de 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1536,7 +1536,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
 		for_each_cpu_mask(cpu, cpu_mask) {
 			struct rq *rq = &per_cpu(runqueues, cpu);
 
-			if (cpu == smp_processor_id())
+			if (cpu == this_rq->cpu)
 				continue;
 
 			/* We look for lowest RT prio or non-rt CPU */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8d59e62..04959fe 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -115,7 +115,7 @@ static struct task_struct *rt_next_highest_task(struct rq *rq)
 
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
-	if (unlikely(next != current))
+	if (unlikely(next != rq->curr))
 		return next;
 
 	if (queue->next->next != queue) {

-

From: Gregory Haskins
Date: Wednesday, October 17, 2007 - 11:51 am

We can avoid dirtying a rq related cacheline with a simple check, so why not.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 0 files changed, 0 insertions(+), 0 deletions(-)


-

From: Roel Kluin
Date: Friday, October 19, 2007 - 7:48 pm

I think you wanted a patch here?
-

From: Steven Rostedt
Date: Saturday, October 20, 2007 - 12:54 am

--

But it is here.  Gregory is a Zen master, and this patch does exactly what
he wanted it to do.

-- Steve

-

From: Gregory Haskins
Date: Saturday, October 20, 2007 - 3:44 am

[Empty message]
Previous thread: [RFC][PATCH] Fix hang in posix_locks_deadlock() by George G. Davis on Wednesday, October 17, 2007 - 11:51 am. (35 messages)

Next thread: [PATCH] RT: Only dirty a cacheline if the priority is actually changing by Gregory Haskins on Wednesday, October 17, 2007 - 11:54 am. (1 message)