Adds an additional function call to the sched_setscheduler to update the
waiter position of a task if it happens to be waiting on a futex. This
ensures that the kernel level waiter ordering is correctly maintained
based on the changed priority of the task.
I fixed the locking issue noticed by Thomas Gleixner.
This doesn't address userspace at all, only the kernel level wakeups and
kernel level ordering.
The additional locking added to the futex_wait function has no visible speed
impact, and only effects waiters which actual enter the kernel.
Signed-off-by: Daniel Walker <dwalker@mvista.com>
---
include/linux/sched.h | 4 ++++
kernel/futex.c | 41 +++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 1 +
3 files changed, 46 insertions(+)
Index: linux-2.6.25/include/linux/sched.h
===================================================================
--- linux-2.6.25.orig/include/linux/sched.h
+++ linux-2.6.25/include/linux/sched.h
@@ -1027,6 +1027,7 @@ struct sched_rt_entity {
enum lock_waiter_type {
MUTEX_WAITER = 1,
RT_MUTEX_WAITER,
+ FUTEX_WAITER
};
struct lock_waiter_state {
@@ -1034,6 +1035,7 @@ struct lock_waiter_state {
union {
struct mutex_waiter *mutex_blocked_on;
struct rt_mutex_waiter *rt_blocked_on;
+ union futex_key *futex_blocked_on;
};
};
@@ -1675,6 +1677,8 @@ static inline int rt_mutex_getprio(struc
# define rt_mutex_adjust_pi(p) do { } while (0)
#endif
+extern void futex_adjust_waiters(struct task_struct *p);
+
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
extern int task_nice(const struct task_struct *p);
Index: linux-2.6.25/kernel/futex.c
===================================================================
--- linux-2.6.25.orig/kernel/futex.c
+++ linux-2.6.25/kernel/futex.c
@@ -327,6 +327,38 @@ static int get_futex_value_locked(u32 *d
return ret ? -EFAULT : 0;
}
+void futex_adjust_waiters(struct task_struct *p)
+{
+
+ if (p->blocked_on) {
+ struct futex_hash_bucket *hb;
+ struct futex_q *q, *next;
+ union futex_key key;
+
+ spin_lock_irq(&p->pi_lock);
+ if (p->blocked_on && p->blocked_on->lock_type == FUTEX_WAITER) {
+ key = *p->blocked_on->futex_blocked_on;
+ spin_unlock_irq(&p->pi_lock);
+ } else {
+ spin_unlock_irq(&p->pi_lock);
+ return;
+ }
+
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+ plist_for_each_entry_safe(q, next, &hb->chain, list) {
+ if (match_futex(&q->key, &key) && q->task == p) {
+ int prio = min(p->normal_prio, MAX_RT_PRIO);
+ plist_del(&q->list, &hb->chain);
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ break;
+ }
+ }
+ spin_unlock(&hb->lock);
+ }
+}
+
/*
* Fault handling.
* if fshared is non NULL, current->mm->mmap_sem is already held
@@ -1159,6 +1191,8 @@ static int futex_wait(u32 __user *uaddr,
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb;
struct futex_q q;
+ struct lock_waiter_state blocked_on = {
+ .lock_type = FUTEX_WAITER, { .futex_blocked_on = &q.key } };
u32 uval;
int ret;
struct hrtimer_sleeper t;
@@ -1176,6 +1210,8 @@ static int futex_wait(u32 __user *uaddr,
if (unlikely(ret != 0))
goto out_release_sem;
+ set_blocked_on(current, &blocked_on);
+
hb = queue_lock(&q);
/*
@@ -1203,6 +1239,8 @@ static int futex_wait(u32 __user *uaddr,
if (unlikely(ret)) {
queue_unlock(&q, hb);
+ set_blocked_on(current, NULL);
+
/*
* If we would have faulted, release mmap_sem, fault it in and
* start all over again.
@@ -1276,6 +1314,8 @@ static int futex_wait(u32 __user *uaddr,
}
__set_current_state(TASK_RUNNING);
+ set_blocked_on(current, NULL);
+
/*
* NOTE: we don't remove ourselves from the waitqueue because
* we are the only user of it.
@@ -1310,6 +1350,7 @@ static int futex_wait(u32 __user *uaddr,
out_unlock_release_sem:
queue_unlock(&q, hb);
+ set_blocked_on(current, NULL);
out_release_sem:
futex_unlock_mm(fshared);
Index: linux-2.6.25/kernel/sched.c
===================================================================
--- linux-2.6.25.orig/kernel/sched.c
+++ linux-2.6.25/kernel/sched.c
@@ -5209,6 +5209,7 @@ recheck:
spin_unlock_irqrestore(&p->pi_lock, flags);
rt_mutex_adjust_pi(p);
+ futex_adjust_waiters(p);
return 0;
}
--
--
| Ian Campbell | Re: [PATCH] x86: Construct 32 bit boot time page tables in native format. |
| Greg Kroah-Hartman | [PATCH 001/196] Chinese: Add the known_regression URI to the HOWTO |
| Justin Piszcz | Linux Software RAID 5 Performance Optimizations: 2.6.19.1: (211MB/s read & 195... |
| Alan | Re: [RFC] Heads up on sys_fallocate() |
| Matthias Scheler | Re: HEADS UP: timecounters (branch simonb-timecounters) merged into -current |
| David Laight | long usernames |
| Quentin Garnier | Re: Understanding foo_open, foo_read, etc. |
| Jared D. McNeill | Breaking binary compatibility for /dev/joy |
git: | |
| Jarek Poplawski | [PATCH] pkt_sched: Destroy gen estimators under rtnl_lock(). |
| Gerrit Renker | [PATCH 0/37] dccp: Feature negotiation - last call for comments |
| David Miller | [GIT]: Networking |
| Natalie Protasevich | [BUG] New Kernel Bugs |
