Re: [Announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]

Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
From: William Lee Irwin III
Date: Tuesday, April 17, 2007 - 4:31 am

On Tue, Apr 17, 2007 at 02:57:49AM -0700, William Lee Irwin III wrote:

Con helped me dredge up the vdls bits, so here is the last version I
before I got tired of toying with the idea. It's not all that clean,
with a fair amount of debug code floating around and a number of
idiocies (it seems there was a plot to use a heap somewhere I forgot
about entirely, never mind other cruft), but I thought I should at least
say something more provable than "there was a patch I never posted."

Enjoy!


-- wli

diff -prauN linux-2.6.0-test11/fs/proc/array.c sched-2.6.0-test11-5/fs/proc/array.c
--- linux-2.6.0-test11/fs/proc/array.c	2003-11-26 12:44:26.000000000 -0800
+++ sched-2.6.0-test11-5/fs/proc/array.c	2003-12-17 07:37:11.000000000 -0800
@@ -162,7 +162,7 @@ static inline char * task_state(struct t
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
-		(p->sleep_avg/1024)*100/(1000000000/1024),
+		0UL, /* was ->sleep_avg */
 	       	p->tgid,
 		p->pid, p->pid ? p->real_parent->pid : 0,
 		p->pid && p->ptrace ? p->parent->pid : 0,
@@ -345,7 +345,7 @@ int proc_pid_stat(struct task_struct *ta
 	read_unlock(&tasklist_lock);
 	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %d %d\n",
 		task->pid,
 		task->comm,
 		state,
@@ -390,8 +390,8 @@ int proc_pid_stat(struct task_struct *ta
 		task->cnswap,
 		task->exit_signal,
 		task_cpu(task),
-		task->rt_priority,
-		task->policy);
+		task_prio(task),
+		task_sched_policy(task));
 	if(mm)
 		mmput(mm);
 	return res;
diff -prauN linux-2.6.0-test11/include/asm-i386/thread_info.h sched-2.6.0-test11-5/include/asm-i386/thread_info.h
--- linux-2.6.0-test11/include/asm-i386/thread_info.h	2003-11-26 12:43:06.000000000 -0800
+++ sched-2.6.0-test11-5/include/asm-i386/thread_info.h	2003-12-17 04:55:22.000000000 -0800
@@ -114,6 +114,8 @@ static inline struct thread_info *curren
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_QUEUED		17
+#define TIF_PREEMPT		18
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
diff -prauN linux-2.6.0-test11/include/linux/binomial.h sched-2.6.0-test11-5/include/linux/binomial.h
--- linux-2.6.0-test11/include/linux/binomial.h	1969-12-31 16:00:00.000000000 -0800
+++ sched-2.6.0-test11-5/include/linux/binomial.h	2003-12-20 15:53:33.000000000 -0800
@@ -0,0 +1,16 @@
+/*
+ * Simple binomial heaps.
+ */
+
+struct binomial {
+	unsigned priority, degree;
+	struct binomial *parent, *child, *sibling;
+};
+
+
+struct binomial *binomial_minimum(struct binomial **);
+void binomial_union(struct binomial **, struct binomial **, struct binomial **);
+void binomial_insert(struct binomial **, struct binomial *);
+struct binomial *binomial_extract_min(struct binomial **);
+void binomial_decrease(struct binomial **, struct binomial *, unsigned);
+void binomial_delete(struct binomial **, struct binomial *);
diff -prauN linux-2.6.0-test11/include/linux/init_task.h sched-2.6.0-test11-5/include/linux/init_task.h
--- linux-2.6.0-test11/include/linux/init_task.h	2003-11-26 12:42:58.000000000 -0800
+++ sched-2.6.0-test11-5/include/linux/init_task.h	2003-12-18 05:51:16.000000000 -0800
@@ -56,6 +56,12 @@
 	.siglock	= SPIN_LOCK_UNLOCKED, 		\
 }
 
+#define INIT_SCHED_INFO(info)					\
+{								\
+	.run_list	= LIST_HEAD_INIT((info).run_list),	\
+	.policy		= 1 /* SCHED_POLICY_TS */,		\
+}
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -67,14 +73,10 @@
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= 0,						\
 	.lock_depth	= -1,						\
-	.prio		= MAX_PRIO-20,					\
-	.static_prio	= MAX_PRIO-20,					\
-	.policy		= SCHED_NORMAL,					\
+	.sched_info	= INIT_SCHED_INFO(tsk.sched_info),		\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
-	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
-	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
diff -prauN linux-2.6.0-test11/include/linux/sched.h sched-2.6.0-test11-5/include/linux/sched.h
--- linux-2.6.0-test11/include/linux/sched.h	2003-11-26 12:42:58.000000000 -0800
+++ sched-2.6.0-test11-5/include/linux/sched.h	2003-12-23 03:47:45.000000000 -0800
@@ -126,6 +126,8 @@ extern unsigned long nr_iowait(void);
 #define SCHED_NORMAL		0
 #define SCHED_FIFO		1
 #define SCHED_RR		2
+#define SCHED_BATCH		3
+#define SCHED_IDLE		4
 
 struct sched_param {
 	int sched_priority;
@@ -281,10 +283,14 @@ struct signal_struct {
 
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
-
-#define MAX_PRIO		(MAX_RT_PRIO + 40)
-
-#define rt_task(p)		((p)->prio < MAX_RT_PRIO)
+#define NICE_QLEN		128
+#define MIN_TS_PRIO		MAX_RT_PRIO
+#define MAX_TS_PRIO		(40*NICE_QLEN)
+#define MIN_BATCH_PRIO		(MAX_RT_PRIO + MAX_TS_PRIO)
+#define MAX_BATCH_PRIO		100
+#define MAX_PRIO		(MIN_BATCH_PRIO + MAX_BATCH_PRIO)
+#define USER_PRIO(prio)		((prio) - MAX_RT_PRIO)
+#define MAX_USER_PRIO		USER_PRIO(MAX_PRIO)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -330,6 +336,36 @@ struct k_itimer {
 struct io_context;			/* See blkdev.h */
 void exit_io_context(void);
 
+struct rt_data {
+	int prio, rt_policy;
+	unsigned long quantum, ticks; 
+};
+
+/* XXX: do %cpu estimation for ts wakeup levels */
+struct ts_data {
+	int nice;
+	unsigned long ticks, frac_cpu;
+	unsigned long sample_start, sample_ticks;
+};
+
+struct bt_data {
+	int prio;
+	unsigned long ticks;
+};
+
+union class_data {
+	struct rt_data rt;
+	struct ts_data ts;
+	struct bt_data bt;
+};
+
+struct sched_info {
+	int idx;			/* queue index, used by all classes */
+	unsigned long policy;		/* scheduling policy */
+	struct list_head run_list;	/* list links for priority queues */
+	union class_data cl_data;	/* class-specific data */
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -339,18 +375,9 @@ struct task_struct {
 
 	int lock_depth;		/* Lock depth */
 
-	int prio, static_prio;
-	struct list_head run_list;
-	prio_array_t *array;
-
-	unsigned long sleep_avg;
-	long interactive_credit;
-	unsigned long long timestamp;
-	int activated;
+	struct sched_info sched_info;
 
-	unsigned long policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice, first_time_slice;
 
 	struct list_head tasks;
 	struct list_head ptrace_children;
@@ -391,7 +418,6 @@ struct task_struct {
 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
-	unsigned long rt_priority;
 	unsigned long it_real_value, it_prof_value, it_virt_value;
 	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
 	struct timer_list real_timer;
@@ -520,12 +546,14 @@ extern void node_nr_running_init(void);
 #define node_nr_running_init() {}
 #endif
 
-extern void set_user_nice(task_t *p, long nice);
-extern int task_prio(task_t *p);
-extern int task_nice(task_t *p);
-extern int task_curr(task_t *p);
-extern int idle_cpu(int cpu);
-
+void set_user_nice(task_t *task, long nice);
+int task_prio(task_t *task);
+int task_nice(task_t *task);
+int task_sched_policy(task_t *task);
+void set_task_sched_policy(task_t *task, int policy);
+int rt_task(task_t *task);
+int task_curr(task_t *task);
+int idle_cpu(int cpu);
 void yield(void);
 
 /*
@@ -844,6 +872,21 @@ static inline int need_resched(void)
 	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
 }
 
+static inline void set_task_queued(task_t *task)
+{
+	set_tsk_thread_flag(task, TIF_QUEUED);
+}
+
+static inline void clear_task_queued(task_t *task)
+{
+	clear_tsk_thread_flag(task, TIF_QUEUED);
+}
+
+static inline int task_queued(task_t *task)
+{
+	return test_tsk_thread_flag(task, TIF_QUEUED);
+}
+
 extern void __cond_resched(void);
 static inline void cond_resched(void)
 {
diff -prauN linux-2.6.0-test11/kernel/Makefile sched-2.6.0-test11-5/kernel/Makefile
--- linux-2.6.0-test11/kernel/Makefile	2003-11-26 12:43:24.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/Makefile	2003-12-17 03:30:08.000000000 -0800
@@ -6,7 +6,7 @@ obj-y     = sched.o fork.o exec_domain.o
 	    exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
-	    rcupdate.o intermodule.o extable.o params.o posix-timers.o
+	    rcupdate.o intermodule.o extable.o params.o posix-timers.o sched/
 
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
diff -prauN linux-2.6.0-test11/kernel/exit.c sched-2.6.0-test11-5/kernel/exit.c
--- linux-2.6.0-test11/kernel/exit.c	2003-11-26 12:45:29.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/exit.c	2003-12-17 07:04:02.000000000 -0800
@@ -225,7 +225,7 @@ void reparent_to_init(void)
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
 
-	if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
+	if (task_nice(current) < 0)
 		set_user_nice(current, 0);
 	/* cpus_allowed? */
 	/* rt_priority? */
diff -prauN linux-2.6.0-test11/kernel/fork.c sched-2.6.0-test11-5/kernel/fork.c
--- linux-2.6.0-test11/kernel/fork.c	2003-11-26 12:42:58.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/fork.c	2003-12-23 06:22:59.000000000 -0800
@@ -836,6 +836,9 @@ struct task_struct *copy_process(unsigne
 	atomic_inc(&p->user->__count);
 	atomic_inc(&p->user->processes);
 
+	clear_tsk_thread_flag(p, TIF_SIGPENDING);
+	clear_tsk_thread_flag(p, TIF_QUEUED);
+
 	/*
 	 * If multiple threads are within copy_process(), then this check
 	 * triggers too late. This doesn't hurt, the check is only there
@@ -861,13 +864,21 @@ struct task_struct *copy_process(unsigne
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
-	if (clone_flags & CLONE_IDLETASK)
+	if (clone_flags & CLONE_IDLETASK) {
 		p->pid = 0;
-	else {
+		set_task_sched_policy(p, SCHED_IDLE);
+	} else {
+		if (task_sched_policy(p) == SCHED_IDLE) {
+			memset(&p->sched_info, 0, sizeof(struct sched_info));
+			set_task_sched_policy(p, SCHED_NORMAL);
+			set_user_nice(p, 0);
+		}
 		p->pid = alloc_pidmap();
 		if (p->pid == -1)
 			goto bad_fork_cleanup;
 	}
+	if (p->pid == 1)
+		BUG_ON(task_nice(p));
 	retval = -EFAULT;
 	if (clone_flags & CLONE_PARENT_SETTID)
 		if (put_user(p->pid, parent_tidptr))
@@ -875,8 +886,7 @@ struct task_struct *copy_process(unsigne
 
 	p->proc_dentry = NULL;
 
-	INIT_LIST_HEAD(&p->run_list);
-
+	INIT_LIST_HEAD(&p->sched_info.run_list);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
 	INIT_LIST_HEAD(&p->posix_timers);
@@ -885,8 +895,6 @@ struct task_struct *copy_process(unsigne
 	spin_lock_init(&p->alloc_lock);
 	spin_lock_init(&p->switch_lock);
 	spin_lock_init(&p->proc_lock);
-
-	clear_tsk_thread_flag(p, TIF_SIGPENDING);
 	init_sigpending(&p->pending);
 
 	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
@@ -898,7 +906,6 @@ struct task_struct *copy_process(unsigne
 	p->tty_old_pgrp = 0;
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
-	p->array = NULL;
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = get_jiffies_64();
 	p->security = NULL;
@@ -948,33 +955,6 @@ struct task_struct *copy_process(unsigne
 	p->pdeath_signal = 0;
 
 	/*
-	 * Share the timeslice between parent and child, thus the
-	 * total amount of pending timeslices in the system doesn't change,
-	 * resulting in more scheduling fairness.
-	 */
-	local_irq_disable();
-        p->time_slice = (current->time_slice + 1) >> 1;
-	/*
-	 * The remainder of the first timeslice might be recovered by
-	 * the parent if the child exits early enough.
-	 */
-	p->first_time_slice = 1;
-	current->time_slice >>= 1;
-	p->timestamp = sched_clock();
-	if (!current->time_slice) {
-		/*
-	 	 * This case is rare, it happens when the parent has only
-	 	 * a single jiffy left from its timeslice. Taking the
-		 * runqueue lock is not a problem.
-		 */
-		current->time_slice = 1;
-		preempt_disable();
-		scheduler_tick(0, 0);
-		local_irq_enable();
-		preempt_enable();
-	} else
-		local_irq_enable();
-	/*
 	 * Ok, add it to the run-queues and make it
 	 * visible to the rest of the system.
 	 *
diff -prauN linux-2.6.0-test11/kernel/sched/Makefile sched-2.6.0-test11-5/kernel/sched/Makefile
--- linux-2.6.0-test11/kernel/sched/Makefile	1969-12-31 16:00:00.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/sched/Makefile	2003-12-17 03:32:21.000000000 -0800
@@ -0,0 +1 @@
+obj-y = util.o ts.o idle.o rt.o batch.o
diff -prauN linux-2.6.0-test11/kernel/sched/batch.c sched-2.6.0-test11-5/kernel/sched/batch.c
--- linux-2.6.0-test11/kernel/sched/batch.c	1969-12-31 16:00:00.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/sched/batch.c	2003-12-19 21:32:49.000000000 -0800
@@ -0,0 +1,190 @@
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <asm/page.h>
+#include "queue.h"
+
+struct batch_queue {
+	int base, tasks;
+	task_t *curr;
+	unsigned long bitmap[BITS_TO_LONGS(MAX_BATCH_PRIO)];
+	struct list_head queue[MAX_BATCH_PRIO];
+};
+
+static int batch_quantum = 1024;
+static DEFINE_PER_CPU(struct batch_queue, batch_queues);
+
+static int batch_init(struct policy *policy, int cpu)
+{
+	int k;
+	struct batch_queue *queue = &per_cpu(batch_queues, cpu);
+
+	policy->queue = (struct queue *)queue;
+	for (k = 0; k < MAX_BATCH_PRIO; ++k)
+		INIT_LIST_HEAD(&queue->queue[k]);
+	return 0;
+}
+
+static int batch_tick(struct queue *__queue, task_t *task, int user_ticks, int sys_ticks)
+{
+	struct batch_queue *queue = (struct batch_queue *)__queue;
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	cpustat->nice += user_ticks;
+	cpustat->system += sys_ticks;
+
+	task->sched_info.cl_data.bt.ticks--;
+	if (!task->sched_info.cl_data.bt.ticks) {
+		int new_idx;
+
+		task->sched_info.cl_data.bt.ticks = batch_quantum;
+		new_idx = (task->sched_info.idx + task->sched_info.cl_data.bt.prio)
+				% MAX_BATCH_PRIO;
+		if (!test_bit(new_idx, queue->bitmap))
+			__set_bit(new_idx, queue->bitmap);
+		list_move_tail(&task->sched_info.run_list,
+				&queue->queue[new_idx]);
+		if (list_empty(&queue->queue[task->sched_info.idx]))
+			__clear_bit(task->sched_info.idx, queue->bitmap);
+		task->sched_info.idx = new_idx;
+		queue->base = find_first_circular_bit(queue->bitmap,
+							queue->base,
+							MAX_BATCH_PRIO);
+		set_need_resched();
+	}
+	return 0;
+}
+
+static void batch_yield(struct queue *__queue, task_t *task)
+{
+	int new_idx;
+	struct batch_queue *queue = (struct batch_queue *)__queue;
+
+	new_idx = (queue->base + MAX_BATCH_PRIO - 1) % MAX_BATCH_PRIO;
+	if (!test_bit(new_idx, queue->bitmap))
+		__set_bit(new_idx, queue->bitmap);
+	list_move_tail(&task->sched_info.run_list, &queue->queue[new_idx]);
+	if (list_empty(&queue->queue[task->sched_info.idx]))
+		__clear_bit(task->sched_info.idx, queue->bitmap);
+	task->sched_info.idx = new_idx;
+	queue->base = find_first_circular_bit(queue->bitmap,
+						queue->base,
+						MAX_BATCH_PRIO);
+	set_need_resched();
+}
+
+static task_t *batch_curr(struct queue *__queue)
+{
+	struct batch_queue *queue = (struct batch_queue *)__queue;
+	return queue->curr;
+}
+
+static void batch_set_curr(struct queue *__queue, task_t *task)
+{
+	struct batch_queue *queue = (struct batch_queue *)__queue;
+	queue->curr = task;
+}
+
+static task_t *batch_best(struct queue *__queue)
+{
+	int idx;
+	struct batch_queue *queue = (struct batch_queue *)__queue;
+
+	idx = find_first_circular_bit(queue->bitmap,
+					queue->base,
+					MAX_BATCH_PRIO);
+	BUG_ON(idx >= MAX_BATCH_PRIO);
+	BUG_ON(list_empty(&queue->queue[idx]));
+	return list_entry(queue->queue[idx].next, task_t, sched_info.run_list);
+}
+
+static void batch_enqueue(struct queue *__queue, task_t *task)
+{
+	int idx;
+	struct batch_queue *queue = (struct batch_queue *)__queue;
+
+	idx = (queue->base + task->sched_info.cl_data.bt.prio) % MAX_BATCH_PRIO;
+	if (!test_bit(idx, queue->bitmap))
+		__set_bit(idx, queue->bitmap);
+	list_add_tail(&task->sched_info.run_list, &queue->queue[idx]);
+	task->sched_info.idx = idx;
+	task->sched_info.cl_data.bt.ticks = batch_quantum;
+	queue->tasks++;
+	if (!queue->curr)
+		queue->curr = task;
+}
+
+static void batch_dequeue(struct queue *__queue, task_t *task)
+{
+	struct batch_queue *queue = (struct batch_queue *)__queue;
+	list_del(&task->sched_info.run_list);
+	if (list_empty(&queue->queue[task->sched_info.idx]))
+		__clear_bit(task->sched_info.idx, queue->bitmap);
+	queue->tasks--;
+	if (!queue->tasks)
+		queue->curr = NULL;
+	else if (task == queue->curr)
+		queue->curr = batch_best(__queue);
+}
+
+static int batch_preempt(struct queue *__queue, task_t *task)
+{
+	struct batch_queue *queue = (struct batch_queue *)__queue;
+	if (!queue->curr)
+		return 1;
+	else
+		return task->sched_info.cl_data.bt.prio
+				< queue->curr->sched_info.cl_data.bt.prio;
+}
+
+static int batch_tasks(struct queue *__queue)
+{
+	struct batch_queue *queue = (struct batch_queue *)__queue;
+	return queue->tasks;
+}
+
+static int batch_nice(struct queue *queue, task_t *task)
+{
+	return 20;
+}
+
+static int batch_prio(task_t *task)
+{
+	return USER_PRIO(task->sched_info.cl_data.bt.prio + MIN_BATCH_PRIO);
+}
+
+static void batch_setprio(task_t *task, int prio)
+{
+	BUG_ON(prio < 0);
+	BUG_ON(prio >= MAX_BATCH_PRIO);
+	task->sched_info.cl_data.bt.prio = prio;
+}
+
+struct queue_ops batch_ops = {
+	.init		= batch_init,
+	.fini		= nop_fini,
+	.tick		= batch_tick,
+	.yield		= batch_yield,
+	.curr		= batch_curr,
+	.set_curr	= batch_set_curr,
+	.tasks		= batch_tasks,
+	.best		= batch_best,
+	.enqueue	= batch_enqueue,
+	.dequeue	= batch_dequeue,
+	.start_wait	= queue_nop,
+	.stop_wait	= queue_nop,
+	.sleep		= queue_nop,
+	.wake		= queue_nop,
+	.preempt	= batch_preempt,
+	.nice		= batch_nice,
+	.renice		= nop_renice,
+	.prio		= batch_prio,
+	.setprio	= batch_setprio,
+	.timeslice	= nop_timeslice,
+	.set_timeslice	= nop_set_timeslice,
+};
+
+struct policy batch_policy = {
+	.ops	= &batch_ops,
+};
diff -prauN linux-2.6.0-test11/kernel/sched/idle.c sched-2.6.0-test11-5/kernel/sched/idle.c
--- linux-2.6.0-test11/kernel/sched/idle.c	1969-12-31 16:00:00.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/sched/idle.c	2003-12-19 17:31:39.000000000 -0800
@@ -0,0 +1,99 @@
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <asm/page.h>
+#include "queue.h"
+
+static DEFINE_PER_CPU(task_t *, idle_tasks) = NULL;
+
+static int idle_nice(struct queue *queue, task_t *task)
+{
+	return 20;
+}
+
+static int idle_tasks(struct queue *queue)
+{
+	task_t **idle = (task_t **)queue;
+	return !!(*idle);
+}
+
+static task_t *idle_task(struct queue *queue)
+{
+	return *((task_t **)queue);
+}
+
+static void idle_yield(struct queue *queue, task_t *task)
+{
+	set_need_resched();
+}
+
+static void idle_enqueue(struct queue *queue, task_t *task)
+{
+	task_t **idle = (task_t **)queue;
+	*idle = task;
+}
+
+static void idle_dequeue(struct queue *queue, task_t *task)
+{
+}
+
+static int idle_preempt(struct queue *queue, task_t *task)
+{
+	return 0;
+}
+
+static int idle_tick(struct queue *queue, task_t *task, int user_ticks, int sys_ticks)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	runqueue_t *rq = &per_cpu(runqueues, smp_processor_id());
+
+	if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat->iowait += sys_ticks;
+	else
+		cpustat->idle += sys_ticks;
+	return 1;
+}
+
+static int idle_init(struct policy *policy, int cpu)
+{
+	policy->queue = (struct queue *)&per_cpu(idle_tasks, cpu);
+	return 0;
+}
+
+static int idle_prio(task_t *task)
+{
+	return MAX_USER_PRIO;
+}
+
+static void idle_setprio(task_t *task, int prio)
+{
+}
+
+static struct queue_ops idle_ops = {
+	.init		= idle_init,
+	.fini		= nop_fini,
+	.tick		= idle_tick,
+	.yield		= idle_yield,
+	.curr		= idle_task,
+	.set_curr	= queue_nop,
+	.tasks		= idle_tasks,
+	.best		= idle_task,
+	.enqueue	= idle_enqueue,
+	.dequeue	= idle_dequeue,
+	.start_wait	= queue_nop,
+	.stop_wait	= queue_nop,
+	.sleep		= queue_nop,
+	.wake		= queue_nop,
+	.preempt	= idle_preempt,
+	.nice		= idle_nice,
+	.renice		= nop_renice,
+	.prio		= idle_prio,
+	.setprio	= idle_setprio,
+	.timeslice	= nop_timeslice,
+	.set_timeslice	= nop_set_timeslice,
+};
+
+struct policy idle_policy = {
+	.ops	= &idle_ops,
+};
diff -prauN linux-2.6.0-test11/kernel/sched/queue.h sched-2.6.0-test11-5/kernel/sched/queue.h
--- linux-2.6.0-test11/kernel/sched/queue.h	1969-12-31 16:00:00.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/sched/queue.h	2003-12-23 03:58:02.000000000 -0800
@@ -0,0 +1,104 @@
+#define SCHED_POLICY_RT		0
+#define SCHED_POLICY_TS		1
+#define SCHED_POLICY_BATCH	2
+#define SCHED_POLICY_IDLE	3
+
+#define RT_POLICY_FIFO		0
+#define RT_POLICY_RR		1
+
+#define NODE_THRESHOLD		125
+
+struct queue;
+struct queue_ops;
+
+struct policy {
+	struct queue *queue;
+	struct queue_ops *ops;
+};
+
+extern struct policy rt_policy, ts_policy, batch_policy, idle_policy;
+
+struct runqueue {
+        spinlock_t lock;
+	int curr;
+	task_t *__curr;
+	unsigned long policy_bitmap;
+	struct policy *policies[BITS_PER_LONG];
+        unsigned long nr_running, nr_switches, nr_uninterruptible;
+        struct mm_struct *prev_mm;
+        int prev_cpu_load[NR_CPUS];
+#ifdef CONFIG_NUMA
+        atomic_t *node_nr_running;
+        int prev_node_load[MAX_NUMNODES];
+#endif
+        task_t *migration_thread;
+        struct list_head migration_queue;
+
+        atomic_t nr_iowait;
+};
+
+typedef struct runqueue runqueue_t;
+
+struct queue_ops {
+	int (*init)(struct policy *, int);
+	void (*fini)(struct policy *, int);
+	task_t *(*curr)(struct queue *);
+	void (*set_curr)(struct queue *, task_t *);
+	task_t *(*best)(struct queue *);
+	int (*tick)(struct queue *, task_t *, int, int);
+	int (*tasks)(struct queue *);
+	void (*enqueue)(struct queue *, task_t *);
+	void (*dequeue)(struct queue *, task_t *);
+	void (*start_wait)(struct queue *, task_t *);
+	void (*stop_wait)(struct queue *, task_t *);
+	void (*sleep)(struct queue *, task_t *);
+	void (*wake)(struct queue *, task_t *);
+	int (*preempt)(struct queue *, task_t *);
+	void (*yield)(struct queue *, task_t *);
+	int (*prio)(task_t *);
+	void (*setprio)(task_t *, int);
+	int (*nice)(struct queue *, task_t *);
+	void (*renice)(struct queue *, task_t *, int);
+	unsigned long (*timeslice)(struct queue *, task_t *);
+	void (*set_timeslice)(struct queue *, task_t *, unsigned long);
+};
+
+DECLARE_PER_CPU(runqueue_t, runqueues);
+
+int find_first_circular_bit(unsigned long *, int, int);
+void queue_nop(struct queue *, task_t *);
+void nop_renice(struct queue *, task_t *, int);
+void nop_fini(struct policy *, int);
+unsigned long nop_timeslice(struct queue *, task_t *);
+void nop_set_timeslice(struct queue *, task_t *, unsigned long);
+
+/* #define DEBUG_SCHED */
+
+#ifdef DEBUG_SCHED
+#define __check_task_policy(idx)					\
+do {									\
+	unsigned long __idx__ = (idx);					\
+	if (__idx__ > SCHED_POLICY_IDLE) {				\
+		printk("invalid policy 0x%lx\n", __idx__);		\
+		BUG();							\
+	}								\
+} while (0)
+
+#define check_task_policy(task)						\
+do {									\
+	__check_task_policy((task)->sched_info.policy);			\
+} while (0)
+
+#define check_policy(policy)						\
+do {									\
+	BUG_ON((policy) != &rt_policy &&				\
+		(policy) != &ts_policy &&				\
+		(policy) != &batch_policy &&				\
+		(policy) != &idle_policy);				\
+} while (0)
+
+#else /* !DEBUG_SCHED */
+#define __check_task_policy(idx)			do { } while (0)
+#define check_task_policy(task)				do { } while (0)
+#define check_policy(policy)				do { } while (0)
+#endif /* !DEBUG_SCHED */
diff -prauN linux-2.6.0-test11/kernel/sched/rt.c sched-2.6.0-test11-5/kernel/sched/rt.c
--- linux-2.6.0-test11/kernel/sched/rt.c	1969-12-31 16:00:00.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/sched/rt.c	2003-12-19 18:16:07.000000000 -0800
@@ -0,0 +1,208 @@
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <asm/page.h>
+#include "queue.h"
+
+#ifdef DEBUG_SCHED
+#define check_rt_policy(task)						\
+do {									\
+	BUG_ON((task)->sched_info.policy != SCHED_POLICY_RT);		\
+	BUG_ON((task)->sched_info.cl_data.rt.rt_policy != RT_POLICY_RR	\
+			&&						\
+	      (task)->sched_info.cl_data.rt.rt_policy!=RT_POLICY_FIFO);	\
+	BUG_ON((task)->sched_info.cl_data.rt.prio < 0);			\
+	BUG_ON((task)->sched_info.cl_data.rt.prio >= MAX_RT_PRIO);	\
+} while (0)
+#else
+#define check_rt_policy(task)				do { } while (0)
+#endif
+
+struct rt_queue {
+	unsigned long bitmap[BITS_TO_LONGS(MAX_RT_PRIO)];
+	struct list_head queue[MAX_RT_PRIO];
+	task_t *curr;
+	int tasks;
+};
+
+static DEFINE_PER_CPU(struct rt_queue, rt_queues);
+
+static int rt_init(struct policy *policy, int cpu)
+{
+	int k;
+	struct rt_queue *queue = &per_cpu(rt_queues, cpu);
+
+	policy->queue = (struct queue *)queue;
+	for (k = 0; k < MAX_RT_PRIO; ++k)
+		INIT_LIST_HEAD(&queue->queue[k]);
+	return 0;
+}
+
+static void rt_yield(struct queue *__queue, task_t *task)
+{
+	struct rt_queue *queue = (struct rt_queue *)__queue;
+	check_rt_policy(task);
+	list_del(&task->sched_info.run_list);
+	if (list_empty(&queue->queue[task->sched_info.cl_data.rt.prio]))
+		set_need_resched();
+	list_add_tail(&task->sched_info.run_list,
+			&queue->queue[task->sched_info.cl_data.rt.prio]);
+	check_rt_policy(task);
+}
+
+static int rt_tick(struct queue *queue, task_t *task, int user_ticks, int sys_ticks)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	check_rt_policy(task);
+	cpustat->user += user_ticks;
+	cpustat->system += sys_ticks;
+	if (task->sched_info.cl_data.rt.rt_policy == RT_POLICY_RR) {
+		task->sched_info.cl_data.rt.ticks--;
+		if (!task->sched_info.cl_data.rt.ticks) {
+			task->sched_info.cl_data.rt.ticks =
+				task->sched_info.cl_data.rt.quantum;
+			rt_yield(queue, task);
+		}
+	}
+	check_rt_policy(task);
+	return 0;
+}
+
+static task_t *rt_curr(struct queue *__queue)
+{
+	struct rt_queue *queue = (struct rt_queue *)__queue;
+	task_t *task = queue->curr;
+	check_rt_policy(task);
+	return task;
+}
+
+static void rt_set_curr(struct queue *__queue, task_t *task)
+{
+	struct rt_queue *queue = (struct rt_queue *)__queue;
+	queue->curr = task;
+	check_rt_policy(task);
+}
+
+static task_t *rt_best(struct queue *__queue)
+{
+	struct rt_queue *queue = (struct rt_queue *)__queue;
+	task_t *task;
+	int idx;
+	idx = find_first_bit(queue->bitmap, MAX_RT_PRIO);
+	BUG_ON(idx >= MAX_RT_PRIO);
+	task = list_entry(queue->queue[idx].next, task_t, sched_info.run_list);
+	check_rt_policy(task);
+	return task;
+}
+
+static void rt_enqueue(struct queue *__queue, task_t *task)
+{
+	struct rt_queue *queue = (struct rt_queue *)__queue;
+	check_rt_policy(task);
+	if (!test_bit(task->sched_info.cl_data.rt.prio, queue->bitmap))
+		__set_bit(task->sched_info.cl_data.rt.prio, queue->bitmap);
+	list_add_tail(&task->sched_info.run_list,
+			&queue->queue[task->sched_info.cl_data.rt.prio]);
+	check_rt_policy(task);
+	queue->tasks++;
+	if (!queue->curr)
+		queue->curr = task;
+}
+
+static void rt_dequeue(struct queue *__queue, task_t *task)
+{
+	struct rt_queue *queue = (struct rt_queue *)__queue;
+	check_rt_policy(task);
+	list_del(&task->sched_info.run_list);
+	if (list_empty(&queue->queue[task->sched_info.cl_data.rt.prio]))
+		__clear_bit(task->sched_info.cl_data.rt.prio, queue->bitmap);
+	queue->tasks--;
+	check_rt_policy(task);
+	if (!queue->tasks)
+		queue->curr = NULL;
+	else if (task == queue->curr)
+		queue->curr = rt_best(__queue);
+}
+
+static int rt_preempt(struct queue *__queue, task_t *task)
+{
+	struct rt_queue *queue = (struct rt_queue *)__queue;
+	check_rt_policy(task);
+	if (!queue->curr)
+		return 1;
+	check_rt_policy(queue->curr);
+	return task->sched_info.cl_data.rt.prio
+			< queue->curr->sched_info.cl_data.rt.prio;
+}
+
+static int rt_tasks(struct queue *__queue)
+{
+	struct rt_queue *queue = (struct rt_queue *)__queue;
+	return queue->tasks;
+}
+
+static int rt_nice(struct queue *queue, task_t *task)
+{
+	check_rt_policy(task);
+	return -20;
+}
+
+static unsigned long rt_timeslice(struct queue *queue, task_t *task)
+{
+	check_rt_policy(task);
+	if (task->sched_info.cl_data.rt.rt_policy != RT_POLICY_RR)
+		return 0;
+	else
+		return task->sched_info.cl_data.rt.quantum;
+}
+
+static void rt_set_timeslice(struct queue *queue, task_t *task, unsigned long n)
+{
+	check_rt_policy(task);
+	if (task->sched_info.cl_data.rt.rt_policy == RT_POLICY_RR)
+		task->sched_info.cl_data.rt.quantum = n;
+	check_rt_policy(task);
+}
+
+static void rt_setprio(task_t *task, int prio)
+{
+	check_rt_policy(task);
+	BUG_ON(prio < 0);
+	BUG_ON(prio >= MAX_RT_PRIO);
+	task->sched_info.cl_data.rt.prio = prio;
+}
+
+static int rt_prio(task_t *task)
+{
+	check_rt_policy(task);
+	return USER_PRIO(task->sched_info.cl_data.rt.prio);
+}
+
+static struct queue_ops rt_ops = {
+	.init		= rt_init,
+	.fini		= nop_fini,
+	.tick		= rt_tick,
+	.yield		= rt_yield,
+	.curr		= rt_curr,
+	.set_curr	= rt_set_curr,
+	.tasks		= rt_tasks,
+	.best		= rt_best,
+	.enqueue	= rt_enqueue,
+	.dequeue	= rt_dequeue,
+	.start_wait	= queue_nop,
+	.stop_wait	= queue_nop,
+	.sleep		= queue_nop,
+	.wake		= queue_nop,
+	.preempt	= rt_preempt,
+	.nice		= rt_nice,
+	.renice		= nop_renice,
+	.prio		= rt_prio,
+	.setprio	= rt_setprio,
+	.timeslice	= rt_timeslice,
+	.set_timeslice	= rt_set_timeslice,
+};
+
+struct policy rt_policy = {
+	.ops	= &rt_ops,
+};
diff -prauN linux-2.6.0-test11/kernel/sched/ts.c sched-2.6.0-test11-5/kernel/sched/ts.c
--- linux-2.6.0-test11/kernel/sched/ts.c	1969-12-31 16:00:00.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/sched/ts.c	2003-12-23 08:24:55.000000000 -0800
@@ -0,0 +1,841 @@
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <asm/page.h>
+#include "queue.h"
+
+#ifdef DEBUG_SCHED
+#define check_ts_policy(task)						\
+do {									\
+	BUG_ON((task)->sched_info.policy != SCHED_POLICY_TS);		\
+} while (0)
+
+#define check_nice(__queue__)						\
+({									\
+	int __k__, __count__ = 0;					\
+	if ((__queue__)->tasks < 0) {					\
+		printk("negative nice task count %d\n", 		\
+			(__queue__)->tasks);				\
+		BUG();							\
+	}								\
+	for (__k__ = 0; __k__ < NICE_QLEN; ++__k__) {			\
+		task_t *__task__;					\
+		if (list_empty(&(__queue__)->queue[__k__])) {		\
+			if (test_bit(__k__, (__queue__)->bitmap)) {	\
+				printk("wrong nice bit set\n");		\
+				BUG();					\
+			}						\
+		} else {						\
+			if (!test_bit(__k__, (__queue__)->bitmap)) {	\
+				printk("wrong nice bit clear\n");	\
+				BUG();					\
+			}						\
+		}							\
+		list_for_each_entry(__task__,				\
+					&(__queue__)->queue[__k__],	\
+					sched_info.run_list) {		\
+			check_ts_policy(__task__);			\
+			if (__task__->sched_info.idx != __k__) {	\
+				printk("nice index mismatch\n");	\
+				BUG();					\
+			}						\
+			++__count__;					\
+		}							\
+	}								\
+	if ((__queue__)->tasks != __count__) {				\
+		printk("wrong nice task count\n");			\
+		printk("expected %d, got %d\n",				\
+			(__queue__)->tasks,				\
+			__count__);					\
+		BUG();							\
+	}								\
+	__count__;							\
+})
+
+#define check_queue(__queue)						\
+do {									\
+	int __k, __count = 0;						\
+	if ((__queue)->tasks < 0) {					\
+		printk("negative queue task count %d\n", 		\
+			(__queue)->tasks);				\
+		BUG();							\
+	}								\
+	for (__k = 0; __k < 40; ++__k) {				\
+		struct nice_queue *__nice;				\
+		if (list_empty(&(__queue)->nices[__k])) {		\
+			if (test_bit(__k, (__queue)->bitmap)) {		\
+				printk("wrong queue bit set\n");	\
+				BUG();					\
+			}						\
+		} else {						\
+			if (!test_bit(__k, (__queue)->bitmap)) {	\
+				printk("wrong queue bit clear\n");	\
+				BUG();					\
+			}						\
+		}							\
+		list_for_each_entry(__nice,				\
+					&(__queue)->nices[__k],		\
+					list) {				\
+			__count += check_nice(__nice);			\
+			if (__nice->idx != __k) {			\
+				printk("queue index mismatch\n");	\
+				BUG();					\
+			}						\
+		}							\
+	}								\
+	if ((__queue)->tasks != __count) {				\
+		printk("wrong queue task count\n");			\
+		printk("expected %d, got %d\n",				\
+			(__queue)->tasks,				\
+			__count);					\
+		BUG();							\
+	}								\
+} while (0)
+
+#else /* !DEBUG_SCHED */
+#define check_ts_policy(task)			do { } while (0)
+#define check_nice(nice)			do { } while (0)
+#define check_queue(queue)			do { } while (0)
+#endif
+
+/*
+ * Hybrid deadline/multilevel scheduling. Cpu utilization
+ * -dependent deadlines at wake. Queue rotation every 50ms or when
+ * demotions empty the highest level, setting demoted deadlines
+ * relative to the new highest level. Intra-level RR quantum at 10ms.
+ */
+struct nice_queue {
+	int idx, nice, base, tasks, level_quantum, expired;
+	unsigned long bitmap[BITS_TO_LONGS(NICE_QLEN)];
+	struct list_head list, queue[NICE_QLEN];
+	task_t *curr;
+};
+
+/*
+ * Deadline schedule nice levels with priority-dependent deadlines,
+ * default quantum of 100ms. Queue rotates at demotions emptying the
+ * highest level, setting the demoted deadline relative to the new
+ * highest level.
+ */
+struct ts_queue {
+	struct nice_queue nice_levels[40];
+	struct list_head nices[40];
+	int base, quantum, tasks;
+	unsigned long bitmap[BITS_TO_LONGS(40)];
+	struct nice_queue *curr;
+};
+
+/*
+ * Make these sysctl-tunable.
+ */
+static int nice_quantum = 100;
+static int rr_quantum = 10;
+static int level_quantum = 50;
+static int sample_interval = HZ;
+
+static DEFINE_PER_CPU(struct ts_queue, ts_queues);
+
+static task_t *nice_best(struct nice_queue *);
+static struct nice_queue *ts_best_nice(struct ts_queue *);
+
+static void nice_init(struct nice_queue *queue)
+{
+	int k;
+
+	INIT_LIST_HEAD(&queue->list);
+	for (k = 0; k < NICE_QLEN; ++k) {
+		INIT_LIST_HEAD(&queue->queue[k]);
+	}
+}
+
+static int ts_init(struct policy *policy, int cpu)
+{
+	int k;
+	struct ts_queue *queue = &per_cpu(ts_queues, cpu);
+
+	policy->queue = (struct queue *)queue;
+	queue->quantum = nice_quantum;
+
+	for (k = 0; k < 40; ++k) {
+		nice_init(&queue->nice_levels[k]);
+		queue->nice_levels[k].nice = k;
+		INIT_LIST_HEAD(&queue->nices[k]);
+	}
+	return 0;
+}
+
+static int task_deadline(task_t *task)
+{
+	u64 frac_cpu = task->sched_info.cl_data.ts.frac_cpu;
+	frac_cpu *= (u64)NICE_QLEN;
+	frac_cpu >>= 32;
+	return (int)min((u32)(NICE_QLEN - 1), (u32)frac_cpu);
+}
+
+static void nice_rotate_queue(struct nice_queue *queue)
+{
+	int idx, new_idx, deadline, idxdiff;
+	task_t *task = queue->curr;
+
+	check_nice(queue);
+
+	/* shit what if idxdiff == NICE_QLEN - 1?? */
+	idx = queue->curr->sched_info.idx;
+	idxdiff = (idx - queue->base + NICE_QLEN) % NICE_QLEN;
+	deadline = min(1 + task_deadline(task), NICE_QLEN - idxdiff - 1);
+	new_idx = (idx + deadline) % NICE_QLEN;
+#if 0
+	if (idx == new_idx) {
+		/*
+		 * buggy; it sets queue->base = idx because in this case
+		 * we have task_deadline(task) == 0
+		 */
+		new_idx = (idx - task_deadline(task) + NICE_QLEN) % NICE_QLEN;
+		if (queue->base != new_idx)
+			queue->base = new_idx;
+		return;
+	}
+	BUG_ON(!deadline);
+	BUG_ON(queue->base <= new_idx && new_idx <= idx);
+	BUG_ON(idx < queue->base && queue->base <= new_idx);
+	BUG_ON(new_idx <= idx && idx < queue->base);
+	if (0 && idx == new_idx) {
+		printk("FUCKUP: pid = %d, tdl = %d, dl = %d, idx = %d, "
+				"base = %d, diff = %d, fcpu = 0x%lx\n",
+			queue->curr->pid,
+			task_deadline(queue->curr),
+			deadline,
+			idx,
+			queue->base,
+			idxdiff,
+			task->sched_info.cl_data.ts.frac_cpu);
+		BUG();
+	}
+#else
+	/*
+	 * RR in the last deadline
+	 * special-cased so as not to trip BUG_ON()'s below
+	 */
+	if (idx == new_idx) {
+		/* if we got here these two things must hold */
+		BUG_ON(idxdiff != NICE_QLEN - 1);
+		BUG_ON(deadline);
+		list_move_tail(&task->sched_info.run_list, &queue->queue[idx]);
+		if (queue->expired) {
+			queue->level_quantum = level_quantum;
+			queue->expired = 0;
+		}
+		return;
+	}
+#endif
+	task->sched_info.idx = new_idx;
+	if (!test_bit(new_idx, queue->bitmap)) {
+		BUG_ON(!list_empty(&queue->queue[new_idx]));
+		__set_bit(new_idx, queue->bitmap);
+	}
+	list_move_tail(&task->sched_info.run_list,
+			&queue->queue[new_idx]);
+
+	/* expired until list drains */
+	if (!list_empty(&queue->queue[idx]))
+		queue->expired = 1;
+	else {
+		int k, w, m = NICE_QLEN % BITS_PER_LONG;
+		BUG_ON(!test_bit(idx, queue->bitmap));
+		__clear_bit(idx, queue->bitmap);
+
+		for (w = 0, k = 0; k < NICE_QLEN/BITS_PER_LONG; ++k)
+			w += hweight_long(queue->bitmap[k]);
+		if (NICE_QLEN % BITS_PER_LONG)
+			w += hweight_long(queue->bitmap[k] & ((1UL << m) - 1));
+		if (w > 1)
+			queue->base = (queue->base + 1) % NICE_QLEN;
+		queue->level_quantum = level_quantum;
+		queue->expired = 0;
+	}
+	check_nice(queue);
+}
+
+static void nice_tick(struct nice_queue *queue, task_t *task)
+{
+	int idx = task->sched_info.idx;
+	BUG_ON(!task_queued(task));
+	BUG_ON(task != queue->curr);
+	BUG_ON(!test_bit(idx, queue->bitmap));
+	BUG_ON(list_empty(&queue->queue[idx]));
+	check_ts_policy(task);
+	check_nice(queue);
+
+	if (task->sched_info.cl_data.ts.ticks)
+		task->sched_info.cl_data.ts.ticks--;
+
+	if (queue->level_quantum > level_quantum) {
+		WARN_ON(1);
+		queue->level_quantum = 1;
+	}
+
+	if (!queue->expired) {
+		if (queue->level_quantum)
+			queue->level_quantum--;
+	} else if (0 && queue->queue[idx].prev != &task->sched_info.run_list) {
+		int queued = 0, new_idx = (queue->base + 1) % NICE_QLEN;
+		task_t *curr, *sav;
+		task_t *victim = list_entry(queue->queue[idx].prev,
+						task_t,
+						sched_info.run_list);
+		victim->sched_info.idx = new_idx;
+		if (!test_bit(new_idx, queue->bitmap))
+			__set_bit(new_idx, queue->bitmap);
+#if 1
+		list_for_each_entry_safe(curr, sav, &queue->queue[new_idx], sched_info.run_list) {
+			if (victim->sched_info.cl_data.ts.frac_cpu
+				< curr->sched_info.cl_data.ts.frac_cpu) {
+				queued = 1;
+				list_move(&victim->sched_info.run_list,
+						curr->sched_info.run_list.prev);
+				break;
+			}
+		}
+		if (!queued)
+			list_move_tail(&victim->sched_info.run_list,
+					&queue->queue[new_idx]);
+#else
+		list_move(&victim->sched_info.run_list, &queue->queue[new_idx]);
+#endif
+		BUG_ON(list_empty(&queue->queue[idx]));
+	}
+
+	if (!queue->level_quantum && !queue->expired) {
+		check_nice(queue);
+		nice_rotate_queue(queue);
+		check_nice(queue);
+		set_need_resched();
+	} else if (!task->sched_info.cl_data.ts.ticks) {
+		int idxdiff = (idx - queue->base + NICE_QLEN) % NICE_QLEN;
+		check_nice(queue);
+		task->sched_info.cl_data.ts.ticks = rr_quantum;
+		BUG_ON(!test_bit(idx, queue->bitmap));
+		BUG_ON(list_empty(&queue->queue[idx]));
+		if (queue->expired)
+			nice_rotate_queue(queue);
+		else if (idxdiff == NICE_QLEN - 1)
+			list_move_tail(&task->sched_info.run_list,
+					&queue->queue[idx]);
+		else {
+			int new_idx = (idx + 1) % NICE_QLEN;
+			list_del(&task->sched_info.run_list);
+			if (list_empty(&queue->queue[idx])) {
+				BUG_ON(!test_bit(idx, queue->bitmap));
+				__clear_bit(idx, queue->bitmap);
+			}
+			if (!test_bit(new_idx, queue->bitmap)) {
+				BUG_ON(!list_empty(&queue->queue[new_idx]));
+				__set_bit(new_idx, queue->bitmap);
+			}
+			task->sched_info.idx = new_idx;
+			list_add(&task->sched_info.run_list,
+					&queue->queue[new_idx]);
+		}
+		check_nice(queue);
+		set_need_resched();
+	}
+	check_nice(queue);
+	check_ts_policy(task);
+}
+
+static void ts_rotate_queue(struct ts_queue *queue)
+{
+	int idx, new_idx, idxdiff, off, deadline;
+
+	queue->base = find_first_circular_bit(queue->bitmap, queue->base, 40);
+
+	/* shit what if idxdiff == 39?? */
+	check_queue(queue);
+	idx = queue->curr->idx;
+	idxdiff = (idx - queue->base + 40) % 40;
+	off = (int)(queue->curr - queue->nice_levels);
+	deadline = min(1 + off, 40 - idxdiff - 1);
+	new_idx = (idx + deadline) % 40;
+	if (idx == new_idx) {
+		new_idx = (idx - off + 40) % 40;
+		if (queue->base != new_idx)
+			queue->base = new_idx;
+		return;
+	}
+	BUG_ON(!deadline);
+	BUG_ON(queue->base <= new_idx && new_idx <= idx);
+	BUG_ON(idx < queue->base && queue->base <= new_idx);
+	BUG_ON(new_idx <= idx && idx < queue->base);
+	if (!test_bit(new_idx, queue->bitmap)) {
+		BUG_ON(!list_empty(&queue->nices[new_idx]));
+		__set_bit(new_idx, queue->bitmap);
+	}
+	list_move_tail(&queue->curr->list, &queue->nices[new_idx]);
+	queue->curr->idx = new_idx;
+
+	if (list_empty(&queue->nices[idx])) {
+		BUG_ON(!test_bit(idx, queue->bitmap));
+		__clear_bit(idx, queue->bitmap);
+		queue->base = (queue->base + 1) % 40;
+	}
+	check_queue(queue);
+}
+
+static int ts_tick(struct queue *__queue, task_t *task, int user_ticks, int sys_ticks)
+{
+	struct ts_queue *queue = (struct ts_queue *)__queue;
+	struct nice_queue *nice = queue->curr;
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	int nice_idx = (int)(queue->curr - queue->nice_levels);
+	unsigned long sample_end, delta;
+
+	check_queue(queue);
+	check_ts_policy(task);
+	BUG_ON(!nice);
+	BUG_ON(nice_idx != task->sched_info.cl_data.ts.nice);
+	BUG_ON(!test_bit(nice->idx, queue->bitmap));
+	BUG_ON(list_empty(&queue->nices[nice->idx]));
+
+	sample_end = jiffies;
+	delta = sample_end - task->sched_info.cl_data.ts.sample_start;
+	if (delta)
+		task->sched_info.cl_data.ts.sample_ticks++;
+	else {
+		task->sched_info.cl_data.ts.sample_start = jiffies;
+		task->sched_info.cl_data.ts.sample_ticks = 1;
+	}
+
+	if (delta >= sample_interval) {
+		u64 frac_cpu;
+		frac_cpu = (u64)task->sched_info.cl_data.ts.sample_ticks << 32;
+		do_div(frac_cpu, delta);
+		frac_cpu = 2*frac_cpu + task->sched_info.cl_data.ts.frac_cpu;
+		do_div(frac_cpu, 3);
+		frac_cpu = min(frac_cpu, (1ULL << 32) - 1);
+		task->sched_info.cl_data.ts.frac_cpu = (unsigned long)frac_cpu;
+		task->sched_info.cl_data.ts.sample_start = sample_end;
+		task->sched_info.cl_data.ts.sample_ticks = 0;
+	}
+
+	cpustat->user += user_ticks;
+	cpustat->system += sys_ticks;
+	nice_tick(nice, task);
+	if (queue->quantum > nice_quantum) {
+		queue->quantum = 0;
+		WARN_ON(1);
+	} else if (queue->quantum)
+		queue->quantum--;
+	if (!queue->quantum) {
+		queue->quantum = nice_quantum;
+		ts_rotate_queue(queue);
+		set_need_resched();
+	}
+	check_queue(queue);
+	check_ts_policy(task);
+	return 0;
+}
+
+static void nice_yield(struct nice_queue *queue, task_t *task)
+{
+	int idx, new_idx = (queue->base + NICE_QLEN - 1) % NICE_QLEN;
+
+	check_nice(queue);
+	check_ts_policy(task);
+	if (!test_bit(new_idx, queue->bitmap)) {
+		BUG_ON(!list_empty(&queue->queue[new_idx]));
+		__set_bit(new_idx, queue->bitmap);
+	}
+	list_move_tail(&task->sched_info.run_list, &queue->queue[new_idx]);
+	idx = task->sched_info.idx;
+	task->sched_info.idx = new_idx;
+	set_need_resched();
+
+	if (list_empty(&queue->queue[idx])) {
+		BUG_ON(!test_bit(idx, queue->bitmap));
+		__clear_bit(idx, queue->bitmap);
+	}
+	queue->curr = nice_best(queue);
+#if 0
+	if (queue->curr->sched_info.idx != queue->base)
+		queue->base = queue->curr->sched_info.idx;
+#endif
+	check_nice(queue);
+	check_ts_policy(task);
+}
+
+/*
+ * This is somewhat problematic; nice_yield() only parks tasks on
+ * the end of their current nice levels.
+ */
+static void ts_yield(struct queue *__queue, task_t *task)
+{
+	struct ts_queue *queue = (struct ts_queue *)__queue;
+	struct nice_queue *nice = queue->curr;
+
+	check_queue(queue);
+	check_ts_policy(task);
+	nice_yield(nice, task);
+
+	/*
+	 * If there's no one to yield to, move the whole nice level.
+	 * If this is problematic, setting nice-dependent deadlines
+	 * on a single unified queue may be in order.
+	 */
+	if (nice->tasks == 1) {
+		int idx, new_idx = (queue->base + 40 - 1) % 40;
+		idx = nice->idx;
+		if (!test_bit(new_idx, queue->bitmap)) {
+			BUG_ON(!list_empty(&queue->nices[new_idx]));
+			__set_bit(new_idx, queue->bitmap);
+		}
+		list_move_tail(&nice->list, &queue->nices[new_idx]);
+		if (list_empty(&queue->nices[idx])) {
+			BUG_ON(!test_bit(idx, queue->bitmap));
+			__clear_bit(idx, queue->bitmap);
+		}
+		nice->idx = new_idx;
+		queue->base = find_first_circular_bit(queue->bitmap,
+							queue->base,
+							40);
+		BUG_ON(queue->base >= 40);
+		BUG_ON(!test_bit(queue->base, queue->bitmap));
+		queue->curr = ts_best_nice(queue);
+	}
+	check_queue(queue);
+	check_ts_policy(task);
+}
+
+static task_t *ts_curr(struct queue *__queue)
+{
+	struct ts_queue *queue = (struct ts_queue *)__queue;
+	task_t *task = queue->curr->curr;
+	check_queue(queue);
+	if (task)
+		check_ts_policy(task);
+	return task;
+}
+
+static void ts_set_curr(struct queue *__queue, task_t *task)
+{
+	struct ts_queue *queue = (struct ts_queue *)__queue;
+	struct nice_queue *nice;
+	check_queue(queue);
+	check_ts_policy(task);
+	nice = &queue->nice_levels[task->sched_info.cl_data.ts.nice];
+	queue->curr = nice;
+	nice->curr = task;
+	check_queue(queue);
+	check_ts_policy(task);
+}
+
+static task_t *nice_best(struct nice_queue *queue)
+{
+	task_t *task;
+	int idx = find_first_circular_bit(queue->bitmap,
+						queue->base,
+						NICE_QLEN);
+	check_nice(queue);
+	if (idx >= NICE_QLEN)
+		return NULL;
+	BUG_ON(list_empty(&queue->queue[idx]));
+	BUG_ON(!test_bit(idx, queue->bitmap));
+	task = list_entry(queue->queue[idx].next, task_t, sched_info.run_list);
+	check_nice(queue);
+	check_ts_policy(task);
+	return task;
+}
+
+static struct nice_queue *ts_best_nice(struct ts_queue *queue)
+{
+	int idx = find_first_circular_bit(queue->bitmap, queue->base, 40);
+	check_queue(queue);
+	if (idx >= 40)
+		return NULL;
+	BUG_ON(list_empty(&queue->nices[idx]));
+	BUG_ON(!test_bit(idx, queue->bitmap));
+	return list_entry(queue->nices[idx].next, struct nice_queue, list);
+}
+
+static task_t *ts_best(struct queue *__queue)
+{
+	struct ts_queue *queue = (struct ts_queue *)__queue;
+	struct nice_queue *nice = ts_best_nice(queue);
+	return nice ? nice_best(nice) : NULL;
+}
+
+static void nice_enqueue(struct nice_queue *queue, task_t *task)
+{
+	task_t *curr, *sav;
+	int queued = 0, idx, deadline, base, idxdiff;
+	check_nice(queue);
+	check_ts_policy(task);
+
+	/* don't livelock when queue->expired */
+	deadline = min(!!queue->expired + task_deadline(task), NICE_QLEN - 1);
+	idx = (queue->base + deadline) % NICE_QLEN;
+
+	if (!test_bit(idx, queue->bitmap)) {
+		BUG_ON(!list_empty(&queue->queue[idx]));
+		__set_bit(idx, queue->bitmap);
+	}
+
+#if 1
+	/* keep nice level's queue sorted -- use binomial heaps here soon */
+	list_for_each_entry_safe(curr, sav, &queue->queue[idx], sched_info.run_list) {
+		if (task->sched_info.cl_data.ts.frac_cpu
+				>= curr->sched_info.cl_data.ts.frac_cpu) {
+			list_add(&task->sched_info.run_list,
+					curr->sched_info.run_list.prev);
+			queued = 1;
+			break;
+		}
+	}
+	if (!queued)
+		list_add_tail(&task->sched_info.run_list, &queue->queue[idx]);
+#else
+	list_add_tail(&task->sched_info.run_list, &queue->queue[idx]);
+#endif
+	task->sched_info.idx = idx;
+	/* if (!task->sched_info.cl_data.ts.ticks) */
+		task->sched_info.cl_data.ts.ticks = rr_quantum;
+
+	if (queue->tasks)
+		BUG_ON(!queue->curr);
+	else {
+		BUG_ON(queue->curr);
+		queue->curr = task;
+	}
+	queue->tasks++;
+	check_nice(queue);
+	check_ts_policy(task);
+}
+
+static void ts_enqueue(struct queue *__queue, task_t *task)
+{
+	struct ts_queue *queue = (struct ts_queue *)__queue;
+	struct nice_queue *nice;
+
+	check_queue(queue);
+	check_ts_policy(task);
+	nice = &queue->nice_levels[task->sched_info.cl_data.ts.nice];
+	if (!nice->tasks) {
+		int idx = (queue->base + task->sched_info.cl_data.ts.nice) % 40;
+		if (!test_bit(idx, queue->bitmap)) {
+			BUG_ON(!list_empty(&queue->nices[idx]));
+			__set_bit(idx, queue->bitmap);
+		}
+		list_add_tail(&nice->list, &queue->nices[idx]);
+		nice->idx = idx;
+		if (!queue->curr)
+			queue->curr = nice;
+	}
+	nice_enqueue(nice, task);
+	queue->tasks++;
+	queue->base = find_first_circular_bit(queue->bitmap, queue->base, 40);
+	check_queue(queue);
+	check_ts_policy(task);
+}
+
+static void nice_dequeue(struct nice_queue *queue, task_t *task)
+{
+	check_nice(queue);
+	check_ts_policy(task);
+	list_del(&task->sched_info.run_list);
+	if (list_empty(&queue->queue[task->sched_info.idx])) {
+		BUG_ON(!test_bit(task->sched_info.idx, queue->bitmap));
+		__clear_bit(task->sched_info.idx, queue->bitmap);
+	}
+	queue->tasks--;
+	if (task == queue->curr) {
+		queue->curr = nice_best(queue);
+#if 0
+		if (queue->curr)
+			queue->base = queue->curr->sched_info.idx;
+#endif
+	}
+	check_nice(queue);
+	check_ts_policy(task);
+}
+
+static void ts_dequeue(struct queue *__queue, task_t *task)
+{
+	struct ts_queue *queue = (struct ts_queue *)__queue;
+	struct nice_queue *nice;
+
+	BUG_ON(!queue->tasks);
+	check_queue(queue);
+	check_ts_policy(task);
+	nice = &queue->nice_levels[task->sched_info.cl_data.ts.nice];
+
+	nice_dequeue(nice, task);
+	queue->tasks--;
+	if (!nice->tasks) {
+		list_del_init(&nice->list);
+		if (list_empty(&queue->nices[nice->idx])) {
+			BUG_ON(!test_bit(nice->idx, queue->bitmap));
+			__clear_bit(nice->idx, queue->bitmap);
+		}
+		if (nice == queue->curr)
+			queue->curr = ts_best_nice(queue);
+	}
+	queue->base = find_first_circular_bit(queue->bitmap, queue->base, 40);
+	if (queue->base >= 40)
+		queue->base = 0;
+	check_queue(queue);
+	check_ts_policy(task);
+}
+
+static int ts_tasks(struct queue *__queue)
+{
+	struct ts_queue *queue = (struct ts_queue *)__queue;
+	check_queue(queue);
+	return queue->tasks;
+}
+
+static int ts_nice(struct queue *__queue, task_t *task)
+{
+	int nice = task->sched_info.cl_data.ts.nice - 20;
+	check_ts_policy(task);
+	BUG_ON(nice < -20);
+	BUG_ON(nice >= 20);
+	return nice;
+}
+
+static void ts_renice(struct queue *queue, task_t *task, int nice)
+{
+	check_queue((struct ts_queue *)queue);
+	check_ts_policy(task);
+	BUG_ON(nice < -20);
+	BUG_ON(nice >= 20);
+	task->sched_info.cl_data.ts.nice = nice + 20;
+	check_queue((struct ts_queue *)queue);
+}
+
+static int nice_task_prio(struct nice_queue *nice, task_t *task)
+{
+	if (!task_queued(task))
+		return task_deadline(task);
+	else {
+		int prio = task->sched_info.idx - nice->base;
+		return prio < 0 ? prio + NICE_QLEN : prio;
+	}
+}
+
+static int ts_nice_prio(struct ts_queue *ts, struct nice_queue *nice)
+{
+	if (list_empty(&nice->list))
+		return (int)(nice - ts->nice_levels);
+	else {
+		int prio = nice->idx - ts->base;
+		return prio < 0 ? prio + 40 : prio;
+	}
+}
+
+/* 100% fake priority to report heuristics and the like */
+static int ts_prio(task_t *task)
+{
+	int policy_idx;
+	struct policy *policy;
+	struct ts_queue *ts;
+	struct nice_queue *nice;
+
+	policy_idx = task->sched_info.policy;
+	policy = per_cpu(runqueues, task_cpu(task)).policies[policy_idx];
+	ts = (struct ts_queue *)policy->queue;
+	nice = &ts->nice_levels[task->sched_info.cl_data.ts.nice];
+	return 40*ts_nice_prio(ts, nice) + nice_task_prio(nice, task);
+}
+
+static void ts_setprio(task_t *task, int prio)
+{
+}
+
+static void ts_start_wait(struct queue *__queue, task_t *task)
+{
+}
+
+static void ts_stop_wait(struct queue *__queue, task_t *task)
+{
+}
+
+static void ts_sleep(struct queue *__queue, task_t *task)
+{
+}
+
+static void ts_wake(struct queue *__queue, task_t *task)
+{
+}
+
+static int nice_preempt(struct nice_queue *queue, task_t *task)
+{
+	check_nice(queue);
+	check_ts_policy(task);
+	/* assume FB style preemption at wakeup */
+	if (!task_queued(task) || !queue->curr)
+		return 1;
+	else {
+		int delta_t, delta_q;
+		delta_t = (task->sched_info.idx - queue->base + NICE_QLEN)
+				% NICE_QLEN;
+		delta_q = (queue->curr->sched_info.idx - queue->base
+							+ NICE_QLEN)
+				% NICE_QLEN;
+		if (delta_t < delta_q)
+			return 1;
+		else if (task->sched_info.cl_data.ts.frac_cpu
+				< queue->curr->sched_info.cl_data.ts.frac_cpu)
+			return 1;
+		else
+			return 0;
+	}
+	check_nice(queue);
+}
+
+static int ts_preempt(struct queue *__queue, task_t *task)
+{
+	int curr_nice;
+	struct ts_queue *queue = (struct ts_queue *)__queue;
+	struct nice_queue *nice = queue->curr;
+
+	check_queue(queue);
+	check_ts_policy(task);
+	if (!queue->curr)
+		return 1;
+
+	curr_nice = (int)(nice - queue->nice_levels);
+
+	/* preempt when nice number is lower, or the above for matches */
+	if (task->sched_info.cl_data.ts.nice != curr_nice)
+		return task->sched_info.cl_data.ts.nice < curr_nice;
+	else
+		return nice_preempt(nice, task);
+}
+
+static struct queue_ops ts_ops = {
+	.init		= ts_init,
+	.fini		= nop_fini,
+	.tick		= ts_tick,
+	.yield		= ts_yield,
+	.curr		= ts_curr,
+	.set_curr	= ts_set_curr,
+	.tasks		= ts_tasks,
+	.best		= ts_best,
+	.enqueue	= ts_enqueue,
+	.dequeue	= ts_dequeue,
+	.start_wait	= ts_start_wait,
+	.stop_wait	= ts_stop_wait,
+	.sleep		= ts_sleep,
+	.wake		= ts_wake,
+	.preempt	= ts_preempt,
+	.nice		= ts_nice,
+	.renice		= ts_renice,
+	.prio		= ts_prio,
+	.setprio	= ts_setprio,
+	.timeslice	= nop_timeslice,
+	.set_timeslice	= nop_set_timeslice,
+};
+
+struct policy ts_policy = {
+	.ops	= &ts_ops,
+};
diff -prauN linux-2.6.0-test11/kernel/sched/util.c sched-2.6.0-test11-5/kernel/sched/util.c
--- linux-2.6.0-test11/kernel/sched/util.c	1969-12-31 16:00:00.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/sched/util.c	2003-12-19 08:43:20.000000000 -0800
@@ -0,0 +1,37 @@
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <asm/page.h>
+#include "queue.h"
+
+int find_first_circular_bit(unsigned long *addr, int start, int end)
+{
+	int bit = find_next_bit(addr, end, start);
+	if (bit < end)
+		return bit;
+	bit = find_first_bit(addr, start);
+	if (bit < start)
+		return bit;
+	return end;
+}
+
+void queue_nop(struct queue *queue, task_t *task)
+{
+}
+
+void nop_renice(struct queue *queue, task_t *task, int nice)
+{
+}
+
+void nop_fini(struct policy *policy, int cpu)
+{
+}
+
+unsigned long nop_timeslice(struct queue *queue, task_t *task)
+{
+	return 0;
+}
+
+void nop_set_timeslice(struct queue *queue, task_t *task, unsigned long n)
+{
+}
diff -prauN linux-2.6.0-test11/kernel/sched.c sched-2.6.0-test11-5/kernel/sched.c
--- linux-2.6.0-test11/kernel/sched.c	2003-11-26 12:45:17.000000000 -0800
+++ sched-2.6.0-test11-5/kernel/sched.c	2003-12-21 06:06:32.000000000 -0800
@@ -15,6 +15,8 @@
  *		and per-CPU runqueues.  Cleanups and useful suggestions
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
+ *  2003-12-17	Total rewrite and generalized scheduler policies
+ *		by William Irwin.
  */
 
 #include <linux/mm.h>
@@ -38,6 +40,8 @@
 #include <linux/cpu.h>
 #include <linux/percpu.h>
 
+#include "sched/queue.h"
+
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
 #else
@@ -45,181 +49,79 @@
 #endif
 
 /*
- * Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- * and back.
- */
-#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
-#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
-#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
-
-/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
-#define AVG_TIMESLICE	(MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
-			(MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
-
-/*
- * Some helpers for converting nanosecond timing to jiffy resolution
- */
-#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
-#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
-
-/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
- * maximum timeslice is 200 msecs. Timeslices get refilled after
- * they expire.
- */
-#define MIN_TIMESLICE		( 10 * HZ / 1000)
-#define MAX_TIMESLICE		(200 * HZ / 1000)
-#define ON_RUNQUEUE_WEIGHT	30
-#define CHILD_PENALTY		95
-#define PARENT_PENALTY		100
-#define EXIT_WEIGHT		3
-#define PRIO_BONUS_RATIO	25
-#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
-#define INTERACTIVE_DELTA	2
-#define MAX_SLEEP_AVG		(AVG_TIMESLICE * MAX_BONUS)
-#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
-#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
-#define NODE_THRESHOLD		125
-#define CREDIT_LIMIT		100
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- *  priority range a task can explore, a value of '1' means the
- *  task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define CURRENT_BONUS(p) \
-	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
-		MAX_SLEEP_AVG)
-
-#ifdef CONFIG_SMP
-#define TIMESLICE_GRANULARITY(p)	(MIN_TIMESLICE * \
-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
-			num_online_cpus())
-#else
-#define TIMESLICE_GRANULARITY(p)	(MIN_TIMESLICE * \
-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
-#endif
-
-#define SCALE(v1,v1_max,v2_max) \
-	(v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
-	(SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
-		INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
-	((p)->prio <= (p)->static_prio - DELTA(p))
-
-#define JUST_INTERACTIVE_SLEEP(p) \
-	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
-		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
-
-#define HIGH_CREDIT(p) \
-	((p)->interactive_credit > CREDIT_LIMIT)
-
-#define LOW_CREDIT(p) \
-	((p)->interactive_credit < -CREDIT_LIMIT)
-
-#define TASK_PREEMPTS_CURR(p, rq) \
-	((p)->prio < (rq)->curr->prio)
-
-/*
- * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- *
- * task_timeslice() is the interface that is used by the scheduler.
- */
-
-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
-	((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
-
-static inline unsigned int task_timeslice(task_t *p)
-{
-	return BASE_TIMESLICE(p);
-}
-
-/*
- * These are the runqueue data structures:
- */
-
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
-typedef struct runqueue runqueue_t;
-
-struct prio_array {
-	int nr_active;
-	unsigned long bitmap[BITMAP_SIZE];
-	struct list_head queue[MAX_PRIO];
-};
-
-/*
  * This is the main, per-CPU runqueue data structure.
  *
  * Locking rule: those places that want to lock multiple runqueues
  * (such as the load balancing or the thread migration code), lock
  * acquire operations must be ordered by ascending &runqueue.
  */
-struct runqueue {
-	spinlock_t lock;
-	unsigned long nr_running, nr_switches, expired_timestamp,
-			nr_uninterruptible;
-	task_t *curr, *idle;
-	struct mm_struct *prev_mm;
-	prio_array_t *active, *expired, arrays[2];
-	int prev_cpu_load[NR_CPUS];
-#ifdef CONFIG_NUMA
-	atomic_t *node_nr_running;
-	int prev_node_load[MAX_NUMNODES];
-#endif
-	task_t *migration_thread;
-	struct list_head migration_queue;
+DEFINE_PER_CPU(struct runqueue, runqueues);
 
-	atomic_t nr_iowait;
+struct policy *policies[] = {
+	&rt_policy,
+	&ts_policy,
+	&batch_policy,
+	&idle_policy,
+	NULL,
 };
 
-static DEFINE_PER_CPU(struct runqueue, runqueues);
-
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		(&__get_cpu_var(runqueues))
 #define task_rq(p)		cpu_rq(task_cpu(p))
-#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+#define rq_curr(rq)		(rq)->__curr
+#define cpu_curr(cpu)		rq_curr(cpu_rq(cpu))
+
+static inline struct policy *task_policy(task_t *task)
+{
+	unsigned long idx;
+	struct policy *policy;
+	idx = task->sched_info.policy;
+	__check_task_policy(idx);
+	policy = task_rq(task)->policies[idx];
+	check_policy(policy);
+	return policy;
+}
+
+static inline struct policy *rq_policy(runqueue_t *rq)
+{
+	unsigned long idx;
+	task_t *task;
+	struct policy *policy;
+
+	task = rq_curr(rq);
+	BUG_ON(!task);
+	BUG_ON((unsigned long)task < PAGE_OFFSET);
+	idx = task->sched_info.policy;
+	__check_task_policy(idx);
+	policy = rq->policies[idx];
+	check_policy(policy);
+	return policy;
+}
+
+static int __task_nice(task_t *task)
+{
+	struct policy *policy = task_policy(task);
+	return policy->ops->nice(policy->queue, task);
+}
+
+static inline void set_rq_curr(runqueue_t *rq, task_t *task)
+{
+	rq->curr = task->sched_info.policy;
+	__check_task_policy(rq->curr);
+	rq->__curr = task;
+}
+
+static inline int task_preempts_curr(task_t *task, runqueue_t *rq)
+{
+	check_task_policy(rq_curr(rq));
+	check_task_policy(task);
+	if (rq_curr(rq)->sched_info.policy != task->sched_info.policy)
+		return task->sched_info.policy < rq_curr(rq)->sched_info.policy;
+	else {
+		struct policy *policy = rq_policy(rq);
+		return policy->ops->preempt(policy->queue, task);
+	}
+}
 
 /*
  * Default context-switch locking:
@@ -227,7 +129,7 @@ static DEFINE_PER_CPU(struct runqueue, r
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(rq, next)	do { } while(0)
 # define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)		((rq)->curr == (p))
+# define task_running(rq, p)		(rq_curr(rq) == (p))
 #endif
 
 #ifdef CONFIG_NUMA
@@ -320,53 +222,32 @@ static inline void rq_unlock(runqueue_t 
 }
 
 /*
- * Adding/removing a task to/from a priority array:
+ * Adding/removing a task to/from a policy's queue.
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Fri Apr 13, 2:21 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., Michal Piotrowski, (Fri Apr 13, 2:57 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Fri Apr 13, 3:21 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Fri Apr 13, 4:30 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Fri Apr 13, 4:58 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., Eric W. Biederman, (Sat Apr 14, 10:15 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., Eric W. Biederman, (Sat Apr 14, 10:44 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., Eric W. Biederman, (Sat Apr 14, 11:40 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Sat Apr 14, 12:48 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Sat Apr 14, 9:01 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Sun Apr 15, 8:26 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Sun Apr 15, 8:47 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Sun Apr 15, 9:13 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Sun Apr 15, 9:25 am)
Re: Kaffeine problem with CFS, Christoph Pfister, (Sun Apr 15, 9:55 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., Jonathan Lundell, (Sun Apr 15, 12:00 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Sun Apr 15, 12:35 pm)
Re: Kaffeine problem with CFS, S.Çağlar, (Sun Apr 15, 3:14 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Sun Apr 15, 4:39 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Sun Apr 15, 4:54 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Sun Apr 15, 8:04 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Mon Apr 16, 4:04 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Mon Apr 16, 6:46 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Mon Apr 16, 8:45 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Mon Apr 16, 9:13 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., Michael K. Edwards, (Mon Apr 16, 4:10 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Mon Apr 16, 11:09 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Mon Apr 16, 11:14 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Mon Apr 16, 11:26 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Tue Apr 17, 12:09 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Tue Apr 17, 1:23 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Tue Apr 17, 1:24 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Tue Apr 17, 2:05 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Tue Apr 17, 2:57 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Tue Apr 17, 4:31 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Tue Apr 17, 1:05 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Tue Apr 17, 3:32 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., Michael K. Edwards, (Tue Apr 17, 4:00 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Tue Apr 17, 4:07 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., Michael K. Edwards, (Tue Apr 17, 4:52 pm)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 1:27 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 1:57 am)
Re: Kaffeine problem with CFS, Christoph Pfister, (Wed Apr 18, 1:57 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 2:01 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 2:06 am)
Re: Kaffeine problem with CFS, Mike Galbraith, (Wed Apr 18, 2:12 am)
Re: Kaffeine problem with CFS, Christoph Pfister, (Wed Apr 18, 2:13 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 2:17 am)
Re: Kaffeine problem with CFS, Christoph Pfister, (Wed Apr 18, 2:25 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 2:28 am)
Re: Kaffeine problem with CFS, Christoph Pfister, (Wed Apr 18, 2:52 am)
Re: Kaffeine problem with CFS, Christoph Pfister, (Wed Apr 18, 3:04 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 3:17 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 3:32 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 3:37 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 3:49 am)
Re: Kaffeine problem with CFS, Ingo Molnar, (Wed Apr 18, 3:53 am)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Wed Apr 18, 6:08 am)
Re: CFS and suspend2: hang in atomic copy, Christian Hesse, (Wed Apr 18, 1:45 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Thu Apr 19, 12:57 am)
Renice X for cpu schedulers, Con Kolivas, (Thu Apr 19, 4:59 am)
Re: Renice X for cpu schedulers, Peter Williams, (Thu Apr 19, 5:42 am)
Re: Renice X for cpu schedulers, Mark Lord, (Thu Apr 19, 6:17 am)
Re: Renice X for cpu schedulers, Con Kolivas, (Thu Apr 19, 8:10 am)
Re: Renice X for cpu schedulers, Mark Lord, (Thu Apr 19, 9:15 am)
Re: Renice X for cpu schedulers, Gene Heskett, (Thu Apr 19, 11:16 am)
Re: Renice X for cpu schedulers, Gene Heskett, (Thu Apr 19, 11:21 am)
Re: Renice X for cpu schedulers, Ray Lee, (Thu Apr 19, 12:26 pm)
Re: Renice X for cpu schedulers, Michael K. Edwards, (Thu Apr 19, 2:35 pm)
Re: Renice X for cpu schedulers, Con Kolivas, (Thu Apr 19, 3:47 pm)
Re: Renice X for cpu schedulers, Con Kolivas, (Thu Apr 19, 3:56 pm)
Re: Renice X for cpu schedulers, Con Kolivas, (Thu Apr 19, 5:17 pm)
Re: Renice X for cpu schedulers, Michael K. Edwards, (Thu Apr 19, 5:20 pm)
Re: Renice X for cpu schedulers, Ray Lee, (Thu Apr 19, 5:56 pm)
Re: Renice X for cpu schedulers, Ed Tomlinson, (Thu Apr 19, 6:17 pm)
Re: Renice X for cpu schedulers, Linus Torvalds, (Thu Apr 19, 6:27 pm)
Re: Renice X for cpu schedulers, Gene Heskett, (Thu Apr 19, 7:00 pm)
Re: Renice X for cpu schedulers, Gene Heskett, (Thu Apr 19, 7:01 pm)
Re: Renice X for cpu schedulers, Nick Piggin, (Thu Apr 19, 8:57 pm)
Re: Renice X for cpu schedulers, Nick Piggin, (Thu Apr 19, 9:09 pm)
Re: Renice X for cpu schedulers, Mike Galbraith, (Thu Apr 19, 10:24 pm)
Re: [Announce] [patch] Modular Scheduler Core and Complete ..., William Lee Irwin III, (Thu Apr 19, 10:26 pm)
Re: Renice X for cpu schedulers, Bill Huey, (Thu Apr 19, 10:34 pm)
Re: Renice X for cpu schedulers, Mark Lord, (Sat Apr 21, 7:55 am)
Re: Renice X for cpu schedulers, Mark Lord, (Sun Apr 22, 5:54 am)
Re: Renice X for cpu schedulers, Con Kolivas, (Sun Apr 22, 5:58 am)
Re: Renice X for cpu schedulers, Ray Lee, (Tue Apr 24, 8:50 am)
Re: Renice X for cpu schedulers, Matt Mackall, (Tue Apr 24, 9:23 am)