This series is a follow-on to the nanosecond select/poll series. The goal of this series is to introduce the capability into hrtimers to deal with a "range" rather than a specific point in time. (Several people discussed this recently, but we've been toying with the concept for a while) In addition, in the last patch of the series, the patches make select() and poll() use these range timers with a standard "slack" that comes from 1) a per process task_struct value 2) a "the longer the sleep the more the slack" function that Linus wrote --
From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] hrtimer: rename the "expires" struct member to avoid accidental usage
To catch code that still touches the "expires" memory directly, rename it
to have the compiler complain rather than get nasty, hard to explain,
runtime behavior
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
include/linux/hrtimer.h | 20 ++++++++++----------
1 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 9900e99..485a634 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -111,7 +111,7 @@ enum hrtimer_cb_mode {
*/
struct hrtimer {
struct rb_node node;
- ktime_t expires;
+ ktime_t _expires;
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
unsigned long state;
@@ -219,41 +219,41 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
- timer->expires = time;
+ timer->_expires = time;
}
static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
{
- timer->expires.tv64 = tv64;
+ timer->_expires.tv64 = tv64;
}
static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
{
- timer->expires = ktime_add_safe(timer->expires, time);
+ timer->_expires = ktime_add_safe(timer->_expires, time);
}
static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
{
- timer->expires = ktime_add_ns(timer->expires, ns);
+ timer->_expires = ktime_add_ns(timer->_expires, ns);
}
static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
{
- return timer->expires;
+ return timer->_expires;
}
static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
{
- return timer->expires.tv64;
+ return timer->_expires.tv64;
}
static inline s64 hrtimer_get_expires_ns(const struct hrtimer ...From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] hrtimer: add abstraction functions for accessing the "expires" member
In order to be able to turn hrtimers into range based, we need to provide
accessor functions for getting to the "expires" ktime_t member of the
struct hrtimer.
This patch adds a set of accessors for this purpose:
* hrtimer_set_expires
* hrtimer_set_expires_tv64
* hrtimer_add_expires
* hrtimer_add_expires_ns
* hrtimer_get_expires
* hrtimer_get_expires_tv64
* hrtimer_get_expires_ns
* hrtimer_expires_remaining
* hrtimer_start_expires
No users of these new accessors are added yet; these follow in later patches.
Hopefully this patch can even go into 2.6.27-rc so that the conversions will
not have a bottleneck in -next
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
include/linux/hrtimer.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 45 insertions(+), 0 deletions(-)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index becd17d..9900e99 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -217,6 +217,45 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
return timer->base->cpu_base->hres_active;
}
+static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
+{
+ timer->expires = time;
+}
+static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
+{
+ timer->expires.tv64 = tv64;
+}
+
+static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
+{
+ timer->expires = ktime_add_safe(timer->expires, time);
+}
+
+static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
+{
+ timer->expires = ktime_add_ns(timer->expires, ns);
+}
+
+static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
+{
+ return timer->expires;
+}
+
+static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
+{
+ return timer->expires.tv64;
+}
+
+static inline ...From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] hrtimer: turn hrtimers into range timers
this patch turns hrtimers into range timers; they have 2 expire points
1) the soft expire point
2) the hard expire point
the kernel will do its regular best effort attempt to get the timer run
at the hard expire point. However, if some other timer fires after the soft
expire point, the kernel now has the freedom to fire this timer at this point,
and thus grouping the events and preventing a power-expensive wakeup in the
future.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
include/linux/hrtimer.h | 31 ++++++++++++++++++++++++++++++-
kernel/hrtimer.c | 43 +++++++++++++++++++++++++++++++++++++++----
2 files changed, 69 insertions(+), 5 deletions(-)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 485a634..c26b1a5 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -112,6 +112,7 @@ enum hrtimer_cb_mode {
struct hrtimer {
struct rb_node node;
ktime_t _expires;
+ ktime_t _softexpires;
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
unsigned long state;
@@ -220,20 +221,37 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
timer->_expires = time;
+ timer->_softexpires = time;
}
+
+static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
+{
+ timer->_softexpires = time;
+ timer->_expires = ktime_add_safe(time, delta);
+}
+
+static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
+{
+ timer->_softexpires = time;
+ timer->_expires = ktime_add_ns(time, delta);
+}
+
static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
{
timer->_expires.tv64 = tv64;
+ timer->_softexpires.tv64 = tv64;
}
static inline void ...Somehow the function is called softexpires, but returns the hard expire I might be missing something, but this code only looks at the leftmost timer, and we're indexed on the hard expire time, which might be rather far to the right of here. This means that esp for those timers for which we can save most we're least likely to do so because we'll plain not see them. --
What you need is a data structure that supports stabbing queries on overlapping intervals, such like a Priority Search Tree. If I'm not mistaken, then the augmented Red-Black tree from the EEVDF paper is identical to PST [*]. This data-structure adds a Heap property to each RB-node, allowing one to search the tree on a different property. So what you can do in this case, is index the RB-tree on the soft expire, and index the heap on the hard expire. Then you can find the leftmost hard expire by traversing the tree using the heap property - and program the clock-event using that time. And you can search for soft expired entries using the RB-tree like we do now. [*] Fabio implemented it on top of the linux RB-tree for their wf2q+ implementation that they used for their BFQ I/O scheduler: http://feanor.sssup.it/~fabio/linux/wfq/ And I borrowed their implementation for my scheduler work: http://programming.kicks-ass.net/kernel-patches/sched-eevdf/sched-eedf.patch --
Even better, in the implementations below, the leftmost heap property can be read from the root node, so if, as with the clock event, you don't actually need the entry itself, but just the time, you can find it by reading the heap property of the root node. Which saves a whole log(n) tree traversal ;-) Same goes for reprogramming the clock event on insert and delete, just --
for perfection we'd need that, for a simple good we don't; timers are also performance critical so whatever we do can't be expensive in any way... the current method isn't any more expensive. -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
On Tue, 02 Sep 2008 10:22:12 +0200 you're missing a little detail ;) yes we start from left to right, and we stop once we find a timer that we can't fire anymore. The thing that you missed is that any timer after that (even if we could fire it now) will just be fired when the timer we stopped on fires.. so it'll still group them around those timers that are otherwise ungroupable. -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Gah, right. How about adding the following: /* * The immediate goal is minimizing wakeups, not running * timers at the earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search Tree, * which can answer a stabbing query for overlapping * intervals and instead use the simple BST we already have. * We don't add extra wakeups by delaying timers that are * right-of a not yet expired timer, because that timer will * have to trigger a wakeup anyway. */ --
On Tue, 02 Sep 2008 15:47:07 +0200 fair enough; I'll merge this in (I'm not going to resend the whole series just for this redo though... maybe when there's more feedback I'll roll it all up) thanks! -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] hrtimer: convert kvm to the new hrtimer apis
In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts KVM to these accessors.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
arch/x86/kvm/i8254.c | 6 +++---
arch/x86/kvm/lapic.c | 6 ++----
2 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872..1bf8f57 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -205,8 +205,8 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
wake_up_interruptible(&vcpu0->wq);
}
- pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
- pt->scheduled = ktime_to_ns(pt->timer.expires);
+ hrtimer_add_expires_ns(&pt->timer, pt->period);
+ pt->scheduled = ktime_to_ns(hrtimer_get_expires(&pt->timer));
return (pt->period == 0 ? 0 : 1);
}
@@ -246,7 +246,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
timer = &pit->pit_state.pit_timer.timer;
if (hrtimer_cancel(timer))
- hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
static void destroy_pit_timer(struct kvm_kpit_timer *pt)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de..a5b61de 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -953,9 +953,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
}
if (apic_lvtt_period(apic)) {
result = 1;
- apic->timer.dev.expires = ktime_add_ns(
- apic->timer.dev.expires,
- apic->timer.period);
+ hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
}
return result;
}
@@ -1124,7 +1122,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
timer = &apic->timer.dev;
if (hrtimer_cancel(timer))
- hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, ...From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] hrtimer: create a "timer_slack" field in the task struct
We want to be able to control the default "rounding" that is used by
select() and poll() and friends. This is a per process property
(so that we can have a "nice" like program to start certain programs with
a looser or stricter rounding) that can be set/get via a prctl().
For this purpose, a field called "timer_slack_ns" is added to the task
struct. In addition, a field called "default_timer_slack_ns" is added
so that tasks easily can temporarily switch to a more/less accurate slack and then
back to the default.
The default value of the slack is set to 50 usec; this is significantly less
than 2.6.27's average select() and poll() timing error but still allows
the kernel to group timers somewhat to preserve power behavior. Applications
and admins can override this via the prctl()
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
include/linux/init_task.h | 1 +
include/linux/prctl.h | 7 +++++++
include/linux/sched.h | 6 ++++++
kernel/fork.c | 2 ++
kernel/sys.c | 10 ++++++++++
5 files changed, 26 insertions(+), 0 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 021d8e7..23fd890 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -170,6 +170,7 @@ extern struct group_info init_groups;
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.fs_excl = ATOMIC_INIT(0), \
.pi_lock = __SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
+ .timer_slack_ns = 50000, /* 50 usec default slack */ \
.pids = { \
[PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 5ad7919..48d887e 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -78,4 +78,11 @@
#define PR_GET_SECUREBITS 27
#define PR_SET_SECUREBITS 28
+/*
+ * ...Is this a good idea? IMO it should be per-syscall, not per application. Threads would certainly like private values... and this makes really ugly interface. ...plus it bloats task struct. ...where did the sys_indirect proposals go? We created new syscalls, right? IMO we should create new syscalls here, too. -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
On Tue, 2 Sep 2008 12:04:39 +0200 Yes it would be nice to have new syscalls for this and no, nobody and nothing would use them. The really big advantage of this default-from-task-struct is that you can have a program similar to "nice" that allows you to run an existing program at a specified granularity (say, a version of acroread that has ... which nobody uses today. It's not just new syscalls, it's a new glibc api as well at that point. -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
...and new applications, yes. I believe applications should explicitly enable slacking timers. Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
On Mon, 8 Sep 2008 15:27:16 +0200 timers are slacking today, at least for select() and poll(), and are a great deal more so than the defaults in this patchkit. The great advantage of the prctl() approach (which is usable) over new system calls and glibc APIs is that it will get used, because the admin can use it just like he uses the "nice" command, on existing software. -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Yes, it is a great advantage, but it feels like a hack. Maybe it is better done with LD_PRELOAD or something? I'd certainly want the applications to specify slack themselves in like 10 years. Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
On Mon, 8 Sep 2008 16:15:55 +0200 that's not working very well in general, and doesn't work across exec We've been talking to Ulrich to figure out what the right API for this is (eg how to extend select/poll for this) and I still plan to work on this, but both Linus and Ulrich seem to be very adamant that it means only few apps will use it (and I agree with that, just look at how many apps use linux specific APIs such as sendfile(), linux AIO etc... ) -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
LD_PRELOAD should work over exec... Another possibility would be to declare new syscalls, and have glibc automatically use select_slack(, foo) if GLIBC_SLACK=foo. Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
LD_PRELOAD is not a solution. LD_PRELOAD always has been and always will be a hack. You use it to work around problems or to test something. Nothing else. LD_PRELOAD and other variables are ignored in security-relevant contexts and environments are cleared in many situations. Sure, you could use /etc/ld.so.preload but that works around only one problem. Furthermore, there is a significant cost associated with preloading. There are additional files to be loaded and it disables prelinking. The prctl() way plus a default non-zero value is the best way for legacy apps. And you'll hopefully get your wish that apps will take fate into their own hand by specifying the slack themselves. Arjan's proposal also introduces new poll/select-like interface which take the additional slack value (at least that's what we discussed before). I'm strongly opposed to using LD_PRELOAD. And I think requiring the libc implementation of select/ poll, ... etc to wrap around the new interfaces which take the slack and determine the slack at userlevel (by reading some file) is too expensive. It's one little value per process (group) to be kept by the kernel. That's not much. --
On Sun, 14 Sep 2008 08:21:26 -0700 I have not posted the code for this yet (the patch set is huge already :0) but yes it's going to happen. You and I do need to figure out what a sensible interface will be for these ;-) -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
...but that's okay, right? You would not want passwd to inherit huge Well, it is not too much, but... is the cost for userspace really significant? You'd clearly want it stored in environment, not filesystem... Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
No, it's not OK. There are enough apps which are privileged and need to be handled this way. Take the X server, for instance. You cannot really use the environment for anything meaningful. Especially for this case, you couldn't change the setting for a running process. What a fully-userlevel implementation would have to do is read the value from a file and monitor the file for changes for every new poll/select call. That's a huge cost. --
On Sun, 14 Sep 2008 09:04:08 -0700 in addition, the value really is per thread, not per process, and how do you want to do that with env. variables? -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
_Need_ to be handled? They are not handled that way today, and it still seems to work ok. (Plus X is no longer setuid on new distros...) So -- how do you prevent user from setting excessively high slack and Is this important enough to warrant setting for already-running processes? I don't think so... Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
Hi Arjan, Sorry for the _really_ late response. I recently found this patch in linux-next. In general, I like this patch. I wonder why decreasing PR_SET_TIMERSLACK doesn't need root privilege. For example, the nice() system call is - nice increasing (priority decreasing) doesn't need root privilege. - nice decreasing (priority increasing) needs root privilege. So, I think the time slack setting needs a similar one. Otherwise, a non-privileged user can increase power consumption easily by PR_SET_TIMERSLACK. What do you think? --
On Tue, 30 Sep 2008 14:16:09 +0900 (JST) setting timerslack to 0 has no real negative effects on the system on the one hand, on the other hand, it'll be multimedia apps and games who want to do this. Requiring this type of app to be root doesn't sound like a good idea, especially since all you get by "cheating" is ... the exact behavior you ask for anyway. "Increased power consumption" isn't a root privilege, the app can consume much more power just by a "while (1);" loop for example. -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Hi Arjan, Right. But I worry that an end user can't find an application which causes large power consumption. Many laptop users think battery life is really really important. An end user can find a "while(1)" app easily by the top command, but they can't find a timerslack==0 app easily. So, I can drop my proposal, but I hope you explain your expected end user usages. --
From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] hrtimer: convert timerfd to the new hrtimer apis
In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts timerfd to these accessors.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
fs/timerfd.c | 8 +++-----
1 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index c502c60..0862f0e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -52,11 +52,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
{
- ktime_t now, remaining;
-
- now = ctx->tmr.base->get_time();
- remaining = ktime_sub(ctx->tmr.expires, now);
+ ktime_t remaining;
+ remaining = hrtimer_expires_remaining(&ctx->tmr);
return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
}
@@ -74,7 +72,7 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
ctx->ticks = 0;
ctx->tintv = timespec_to_ktime(ktmr->it_interval);
hrtimer_init(&ctx->tmr, ctx->clockid, htmode);
- ctx->tmr.expires = texp;
+ hrtimer_set_expires(&ctx->tmr, texp);
ctx->tmr.function = timerfd_tmrproc;
if (texp.tv64 != 0)
hrtimer_start(&ctx->tmr, texp, htmode);
--
1.5.5.1
--
From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] hrtimer: make select() and poll() use the hrtimer range feature
This patch makes the select() and poll() hrtimers use the new range
feature and settings from the task struct.
In addition, this includes the estimate_accuracy() function that Linus
posted to lkml (but with a few steps added based on experiments).
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
fs/select.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 66 insertions(+), 2 deletions(-)
diff --git a/fs/select.c b/fs/select.c
index f6dceb5..21bf77d 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -28,6 +28,62 @@
#include <asm/uaccess.h>
+
+/* Estimate expected accuracy in ns from a timeval */
+
+static unsigned long __estimate_accuracy(struct timespec *tv)
+{
+ /*
+ * Tens of ms if we're looking at seconds, even
+ * more for 10s+ sleeping
+ */
+ if (tv->tv_sec) {
+ /* 100 milliseconds for long sleeps */
+ if (tv->tv_sec > 10)
+ return 100 * NSEC_PER_MSEC;
+
+ /*
+ * Tens of ms for second-granularity sleeps. This,
+ * btw, is the historical Linux 100Hz timer range.
+ */
+ return 10 * NSEC_PER_MSEC;
+ }
+
+ /* 5 msec if we're looking at 100+ milliseconds */
+ if (tv->tv_nsec > 100 * NSEC_PER_MSEC)
+ return 5 * NSEC_PER_MSEC;
+
+ /* A msec if we're looking at 10+ milliseconds */
+ if (tv->tv_nsec > 10 * NSEC_PER_MSEC)
+ return NSEC_PER_MSEC;
+
+ /* half a msec if we're looking at milliseconds */
+ if (tv->tv_nsec > NSEC_PER_MSEC)
+ return NSEC_PER_MSEC/2;
+
+ /* Single usecs if we're looking at microseconds */
+ if (tv->tv_nsec > NSEC_PER_USEC)
+ return NSEC_PER_USEC;
+
+ /* Aim for tenths of nanosecs otherwise */
+ return 10;
+}
+
+static unsigned long estimate_accuracy(struct timespec *tv)
+{
+ unsigned long ret;
+ struct timespec now;
+
+ ktime_get_ts(&now);
+ now = timespec_sub(*tv, now);
+ ret = __estimate_accuracy(&now);
+ if (ret < ...Why not use a simple logarithmic decay to drive this estimate? --
On Tue, 02 Sep 2008 10:22:20 +0200 or just take a 0.1% of the time ;) Linus wrote the original, who am I to argue? (and arguably, it doesn't really matter much, the current code is nice and simple enough) -- If you want to reach me at my work email, use arjan@linux.intel.com For development, discussion and tips for power savings, visit http://www.lesswatts.org --
From: Arjan van de Ven <arjan@linux.intel.com> Subject: [PATCH] hrtimer: convert net::sched_cbq to the new hrtimer apis In order to be able to do range hrtimers we need to use accessor functions to the "expire" member of the hrtimer struct. This patch converts sched_cbq to these accessors. Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> --- net/sched/sch_cbq.c | 7 ++++--- 1 files changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 9b720ad..0fa7270 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -545,9 +545,10 @@ static void cbq_ovl_delay(struct cbq_class *cl) expires = ktime_set(0, 0); expires = ktime_add_ns(expires, PSCHED_US2NS(sched)); if (hrtimer_try_to_cancel(&q->delay_timer) && - ktime_to_ns(ktime_sub(q->delay_timer.expires, - expires)) > 0) - q->delay_timer.expires = expires; + ktime_to_ns(ktime_sub( + hrtimer_get_expires(&q->delay_timer), + expires)) > 0) + hrtimer_set_expires(&q->delay_timer, expires); hrtimer_restart(&q->delay_timer); cl->delayed = 1; cl->xstats.overactions++; -- 1.5.5.1 --
From: Arjan van de Ven <arjan@linux.intel.com> Subject: [PATCH] hrtimer: convert kernel/* to the new hrtimer apis In order to be able to do range hrtimers we need to use accessor functions to the "expire" member of the hrtimer struct. This patch converts kernel/* to these accessors. Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> --- kernel/futex.c | 7 +++---- kernel/hrtimer.c | 44 +++++++++++++++++++++++--------------------- kernel/posix-timers.c | 10 ++++------ kernel/rtmutex.c | 3 +-- kernel/sched.c | 7 +++---- kernel/time/ntp.c | 3 +-- kernel/time/tick-sched.c | 21 ++++++++++----------- kernel/time/timer_list.c | 4 ++-- 8 files changed, 47 insertions(+), 52 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 7d1136e..4cd5b43 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1299,10 +1299,9 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); - t.timer.expires = *abs_time; + hrtimer_set_expires(&t.timer, *abs_time); - hrtimer_start(&t.timer, t.timer.expires, - HRTIMER_MODE_ABS); + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); if (!hrtimer_active(&t.timer)) t.task = NULL; @@ -1404,7 +1403,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); - to->timer.expires = *time; + hrtimer_set_expires(&to->timer, *time); } q.pi_state = NULL; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 782137d..ae307fe 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -517,7 +517,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) if (!base->first) continue; timer = rb_entry(base->first, struct hrtimer, ...
From: Arjan van de Ven <arjan@linux.intel.com> Subject: [PATCH] hrtimer: convert powerpc/oprofile to the new hrtimer apis In order to be able to do range hrtimers we need to use accessor functions to the "expire" member of the hrtimer struct. This patch converts powerpc/oprofile to these accessors. Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> --- arch/powerpc/oprofile/cell/spu_profiler.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c index 380d7e2..02ffe06 100644 --- a/arch/powerpc/oprofile/cell/spu_profiler.c +++ b/arch/powerpc/oprofile/cell/spu_profiler.c @@ -196,7 +196,7 @@ int start_spu_profiling(unsigned int cycles_reset) pr_debug("timer resolution: %lu\n", TICK_NSEC); kt = ktime_set(0, profiling_interval); hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - timer.expires = kt; + hrtimer_set_expires(&timer, kt); timer.function = profile_spus; /* Allocate arrays for collecting SPU PC samples */ -- 1.5.5.1 --
From: Arjan van de Ven <arjan@linux.intel.com> Subject: [PATCH] hrtimer: convert kvm-ia64 to the new hrtimer apis In order to be able to do range hrtimers we need to use accessor functions to the "expire" member of the hrtimer struct. This patch converts KVM-ia64 to these accessors. Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> --- arch/ia64/kvm/kvm-ia64.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 7a37d06..cf8eae1 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1112,7 +1112,7 @@ static void kvm_migrate_hlt_timer(struct kvm_vcpu *vcpu) struct hrtimer *p_ht = &vcpu->arch.hlt_timer; if (hrtimer_cancel(p_ht)) - hrtimer_start(p_ht, p_ht->expires, HRTIMER_MODE_ABS); + hrtimer_start_expires(p_ht, HRTIMER_MODE_ABS); } static enum hrtimer_restart hlt_timer_fn(struct hrtimer *data) -- 1.5.5.1 --
From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] hrtimer: convert s390 to the new hrtimer apis
In order to be able to do range hrtimers we need to use accessor functions
to the "expire" member of the hrtimer struct.
This patch converts s390 to these accessors.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
drivers/s390/crypto/ap_bus.c | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index 62b6b55..6f02f1e 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -659,9 +659,9 @@ static ssize_t poll_timeout_store(struct bus_type *bus, const char *buf,
hr_time = ktime_set(0, poll_timeout);
if (!hrtimer_is_queued(&ap_poll_timer) ||
- !hrtimer_forward(&ap_poll_timer, ap_poll_timer.expires, hr_time)) {
- ap_poll_timer.expires = hr_time;
- hrtimer_start(&ap_poll_timer, hr_time, HRTIMER_MODE_ABS);
+ !hrtimer_forward(&ap_poll_timer, hrtimer_get_expires(&ap_poll_timer), hr_time)) {
+ hrtimer_set_expires(&ap_poll_timer, hr_time);
+ hrtimer_start_expires(&ap_poll_timer, HRTIMER_MODE_ABS);
}
return count;
}
--
1.5.5.1
--
From: Arjan van de Ven <arjan@linux.intel.com> Subject: [PATCH] hrtimer: convert sound/ to the new hrtimer apis In order to be able to do range hrtimers we need to use accessor functions to the "expire" member of the hrtimer struct. This patch converts sound/ to these accessors. Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> --- sound/drivers/pcsp/pcsp_lib.c | 5 +++-- 1 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c index e341f3f..1f42e40 100644 --- a/sound/drivers/pcsp/pcsp_lib.c +++ b/sound/drivers/pcsp/pcsp_lib.c @@ -34,7 +34,7 @@ enum hrtimer_restart pcsp_do_timer(struct hrtimer *handle) chip->thalf = 0; if (!atomic_read(&chip->timer_active)) return HRTIMER_NORESTART; - hrtimer_forward(&chip->timer, chip->timer.expires, + hrtimer_forward(&chip->timer, hrtimer_get_expires(&chip->timer), ktime_set(0, chip->ns_rem)); return HRTIMER_RESTART; } @@ -118,7 +118,8 @@ enum hrtimer_restart pcsp_do_timer(struct hrtimer *handle) chip->ns_rem = PCSP_PERIOD_NS(); ns = (chip->thalf ? PCSP_CALC_NS(timer_cnt) : chip->ns_rem); chip->ns_rem -= ns; - hrtimer_forward(&chip->timer, chip->timer.expires, ktime_set(0, ns)); + hrtimer_forward(&chip->timer, hrtimer_get_expires(&chip->timer), + ktime_set(0, ns)); return HRTIMER_RESTART; exit_nr_unlock2: -- 1.5.5.1 --
From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] hrtimer: rename the "expires" struct member to avoid accidental usage
To catch code that still touches the "expires" memory directly, rename it
to have the compiler complain rather than get nasty, hard to explain,
runtime behavior
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
include/linux/hrtimer.h | 20 ++++++++++----------
1 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 9900e99..485a634 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -111,7 +111,7 @@ enum hrtimer_cb_mode {
*/
struct hrtimer {
struct rb_node node;
- ktime_t expires;
+ ktime_t _expires;
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
unsigned long state;
@@ -219,41 +219,41 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
- timer->expires = time;
+ timer->_expires = time;
}
static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
{
- timer->expires.tv64 = tv64;
+ timer->_expires.tv64 = tv64;
}
static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
{
- timer->expires = ktime_add_safe(timer->expires, time);
+ timer->_expires = ktime_add_safe(timer->_expires, time);
}
static inline void hrtimer_add_expires_ns(struct hrtimer *timer, unsigned long ns)
{
- timer->expires = ktime_add_ns(timer->expires, ns);
+ timer->_expires = ktime_add_ns(timer->_expires, ns);
}
static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
{
- return timer->expires;
+ return timer->_expires;
}
static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
{
- return timer->expires.tv64;
+ return timer->_expires.tv64;
}
static inline s64 hrtimer_get_expires_ns(const struct hrtimer ...
hi Arjan,
i've started doing some QA of this series in -tip.
it has a new -git based topic: tip/timers/range-hrtimers.
testing found this build failure:
In file included from include/linux/sched.h:87,
from arch/x86/kernel/asm-offsets_32.c:9,
from arch/x86/kernel/asm-offsets.c:3:
include/linux/hrtimer.h: In function 'hrtimer_start_expires':
include/linux/hrtimer.h:359: error: implicit declaration of function 'hrtimer_get_expires'
include/linux/hrtimer.h:359: error: incompatible type for argument 2 of 'hrtimer_start'
4.69user 2.38system 0:13.19elapsed 53%CPU (0avgtext+0avgdata 0maxresident)k
with the attached config.
Ingo
On Sat, 6 Sep 2008 16:56:10 +0200
ok I fixed this in the master branch of
git://git.kernel.org/pub/scm/linux/kernel/git/arjan/linux-2.6-hrtimer.git
--
thanks, pulled. Next build failure is:
In file included from include/linux/sched.h:87,
from arch/x86/kernel/asm-offsets_32.c:9,
from arch/x86/kernel/asm-offsets.c:3:
include/linux/hrtimer.h: In function 'hrtimer_is_hres_active':
include/linux/hrtimer.h:211: error: 'struct hrtimer_cpu_base' has no member named 'hres_active'
include/linux/hrtimer.h: At top level:
include/linux/hrtimer.h:316: error: redefinition of 'hrtimer_cb_get_time'
include/linux/hrtimer.h:205: error: previous definition of 'hrtimer_cb_get_time' was here
include/linux/hrtimer.h:321: error: redefinition of 'hrtimer_is_hres_active'
include/linux/hrtimer.h:210: error: previous definition of 'hrtimer_is_hres_active' was here
config attached.
i've pushed out the broken tree into tip/tmp.broken.range-hrtimers
Ingo.
Hi Arjan,
sorry for not replying sooner. I had half a patch to create a new "timer layer to rule them all" called ktimers, which took an explicit "slop" value. Slop is the "how long before it's worth waking the machine for this?" value, with friendly SLOP_USECS, SLOP_SECONDS, SLOP_DAYS etc. defines. Implemented in terms of normal and hr timers, which get deprecated over time.
Heuristics work for a while, but IMHO eventually this is going to have to be plumbed through to userspace.
Cheers, Rusty.
--
On Fri, 12 Sep 2008 13:39:49 +1000 yes we need to add the system calls at some point (beyond the prctl); however both Ulrich and Linus indicated that this will be one of those "handful of users" kind of things. IMO it needs to work well enough without having to change the whole application stack (and with my -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Be warned, the name ktimers already has a bad history. Just ask the
Agreed.
tglx
--
