There have been various reports of the ondemand governor causing some serious performance issues, one of the latest ones from Andrew. There are several fundamental issues with ondemand (being worked on), but the report from Andrew can be fixed relatively easily. The fundamental issue is that ondemand will go to a (too) low CPU frequency for workloads that alternatingly disk and CPU bound... -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
From 166b7526ccfea8b44626b6023ff5b0a8eb869bb3 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 18 Apr 2010 10:33:02 -0700
Subject: [PATCH 2/7] sched: introduce a function to update the idle statistics
Currently, two places update the idle statistics (and more to
come later in this series).
This patch creates a helper function for updating these statistics.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
kernel/time/tick-sched.c | 29 +++++++++++++++++++----------
1 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 54dc155..ca2211d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,25 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog();
}
-static void tick_nohz_stop_idle(int cpu, ktime_t now)
+/*
+ * Updates the per cpu time idle statistics counters
+ */
+static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t delta;
- delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_lastupdate = now;
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ if (ts->idle_active) {
+ delta = ktime_sub(now, ts->idle_entrytime);
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ }
+}
+
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ update_ts_time_stats(ts, now);
ts->idle_active = 0;
sched_clock_idle_wakeup_event(0);
@@ -165,14 +176,12 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
{
- ktime_t now, delta;
+ ktime_t now;
now = ktime_get();
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- ts->idle_lastupdate = now;
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, ...Reviewed-by: Rik van Riel <riel@redhat.com> -- All rights reversed --
From e75d6cd203e43ea4c5e9919f19e2882c066491b8 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 18 Apr 2010 10:47:02 -0700
Subject: [PATCH 4/7] sched: fold updating of the last update time into update_ts_time_stats()
This patch folds the updating of the last_update_time into the
update_ts_time_stats() function, and updates the callers.
This allows for further cleanups that are done in the next patch.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
kernel/time/tick-sched.c | 22 +++++++++++-----------
1 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7dbad2f..ac54543 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -153,7 +153,8 @@ static void tick_nohz_update_jiffies(ktime_t now)
/*
* Updates the per cpu time idle statistics counters
*/
-static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
+static void
+update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
ktime_t delta;
@@ -163,13 +164,19 @@ static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_entrytime = now;
}
+
+ if (ts->idle_active && last_update_time)
+ *last_update_time = ktime_to_us(ts->idle_lastupdate);
+ else
+ *last_update_time = ktime_to_us(now);
+
}
static void tick_nohz_stop_idle(int cpu, ktime_t now)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- update_ts_time_stats(ts, now);
+ update_ts_time_stats(ts, now, NULL);
ts->idle_active = 0;
sched_clock_idle_wakeup_event(0);
@@ -181,7 +188,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
now = ktime_get();
- update_ts_time_stats(ts, now);
+ update_ts_time_stats(ts, now, NULL);
ts->idle_entrytime = now;
ts->idle_active = 1;
@@ -206,18 +213,11 @@ static ktime_t tick_nohz_start_idle(struct tick_sched ...Reviewed-by: Rik van Riel <riel@redhat.com> -- All rights reversed --
From 526a9f347d5a953f37b67b4b2afb39d7b4d77a92 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 18 Apr 2010 10:49:30 -0700
Subject: [PATCH 5/7] sched: eliminate the ts->idle_lastupdate field
Now that the only user of ts->idle_lastupdate is update_ts_time_stats(),
the entire field can be eliminated.
In update_ts_time_stats(), idle_lastupdate is first set to "now",
and a few lines later, the only user is an if() statement that
assigns a variable either to "now" or to ts->idle_lastupdate,
which has the value of "now" at that point.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
include/linux/tick.h | 1 -
kernel/time/tick-sched.c | 5 +----
2 files changed, 1 insertions(+), 5 deletions(-)
diff --git a/include/linux/tick.h b/include/linux/tick.h
index d2ae79e..0343eed 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -60,7 +60,6 @@ struct tick_sched {
ktime_t idle_waketime;
ktime_t idle_exittime;
ktime_t idle_sleeptime;
- ktime_t idle_lastupdate;
ktime_t sleep_length;
unsigned long last_jiffies;
unsigned long next_jiffies;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ac54543..326f5f8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -158,16 +158,13 @@ update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
ktime_t delta;
- ts->idle_lastupdate = now;
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_entrytime = now;
}
- if (ts->idle_active && last_update_time)
- *last_update_time = ktime_to_us(ts->idle_lastupdate);
- else
+ if (last_update_time)
*last_update_time = ktime_to_us(now);
}
--
1.6.2.5
--
Reviewed-by: Rik van Riel <riel@redhat.com> -- All rights reversed --
From 27966bedabea83c4f3ae77507eceb746b1f6ebae Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 18 Apr 2010 11:15:56 -0700
Subject: [PATCH 7/7] ondemand: Solve the big performance issue with ondemand during disk IO
The ondemand cpufreq governor uses CPU busy time (e.g. not-idle time) as
a measure for scaling the CPU frequency up or down.
If the CPU is busy, the CPU frequency scales up, if it's idle, the CPU
frequency scales down. Effectively, it uses the CPU busy time as proxy
variable for the more nebulous "how critical is performance right now"
question.
This algorithm falls flat on its face in the light of workloads where
you're alternatingly disk and CPU bound, such as the ever popular
"git grep", but also things like startup of programs and maildir using
email clients... much to the chagarin of Andrew Morton.
This patch changes the ondemand algorithm to count iowait time as busy,
not idle, time. As shown in the breakdown cases above, iowait is performance
critical often, and by counting iowait, the proxy variable becomes a more
accurate representation of the "how critical is performance" question.
The problem and fix are both verified with the "perf timechar" tool.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
drivers/cpufreq/cpufreq_ondemand.c | 30 ++++++++++++++++++++++++++++--
1 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index bd444dc..ed472f8 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -73,6 +73,7 @@ enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};
struct cpu_dbs_info_s {
cputime64_t prev_cpu_idle;
+ cputime64_t prev_cpu_iowait;
cputime64_t prev_cpu_wall;
cputime64_t prev_cpu_nice;
struct cpufreq_policy *cur_policy;
@@ -148,6 +149,16 @@ static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
return idle_time;
}
+static inline ...Hi, I don't doubt that keeping the cpu full frequency during IO can improve some specific workloads, however in your log message you don't explain how much we are loosing. How much more energy is consumed when doing a "updatedb" or "find /" ? I'm not saying your patch is wrong, but I'm saying that if you tweak the ondemand governor just by looking at performance and not at energy consumption, as it seems in your log message, it will soon look very similar to the performance governor! So it would be good to check the whole spectrum of what needs to be optimised :-) Cheers, Eric --
On Mon, 19 Apr 2010 10:29:47 +0200 first of all, it's so bad that people will just turn the whole power management off... at which point fixing the really bad bug is actually on the machines I used this on (Core i7's) it's actually hardly measurable. All CPUs I have access to turn the voltage entirely off during idle, so while the frequency is higher, if you're mostly IO bound, it's only for very short durations... while still being mostly "off". -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Fair enough, and I have to fully agree, it's better to have a power management which consumes a bit more than the unreachable optimal, than Yes, but keep in mind the Linux ondemand governor should not be tweaked only for the latest Intel CPUs. I've just done a very little and _extremely rough_ measurement on my laptop with a Core Duo 2, and while it seems that, indeed, at idle the frequency didn't matter for the consumption (about 12.5W with any governor), when running updatedb (so IO bound), the performance governor seems to consume more than ondemand and powersave (14.8-15.8W instead of 14.0-14.6W). I'm very careful with the results of this "experiment" because it's only using the ACPI report for the power usage and done with many other programs in the background. Nevertheless, it manages to convince me that this change is not going to be as harmless for the power consumption as you suggest. Don't take me wrong. Here, I'm not saying that this patch is bad per se. In particular, I do understand the specific workloads it tries to handle, and I don't have a better solution for it in mind. However, this is quite a change in the ondemand governor logic, and the log message do not mention anything about energy consumption. This triggers a big warning "only half of the problem was looked at to get a solution"! So if this patch ever goes in, it should at least have a better changelog which describes what are the potential consequences on the power consumption. Cheers, Eric --
be careful; you're measuring power not energy. You also need to take into account that things are now done quicker, so that you can then be idle longer later! So yes instantaneous you're using a bit more power (since you're getting much more performance), but you're done much quicker as well.... Once you do this the equation changes, and it's more or less a wash. As for your general "ondemand is for everyone" concern; there are many things wrong with ondemand, and I'm writing a new governor to fix the more fundamental issues with it (and also, frankly, so that I won't break existing users and hardware I don't have access to). This is basically a backport of a specific feature of my new governor to ondemand because Andrew keeps hitting the really bad case and basically ended up turning power management off. -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Not sure you fix a bug, I expect this was done on purpose.
The ondemand governor disadvantages processes with alternating short CPU
load peaks and idle sequences.
IO bound processes typically show up with such a behavior.
But I follow Eric and agree that if it costs that much, changing
above sounds sane.
Still, I could imagine some people might want to not raise freq on IO bound
process activity, therefore this should get another ondemand param, similar
to ignore_nice_load.
Thomas
--
I agree with Thomas here. Some of these assumptions on IO / FSB performance with cpu speed do not hold true on various ARM platforms. Perhaps we could have a min_io_freq value? Which is the min speed for the cpu to run at for IO bound activity. In the original patch, min_io_freq = scaling_max_freq. For various arm devices I can happily set min_io_freq to the lowest cpu speed that satisfies bus speeds. --
'satisfies bus speeds' == minimum cpu frequency where i/o works at all or == minimum cpu frequency where i/o works at full speed ? Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
Sorry for the confusion, I mean the later, cpu frequency where i/o works at full speed. However this value can be set via sysfs, it could be both. Really its up to the user, if they want max io performance, minimum io performance or a happy medium. --
Is the improvement really because IO benefited from CPU being held at a higher frequency, or perhaps because it is now not scaled down during IO, so when CPU intensive part of git grep comes along it is already "revved up"? Or in other words, does a pure IO workload benefit from now higher selected frequency? One idea I had but a) never had time to implement it and b) was not sure it would be accepted anyway, was to modify ondemand governor to ramp up instantly, but slow down slowly (in a configurable way). Or to make it a hybrid of conservative and ondemand in a way. (This was all long time ago so perhaps today what I wanted to do then is already achievable with some knobs.) It was not an idea out of thin air, but based on unpleasant desktop latencies with ondemand, while performance governor was of course perfect. Tvrtko Sophos Plc, The Pentagon, Abingdon Science Park, Abingdon, OX14 3YP, United Kingdom. Company Reg No 2096520. VAT Reg No GB 348 3873 20. --
On Mon, 19 Apr 2010 10:09:55 +0100 the IO part does not get much faster (some systems lower the FSB speed at low frequencies, but disks are still much slower than the FSB anyway), but the CPU part, which is performance critical for, say, no. Mixed workloads do. but pure IO workloads also don't suffer since while idle, the voltage that's what ondemand does already. -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
You mean that higher frequency does not have effect on power use if CPU is How and where in the code and how to enable that behaviour? From my experiments frequency goes down to minimum as soon as load goes away. What I was talking about is gradual lowering over a configurable period. It is not power efficient, but it could be good for latency in some workloads. Tvrtko Sophos Plc, The Pentagon, Abingdon Science Park, Abingdon, OX14 3YP, United Kingdom. Company Reg No 2096520. VAT Reg No GB 348 3873 20. --
On Mon, 19 Apr 2010 16:29:39 +0100 this is true for most processors that I'm aware of. there's exceptions for things like where the idle time is really short, where going up and down in voltage will take more energy than it'll it's not even good for that ;-( it's better then to stay high longer... at least on modern machines the inbetween states are pretty much either useless or actually energy -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Why do you think it is not good for latency to stay at higher frequency for longer? I had a machine until recently with a relatively slow Turion64 which had 800Mhz minimum and 1.8Ghz maximum frequency states. With ondemand governor when web browsing for example, frequency would drop to 800Mhz as soon as scrolling or such stopped, and then was pushed back up to max on user interaction. Overall experience was sluggish, but when switching to performance governor it was much much better. That is why I proposed to have a gradual lowering as an option in on-demand. You said it already does that - I ask are you sure? And also now you are saying it would not be good for latency - above is an example when it clearly does help (a lot). Would this idea of gradual lowering help for the "git grep" use case as well? To me it sounds better than taking IO wait into account. What happens with your scheme with a pure IO load? Ok, you say power use is not determined by frequency, fine, but it sounds slightly wrong to max the frequency on pure IO wait. If frequency per se really does not waste power then gradual lowering could both "git grep" and my web browsing latency use case, what do you think? Tvrtko Sophos Plc, The Pentagon, Abingdon Science Park, Abingdon, OX14 3YP, United Kingdom. Company Reg No 2096520. VAT Reg No GB 348 3873 20. --
On Tue, 20 Apr 2010 10:10:49 +0100 but the *gradual* lowering just does not make sense. might as well just stay at the highest frequency for the duration for which you do the gradual lowering, it's more efficient for power on most modern machines. The problem is if you do this gradual thing for long enough to help you on desktop loads, or the stay-high thing, on server workloads the power efficiency will completely and utterly tank. Been there tried that ;-) frankly, you're starting to touch at the more fundamental issues with ondemand. I'm not trying to solve that in this small patch, they really can only be solved in a complete redesign. While I'm doing such a redesign, I was asked to at least fix this particular gap urgently. Solving all of ondemand's issues needs a complete rethink of the core assumptions it makes (which are not correct anymore nowadays). -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Well you said that "it's not even good for that ;-(" when I said that it may Maybe it does not make sense for energy efficiency but it does for latency, I think we have now agreed on that so it would be a nice knob to have. If I had some time I would implement it if for nothing then to spice up this thread. :) I also like Mike Chan's idea of minimum_full_io_speed_frequency parameter, in I understand that, although you gave no detail on how you designed this new ondemand. Problem I have is that to me this workaround is quite ugly. If you have to do it please add minimum_full_io_speed_frequency knob and also make it default to current behaviour (no increase on iowait). Tvrtko Sophos Plc, The Pentagon, Abingdon Science Park, Abingdon, OX14 3YP, United Kingdom. Company Reg No 2096520. VAT Reg No GB 348 3873 20. --
Is not that exactly what will happen for 'cat /dev/<usb1>' case? Plus I suspect that older cpus are slower at changing voltages, and So what about hiding those from ondemand on modern hw? Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
Have you looked at:
/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_transition_latency
and
/sys/devices/system/cpu/cpu0/cpufreq/ondemand/sampling_rate
Transition latency comes from ACPI tables and sampling rate depends
on it.
Reducing the sampling rate, significantly reduces performance loss.
There were bugs in this area, on latest kernels with latest (MSR switching)
HW you should see:
cat /sys/devices/system/cpu/cpu0/cpufreq/ondemand/sampling_rate
10000
Thomas
--
On Tue, 20 Apr 2010 11:29:02 +0200 of course I have; in practice on systems i use it is always a 10 milliseconds interval, which is the minimum it gets. while it does, it is not nearly sufficient for the alternating IO/CPU cases I've looked at. I've looked at many timecharts for various IO/CPU workloads, including my normal own use as well as Andrews and the CPU busy periods are in the 1 - 20 msec range between IOs most of the time, for which a 10 msec sampling is obviously problematic. -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Well, and now, if you do something like cat /dev/<your usb1.1 hdd> > /dev/null, you'll keep cpu on max frequency. Not a problem for new core i7, but probably big deal for athlon 64. Maybe modern cpus can and should simply react faster? -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
So that also means that my notebook's CPU fan will spin like mad as soon as I access a USB key ? It will not help the key go faster... Probably that iowait should only change the time required to go back to low frequency. That way, if there is no CPU nor I/O activity, we can switch back to a low frequency quickly, but if we see I/O activity, we could decide to wait for 1 second (for instance) for CPU idle before It's above all that we should not try to switch too fast if we know the CPU can't follow. Willy --
I'd just make this decission (consider IO time yes/no) configurable.
A userspace daemon/app could then e.g. switch when on battery, AC, if
it's a laptop in general or if user switched to a silent/powersave mode
when being in a library.
It may be more efficient to ramp the freq up during the whole IO process and
this is crucial in the server area, but most laptop/desktop users are happy
as long as the data for their video/audio stream is fetched from the device
quickly enough for displaying.
Especially on battery, users will appreciate some minutes of more battery
lifetime and do not care about some ms of IO latencies.
You would cut of this elementary cpufreq feature for the ondemand governor
with these patches.
Thomas
--
On Fri, 23 Apr 2010 10:50:10 +0200 Thomas Renninger <trenn@suse.de> wrote: the assumption that power doesn't matter on AC is a huge fiction that any data center operator would love to get out of everyones head as quickly as possible. -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Have I said power doesn't matter on AC? Do you agree that a datacenter has different performance vs power tradeoff demands as a battery driven mobile device? Back to the topic: As you did not answer on my (several) sysfs knob request(s), I expect you agree with it and will add one. It's not only for real usage, but to give people a chance to easily test and get feedback on different HW. Thanks, Thomas --
On Tue, 27 Apr 2010 13:39:34 +0200
yup it makes sense to have a sysfs knob with a sane default value
From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] cpufreq: make the iowait-is-busy-time a sysfs tunable
Pavel Machek pointed out that not all CPUs have an efficient idle
at high frequency. Specifically, older Intel and various AMD cpus
would get a higher power usage when copying files from USB.
Mike Chan pointed out that the same is true for various ARM chips
as well.
Thomas Renninger suggested to make this a sysfs tunable with a
reasonable default.
This patch adds a sysfs tunable for the new behavior, and uses
a very simple function to determine a reasonable default, depending
on the CPU vendor/type.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
drivers/cpufreq/cpufreq_ondemand.c | 46 +++++++++++++++++++++++++++++++++++-
1 files changed, 45 insertions(+), 1 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index ed472f8..4877e8f 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -109,6 +109,7 @@ static struct dbs_tuners {
unsigned int down_differential;
unsigned int ignore_nice;
unsigned int powersave_bias;
+ unsigned int io_is_busy;
} dbs_tuners_ins = {
.up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
.down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
@@ -260,6 +261,7 @@ static ssize_t show_##file_name \
return sprintf(buf, "%u\n", dbs_tuners_ins.object); \
}
show_one(sampling_rate, sampling_rate);
+show_one(io_is_busy, io_is_busy);
show_one(up_threshold, up_threshold);
show_one(ignore_nice_load, ignore_nice);
show_one(powersave_bias, powersave_bias);
@@ -310,6 +312,22 @@ static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
return count;
}
+static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
+ const char *buf, size_t count)
+{
+ unsigned int input;
+ int ...That's very nice Arjan. That way we'll be able to select between performance and power savings, which is what most laptop users want after all ! Thanks, Willy --
-- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
Acked-by: Rik van Riel <riel@redhat.com> -- All rights reversed --
On Fri, 23 Apr 2010 07:38:58 +0200 unlikely. your notebook CPU will stop its clocks, if not drop its voltage, during idle. So during that time the frequency is 0; the only difference is in how much leakage you get from a higher voltage for those CPUs that do not powergate or drop their cpu in idle. Most CPUs that I know of do either or both of that. -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
OK, I will have to give that a run on my Atom and Pentium-M, both of which provide me with a very long run on battery precisely because they almost always run at the lowest possible frequency and voltage. On the pentium-M, I already know there is a difference between full voltage and low voltage even when idle. Its lowest frequency is 600 MHz, and my kernel still has the old phc patch to lower clock and voltage. When idle on this kernel, the fan is completely stopped and never wakes up. When idle on a 2.6.32 (which does not have the patch), its voltage doesn't drop though the frequency does, and the fan runs at 1/4. It also lasts shorter (about 30 min less). So that means that even when idle there's a noticeable difference. As was suggested, if the difference appears to be significant, maybe a /sys tunable would allow all users to select their preferred mode. Alternatively, we could have two distinct ondemand settings, one for pure CPU and another one for CPU+I/O. Regards, Willy --
On Fri, 23 Apr 2010 07:24:39 +0200 do you have facts not speculation for this? Does the athlon 64 really keep its voltage high during idle? That would surprise me greatly... (and if it does, does it matter? the clock is stopped anyway there) the only place where my patch makes a real difference is when the cpu is idle due to blocking IO! So do you have data that the athlon 64 gets too hot if you select a high frequency during an idle period, where the clock is already stopped? -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Yes, I believe it keeps voltage up, and as a leakage is big part of power consumption there, stopped clocks will not help much. I believe you are developing on wrong machine. Seems like core i7 just wants max frequency, all the time. Older designs were not like that. iirc even idle power consumption was much higher when on max voltage... I'll get some numbers from my old notes; I no longer have the hw. -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html --
On Fri, 23 Apr 2010 10:38:28 +0200 You make a claim... all I am asking if you are doing just random guess or basing this on facts. The machines I have access to don't show any impact during actual idle, because they stop clocks and generally even lower the voltage. You make a claim that a certain machine does not do either... all I'm oh I use many different machines. the intel machines at least stop the clocks, and for a really long time also lower the frequency in idle. make sure it's data based on tickless... without tickless we were never really idle ;-( -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
So... some old data. It is not exactly athlon 64 -- I don't have that
particular number for it -- but: (from my old notes):
evo n620c [63.3 Wh]
~~~~~~~~~~~~~~~~~
idle machine at 1.7GHz, min bl: 13.0 W
idle machine at 600MHz, min bl: 12.0 W
thinkpad x32 [52Wh]
~~~~~~~~~~~~
idle machine at 600MHz, min bl: 10 W
at 1.8GHz: +6.6 W
hp nx5k [56.8Wh]
~~~~~~~
idle, min bl: 19 W
min bl, 1.4GHz: 22 W
...so yes, I kind of see a pattern there. And you should be able to
easily see the difference, too, if you took something from that era...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
--
On Sat, 24 Apr 2010 06:56:26 +0200 so I finally found a machine based on a Pentium M (similar to this Thinkpad, at least according to google) and spend half a day getting its Fedora 8 installation to compile a modern kernel (these guys don't compile very fast....). Turns out that things are not as simple as your data suggest. The complexity is in USB autosuspend. Without USB devices, I see: "powersave" 9.7 / 9.8 Watts (alternating between these readings) "ondemand" 9.7 / 9.8 Watts "performance" 9.8 Watts (solid on 9.8, not alternating) if all USB devices are in autosuspend, it's pretty much the same picture. But if there is an active USB device, the CPU will no longer go to C4, but will be stuck in C2 (this is the last generation of Intel chipsets that suffer from this behavior; "c2 popup" is what the ich4m is missing). And the C2 power behavior does show sensitivity to the CPU frequency... roughly 3 1/2 Watts for my machine. Doing the 'cat' from a usb mount as you suggested, which implies active USB DMA, shows this 3 1/2 Watts increase between the old and new ondemand. At the same time, the performance went up a little as well (a 3.7Gb took 3m32.5s before, and now takes 3m20.5s, so that saves a little of the energy back). -- Arjan van de Ven Intel Open Source Technology Centre For development, discussion and tips for power savings, visit http://www.lesswatts.org --
Reviewed-by: Rik van Riel <riel@redhat.com> -- All rights reversed --
From c4dd11703034f2ecbc3180603663fab14c292d7c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 18 Apr 2010 10:57:43 -0700
Subject: [PATCH 6/7] sched: introduce get_cpu_iowait_time_us()
For the ondemand cpufreq governor, it is desired that the iowait
time is microaccounted in a similar way as idle time is.
This patch introduces the infrastructure to account and expose
this information via the get_cpu_iowait_time_us() function.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
include/linux/tick.h | 4 ++++
kernel/time/tick-sched.c | 28 ++++++++++++++++++++++++++++
kernel/time/timer_list.c | 1 +
3 files changed, 33 insertions(+), 0 deletions(-)
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0343eed..4aa3703 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -42,6 +42,7 @@ enum tick_nohz_mode {
* @idle_waketime: Time when the idle was interrupted
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
* @sleep_length: Duration of the current idle sleep
* @do_timer_lst: CPU was the last one doing do_timer before going idle
*/
@@ -60,6 +61,7 @@ struct tick_sched {
ktime_t idle_waketime;
ktime_t idle_exittime;
ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
ktime_t sleep_length;
unsigned long last_jiffies;
unsigned long next_jiffies;
@@ -123,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle);
extern void tick_nohz_restart_sched_tick(void);
extern ktime_t tick_nohz_get_sleep_length(void);
extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
# else
static inline void tick_nohz_stop_sched_tick(int inidle) { }
static inline void tick_nohz_restart_sched_tick(void) { }
@@ -133,6 ...Reviewed-by: Rik van Riel <riel@redhat.com> -- All rights reversed --
From 6f9df3bc6571d6545c552151f408d69265e15f92 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 18 Apr 2010 10:25:19 -0700
Subject: [PATCH 1/7] sched: add a comment to get_cpu_idle_time_us()
The exported function get_cpu_idle_time_us() has no comment
describing it; add a kerneldoc comment
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
kernel/time/tick-sched.c | 14 ++++++++++++++
1 files changed, 14 insertions(+), 0 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762..54dc155 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -179,6 +179,20 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
return now;
}
+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cummulative idle time (since boot) for a given
+ * CPU, in microseconds. The idle time returned includes
+ * the iowait time (unlike what "top" and co report).
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
--
1.6.2.5
--
Reviewed-by: Rik van Riel <riel@redhat.com> -- All rights reversed --
From 60851b131900af03bf013afef69f3bcdbb04f1d6 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 18 Apr 2010 10:41:30 -0700
Subject: [PATCH 3/7] sched: update the idle statistics in get_cpu_idle_time_us()
Right now, get_cpu_idle_time_us() only reports the idle statistics
upto the point the CPU entered last idle; not what is valid right now.
This patch adds an update of the idle statistics to get_cpu_idle_time_us(),
so that calling this function always returns statistics that are accurate
at the point of the call.
This includes resetting the start of the idle time for accounting purposes
to avoid double accounting.
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
kernel/time/tick-sched.c | 7 ++++++-
1 files changed, 6 insertions(+), 1 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ca2211d..7dbad2f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -161,6 +161,7 @@ static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ ts->idle_entrytime = now;
}
}
@@ -205,14 +206,18 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t now;
if (!tick_nohz_enabled)
return -1;
+ now = ktime_get();
+ update_ts_time_stats(ts, now);
+
if (ts->idle_active)
*last_update_time = ktime_to_us(ts->idle_lastupdate);
else
- *last_update_time = ktime_to_us(ktime_get());
+ *last_update_time = ktime_to_us(now);
return ktime_to_us(ts->idle_sleeptime);
}
--
1.6.2.5
--
Reviewed-by: Rik van Riel <riel@redhat.com> -- All rights reversed --
On Sun, Apr 18, 2010 at 11:59:49AM -0700, Arjan van de Ven wrote: > There have been various reports of the ondemand governor causing some > serious performance issues, one of the latest ones from Andrew. > There are several fundamental issues with ondemand (being worked on), > but the report from Andrew can be fixed relatively easily. > > The fundamental issue is that ondemand will go to a (too) low CPU > frequency for workloads that alternatingly disk and CPU bound... given the scheduler changes, these probably make more sense to go via one of Ingo's trees ? If everyone agrees, feel free to add my Signed-off-by: Dave --
What about the issue Pavel reported and Willy confirmed (regression on older hardware)? Best, Dominik --
Having a /sys tunable makes sense, especially when running on battery power. I guess that can be introduced as a patch 8/7 :) -- All rights reversed --
... only if combined with sane default values. Best, Dominik --
