If stop_machine() invoked while one of onlined cpu is locked up
by some reason, stop_machine cannot finish its work because the
locked cpu cannot stop. This means all other healthy cpus
will be blocked infinitely by one dead cpu.This patch allows stop_machine to return -EBUSY with some printk
messages if any of stop_machine's threads cannot start running on
its target cpu.Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
kernel/stop_machine.c | 52 +++++++++++++++++++++++++++++++++++++++++++++---
kernel/sysctl.c | 15 ++++++++++++++
2 files changed, 63 insertions(+), 4 deletions(-)diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5b72c2b..9e36809 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,15 +35,19 @@ struct stop_machine_data {
};/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
+static atomic_t num_threads;
static atomic_t thread_ack;
+static cpumask_t prepared_cpus;
static struct completion finished;
static DEFINE_MUTEX(lock);+static unsigned long limit;
+unsigned long stopmachine_timeout = 5; /* secs, arbitrary */
+
static void set_state(enum stopmachine_state newstate)
{
/* Reset ack counter. */
- atomic_set(&thread_ack, num_threads);
+ atomic_set(&thread_ack, atomic_read(&num_threads));
smp_wmb();
state = newstate;
}
@@ -62,11 +66,14 @@ static void ack_state(void)/* This is the actual thread which stops the CPU. It exits by itself rather
* than waiting for kthread_stop(), because it's easier for hotplug CPU. */
-static int stop_cpu(struct stop_machine_data *smdata)
+static int stop_cpu(void *__smdata)
{
+ struct stop_machine_data *smdata = __smdata;
enum stopmachine_state curstate = STOPMACHINE_NONE;
int uninitialized_var(ret);+ cpu_set(smp_processor_id(), prepared_cpus);
+
/* Simple state machine */
do {
/* Chill out and ensure we re-read stopmachine_state. */
@@...
If stop_machine() invoked while one of onlined cpu is locked up
by some reason, stop_machine cannot finish its work because the
locked cpu cannot stop. This means all other healthy cpus
will be blocked infinitely by one dead cpu.This patch allows stop_machine to return -EBUSY with some printk
messages if any of stop_machine's threads cannot start running on
its target cpu in time. You can enable this timeout via sysctl.v4:
- move extern into linux/stop_machine.h and add include of the
header to kernel/sysctl.c. Now stopmachine_timeout is available
only on smp.v3:
- set stopmachine_timeout default to 0 (= never timeout)v2:
- remove fix for warning since it will be fixed upcoming typesafe
patches
- make stopmachine_timeout from secs to msecs
- allow disabling timeout by setting the stopmachine_timeout to 0Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
include/linux/stop_machine.h | 3 ++
kernel/stop_machine.c | 54 +++++++++++++++++++++++++++++++++++++++--
kernel/sysctl.c | 12 +++++++++
3 files changed, 66 insertions(+), 3 deletions(-)diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 0a7815c..4c934f7 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -13,6 +13,9 @@
/* Deprecated, but useful for transition. */
#define ALL_CPUS CPU_MASK_ALL_PTR+/* for sysctl entry */
+extern unsigned long stopmachine_timeout;
+
/**
* stop_machine_run: freeze the machine on all CPUs and run this function
* @fn: the function to run
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5b72c2b..9059b9e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,15 +35,18 @@ struct stop_machine_data {
};/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
+static atomic_t num_threads;
static atomic_t thread_ack;
+static cpumask_t prepared_cpus;
static str...
Looks good to me.
Ack.Max
--
Thank you for useful feedbacks!
Here is the updated version.
Could you put this on top of your patches, Rusty?Thanks,
H.SetoIf stop_machine() invoked while one of onlined cpu is locked up
by some reason, stop_machine cannot finish its work because the
locked cpu cannot stop. This means all other healthy cpus
will be blocked infinitely by one dead cpu.This patch allows stop_machine to return -EBUSY with some printk
messages if any of stop_machine's threads cannot start running on
its target cpu.v2:
- remove fix for warning since it will be fixed upcoming typesafe
patches
- make stopmachine_timeout from secs to msecs, and set default to
200 msec (since v1's arbitrary 5 sec is too long)
- allow disabling timeout by setting the stopmachine_timeout to 0Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
kernel/stop_machine.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++--
kernel/sysctl.c | 15 +++++++++++++
2 files changed, 66 insertions(+), 3 deletions(-)diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5b72c2b..2968b8a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,15 +35,18 @@ struct stop_machine_data {
};/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
+static atomic_t num_threads;
static atomic_t thread_ack;
+static cpumask_t prepared_cpus;
static struct completion finished;
static DEFINE_MUTEX(lock);+unsigned long stopmachine_timeout = 200; /* msecs, arbitrary */
+
static void set_state(enum stopmachine_state newstate)
{
/* Reset ack counter. */
- atomic_set(&thread_ack, num_threads);
+ atomic_set(&thread_ack, atomic_read(&num_threads));
smp_wmb();
state = newstate;
}
@@ -67,6 +70,8 @@ static int stop_cpu(struct stop_machine_data *smdata)
enum stopmachine_state curstate = STOPMACHINE_NONE;
int uninitialized_var(ret);+ cpu_set(smp_processor_id(), prepared_cpus);
...
No externs in C files. Put it in an appropriate header.
I'll do a proper review soon.
J
--
sysctl.c already has many externs... but I can fix at least
Is it better to postpone v4 until your comment comes?
Thanks,
H.Seto
--
I think the only other real change is to make the value in ms, not seconds,
and allow a 0 value to mean "don't check".Cheers,
Rusty.--
No.
J
--
I'd set the default to zero. I beleive that's what Heiko suggested too.
Max
--
Oh, yes, you are right. I missed to catch the suggestion.
I'll post fixed version soon. Wait a minutes...Thanks,
H.Seto
--
If stop_machine() invoked while one of onlined cpu is locked up
by some reason, stop_machine cannot finish its work because the
locked cpu cannot stop. This means all other healthy cpus
will be blocked infinitely by one dead cpu.This patch allows stop_machine to return -EBUSY with some printk
messages if any of stop_machine's threads cannot start running on
its target cpu in time. You can enable this timeout via sysctl.v3:
- set stopmachine_timeout default to 0 (= never timeout)v2:
- remove fix for warning since it will be fixed upcoming typesafe
patches
- make stopmachine_timeout from secs to msecs
- allow disabling timeout by setting the stopmachine_timeout to 0Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
---
kernel/stop_machine.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++--
kernel/sysctl.c | 15 +++++++++++++
2 files changed, 66 insertions(+), 3 deletions(-)diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5b72c2b..77b7944 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,15 +35,18 @@ struct stop_machine_data {
};/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
+static atomic_t num_threads;
static atomic_t thread_ack;
+static cpumask_t prepared_cpus;
static struct completion finished;
static DEFINE_MUTEX(lock);+unsigned long stopmachine_timeout = 0; /* msecs, 0 = "never timeout" */
+
static void set_state(enum stopmachine_state newstate)
{
/* Reset ack counter. */
- atomic_set(&thread_ack, num_threads);
+ atomic_set(&thread_ack, atomic_read(&num_threads));
smp_wmb();
state = newstate;
}
@@ -67,6 +70,8 @@ static int stop_cpu(struct stop_machine_data *smdata)
enum stopmachine_state curstate = STOPMACHINE_NONE;
int uninitialized_var(ret);+ cpu_set(smp_processor_id(), prepared_cpus);
+
/* Simple state machine */
do {
/* Chill out and ensure we re-read stopmachi...
I really don't like this, it means the system is really screwed up and
--
It can be said that after timeout we just back to previous state, where
machine already limp(=partially screwed up), but have some degree of
performance. We might be able to do some recovery, such as killing
process, restart or reset of subsystem and so on. Even if a CPU get
stuck, it might be possible to continue its service with remaining
CPUs, ex. assume there are 1024 CPUs total.
(I wish if we were able to force-reset such unstable CPU in future...)I agree that there are much amount of situation where this feature is
not acceptable. But there would be others.Thanks,
H.Seto
--
Hmm. I think this could become interesting on virtual machines. The hypervisor
might be to busy to schedule a specific cpu at certain load scenarios. This
would cause a failure even if the cpu is not really locked up. We had similar
problems with the soft lockup daemon on s390.It would be good to not-use wall-clock time, but really used cpu time instead.
Unfortunately I have no idea, if that is possible in a generic way.
Heiko, any ideas?
--
5 seconds is a fairly long time. If all else fails we could have a config
Ah, cpu time comes up again. Perhaps we should actually dig that up again;
Zach and Jeremy CC'd.Thanks,
Rusty.
--
Hm, yeah. But in this case, it's tricky. CPU time is an inherently
per-cpu quantity. If cpu A is waiting for cpu B, and wants to do the
timeout in cpu-seconds, then it has to be in *B*s cpu-seconds (and if A
is waiting on B,C,D,E,F... it needs to measure separate timeouts with
separate timebases for each other CPU). It also means that if B is
unresponsive but also not consuming any time (blocked in IO,
administratively paused, etc), then the timeout will never trigger.So I think monotonic wallclock time actually makes the most sense here.
The other issue is whether cpu_relax() is the right thing to put in the
busywait. We don't hook it in pvops, so it's just an x86 "pause"
instruction, so from the hypervisor's perspective it just looks like a
spinning CPU. We could either hook cpu_relax() into a hypervisor yield,
or come up with a heavier-weight cpu_snooze() (cpu_relax() is often used
in loops which are expected to have a short duration, where doing a
hypercall+yield would be overkill).J
--
Hmm.. probably a stupid question: but what could happen that a real cpu
(not virtual) becomes unresponsive so that it won't schedule a MAX_RT_PRIO-1This is asking for trouble... a config option to disable this would be
nice. But as I don't know which problem this patch originally addressescpu_relax() translates to a hypervisor yield on s390. Probably makes sense
if other architectures would do the same.
--
The original problem (once I heard and easily reproduced) was there was an
another MAX_RT_PRIO-1 task and the task was spinning in itself by a bug.
(Now this would not be a problem since RLIMIT_RTTIME will work for it, but
I cannot deny that there are some situations which cannot set the limit.)However there would be more possible problem in the world, ex. assume that
a routine work with interrupt (and also preemption) disabled have an issue
of scalability so it takes long time on huge machine then stop_machine will
stop whole system such long time. You can assume a driver's bug. Now theI'm not good at VM etc., but I think user doesn't care who holds a cpu,
whether other guest or actual buggy software or space alien or so.
The important thing here is return control to user if timeout.Thanks,
H.Seto
--
Yep. As I described in the prev email in my case it's a legitimate thing. Some
of the CPU cores are running wireless basestation schedulers and the deadlines
are way too tight for them to sleep (it's "cpu as a dedicated engine" kind of
thing, they are properly isolated and stuff).In this case actually RT limit is the first thing that I disable :).
I'd rather have stop_machine fail and tell the user that something is wrong.
In which case they can simply stop the basestation app that is running when
convinient. ie It give control back to the user rather than wedging the box or
killing the app.Max
--
I have a workload where MAX_PRIO RT thread runs and never yeilds. That's what
my cpu isolation patches/tree addresses. Stopmachine is the only (that I know
of) thing that really brakes in that case. btw In case you're wondering yes
we've discussed workqueue threads starvation and stuff in the other threads.
How about this. We'll make this a sysctl, as Rusty already did, and set the
default to 0 which means "never timeout". That way crazy people like me who
care about this scenario can enable this feature.btw Rusty, I just had this "why didn't I think of that" moments. This is
actually another way of handling my workload. I mean it certainly does not fix
the root case of the problems and we still need other things that we talked
about (non-blocking module delete, lock-free module insertion, etc) but at
least in the mean time it avoids wedging the machines for good.
btw I'd like that timeout in milliseconds. I think 5 seconds is way tooooo
long :).Max
--
Indeed, this was my thought too. s390 can initialize it to zero somewhere in
We can make it ms, sure. 200ms should be plenty of time: worst I ever saw was
150ms, and that was some weird Power box doing crazy stuff. I wouldn't be
surprised if you'd never see 1ms on your hardware.The ipi idea would handle your case a little more nicely, too, but that's
probably not going to hit this merge window.Cheers,
Rusty.
--
I disagree that 5 seconds is to long :-). I even think having it default to 0
is the safest option for virtualized environments. What if the host is paging
like hell and the vcpu cannot run due to a missing page? In that case 200ms
can be an incredible short amount of time. If the timeout triggers,
stop_machine_run fails, but everything would work fine - it just takes
longer.--
Which reminds me that I wanted to submit a bunch of kthread and workqueue
related things in this window :).Max
--
Shouldn't we default to zero and let whowever wants something different
configure that via sysctl.conf?
Whatever default value > 0 you pick is likely wrong for whatever specific
usecase.
--
Yes, sounds like that should work for everyone and no additional config
option as well. Sounds good to me.
--
Yes. That's exactly what we're trying to detect. Currently the entire
machine will wedge. With this patch we can often limp along.Hidetoshi's original problem was a client whose machine had one CPU die, then
got wedged as the emergency backup tried to load a module.Along these lines, I found VMWare's relaxed co-scheduling interesting, BTW:
Yes, I think so too. Actually, doing a random yield-to-other-VCPU on
cpu_relax is arguable the right semantic (in Linux it's used for spinning,
almost exclusively to wait for other cpus).Cheers,
Rusty.--
Hello Hidetoshi!
I took your version and modified it slightly. What do you think?
(BTW, the "warning: passing argument 1 of 'kthread_create' from incompatible
pointer type" is caused by a kernel without the typesafe patches: this will
be fixed soon, do I've removed your fix for that).Thanks,
Rusty.===
stopmachine: add stopmachine_timeoutThis patch is based on Hidetoshi Seto's patch to make stop_machine()
more robust in the face of wedged CPUs.This version does not fail unless the stuck CPU is one we wanted to do
something on, and does not fail all future stop_machines.Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
kernel/stop_machine.c | 52 +++++++++++++++++++++++++++++++++++++++++++++---
kernel/sysctl.c | 15 ++++++++++++++
2 files changed, 63 insertions(+), 4 deletions(-)diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,10 +35,14 @@ struct stop_machine_data {
};/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
+static unsigned num_threads;
static atomic_t thread_ack;
+static cpumask_t prepared_cpus;
static struct completion finished;
static DEFINE_MUTEX(lock);
+
+static unsigned long limit;
+unsigned long stopmachine_timeout = 5; /* secs, arbitrary */static void set_state(enum stopmachine_state newstate)
{
@@ -66,6 +70,10 @@ static int stop_cpu(struct stop_machine_
{
enum stopmachine_state curstate = STOPMACHINE_NONE;
int uninitialized_var(ret);
+
+ /* If we've been shoved off the normal CPU, abort. */
+ if (cpu_test_and_set(smp_processor_id(), prepared_cpus))
+ do_exit(0);/* Simple state machine */
do {
@@ -100,6 +108,44 @@ static int chill(void *unused)
return 0;
}+static bool fixup_timeout(struct task_struct **threads, const cpumask_t *cpus)
+{
+ unsigned int i;
+ bool stagger_onwards = true;
+
+ printk(KERN_CRIT "stopmachine: Failed to stop m...
Hi Rusty,
We must protect next stop_machine from threads spawned by previous one.
I like your idea that if we did not want to do something on the stuck CPU
then treat the CPU as stopped. However we need to be careful that theNote that here is a window that a not-prepared frozen cpu can be thawed and
--
What if we use:
if (i != smp_processor_id() && !cpu_test_and_set(i, prepared_cpus)) {instead of cpu_isset? That means that if a CPU restarts during that window,
either the thread will exit (because we set the bit here), or this will
detect it.Hmm, there's still the vague possibility that the thread doesn't schedule
until we start a new stop_machine (and clear prepared_cpus). We could simply
loop in the main thread if any threads are alive, before freeing them (inside
the lock). A counter and notifier is the other way, but it seems like
overkill for a very unlikely event.Thanks for the analysis!
Rusty.
--
Hi Rusty,
After having a relaxing day, once I said:
"I like your idea that if we did not want to do something on the stuck CPU
then treat the CPU as stopped."
but now I noticed that the stuck CPU can harm what we want to do if it is
not real stuck... ex. busy loop in a subsystem, and we want to touch the
core of the subsystem exclusively.
So "force progress" is not safe, on some rare case. I'd like to make this
timeout feature as a safe-net, therefore we should return error withoutI suppose my current implementation, returning control to user immediately,
is better than looping in main thread. In my implementation, num_threads is
initialized to num_online_cpus() by main thread, and decremented 1 by 1
each child thread. If time out happen, main thread will return without
waiting completion but set state STOPMACHINE_EXIT. Then child threads are now
detached from usual procedure, so they exit soon without do any work.At the beginning of new stop_machine, we can check the num_threads to know
whether there are remaining child threads. If there are, something is wrong
since the system cannot run MAX_PRIO RT thread, not binded to typical cpu now.
So we can return error in such case, assuming that the new stop_machine will
fail in same way.Anyway, I also think we can better thing here, but we don't need to do all
at once. Making steps by incremental patches would be nice, I think.Thanks,
H.Seto
--
No. You aim for perfection, but there is no "right" answer other than "don't
get your system into this mess". Whatever we do is going to be an educated
guess. And guessing that there'll be no race is a very good guess indeed.The scenario we are addressing is a stuck CPU and module load. If we fail
stop machine, module load fails.That is why we should continue if we can. It is also why the default timeout
cannot be 0. You can't turn this on once you notice there's a problem: it's
too late.If we don't want to handle this case, let's not apply any patch at all.
Rusty.--
Hi Rusty,
I see. Still we can have an another patch for an another feature.
How about this patch? This will be applied on "add stopmachine_timeout v4".Thanks,
H.Seto===
This patch reflects Rusty's suggestion that if we did not want to do
something on the stuck CPU then treat the CPU as stopped.If you afraid that the stuck CPUs may harm what we want to do, then
turn off stopmachine_force by writing 0 via sysctl.Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
---
include/linux/stop_machine.h | 1 +
kernel/stop_machine.c | 84 +++++++++++++++++++++++++++---------------
kernel/sysctl.c | 8 ++++
3 files changed, 63 insertions(+), 30 deletions(-)diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 4c934f7..2bb339b 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -15,6 +15,7 @@/* for sysctl entry */
extern unsigned long stopmachine_timeout;
+extern unsigned int stopmachine_force;/**
* stop_machine_run: freeze the machine on all CPUs and run this function
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9059b9e..824aafe 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,18 +35,20 @@ struct stop_machine_data {
};/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static atomic_t num_threads;
+static unsigned int num_threads;
+static atomic_t num_tardy;
static atomic_t thread_ack;
static cpumask_t prepared_cpus;
static struct completion finished;
static DEFINE_MUTEX(lock);unsigned long stopmachine_timeout; /* msecs, default is 0 = "never timeout" */
+unsigned int stopmachine_force = 1; /* force progress on timeout, 0:turn off */static void set_state(enum stopmachine_state newstate)
{
/* Reset ack counter. */
- atomic_set(&thread_ack, atomic_read(&num_threads));
+ atomic_set(&thread_ack, num_threads);...
p.s.
Thanks,
H.Seto
--
| Jeremy Allison | Re: [RFC] Heads up on sys_fallocate() |
| Greg KH | [GIT PATCH] driver core patches against 2.6.24 |
| Joerg Roedel | [PATCH 03/34] AMD IOMMU: add defines and structures for ACPI scanning code |
| Eric W. Biederman | [PATCH] powerpc pseries eeh: Convert to kthread API |
| David Miller | [GIT]: Networking |
| Gerrit Renker | [PATCH 27/37] dccp: Integration of dynamic feature activation - part 2 (server side) |
| Natalie Protasevich | [BUG] New Kernel Bugs |
git: | |
