[PATCH] stopmachine: allow force progress on timeout

!MAILaRCHIVE_VOTE_RePLACE
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
To: Rusty Russell <rusty@...>
Cc: <linux-kernel@...>
Date: Monday, July 21, 2008 - 11:28 pm

Hi Rusty,

Rusty Russell wrote:

I see.  Still we can have an another patch for an another feature.
How about this patch?  This will be applied on "add stopmachine_timeout v4".

Thanks,
H.Seto

===

This patch reflects Rusty's suggestion that if we did not want to do
something on the stuck CPU then treat the CPU as stopped.

If you afraid that the stuck CPUs may harm what we want to do, then
turn off stopmachine_force by writing 0 via sysctl.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/stop_machine.h |    1 +
 kernel/stop_machine.c        |   84 +++++++++++++++++++++++++++---------------
 kernel/sysctl.c              |    8 ++++
 3 files changed, 63 insertions(+), 30 deletions(-)

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 4c934f7..2bb339b 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -15,6 +15,7 @@
 
 /* for sysctl entry */
 extern unsigned long stopmachine_timeout;
+extern unsigned int stopmachine_force;
 
 /**
  * stop_machine_run: freeze the machine on all CPUs and run this function
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9059b9e..824aafe 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,18 +35,20 @@ struct stop_machine_data {
 };
 
 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static atomic_t num_threads;
+static unsigned int num_threads;
+static atomic_t num_tardy;
 static atomic_t thread_ack;
 static cpumask_t prepared_cpus;
 static struct completion finished;
 static DEFINE_MUTEX(lock);
 
 unsigned long stopmachine_timeout; /* msecs, default is 0 = "never timeout" */
+unsigned int stopmachine_force = 1; /* force progress on timeout, 0:turn off */
 
 static void set_state(enum stopmachine_state newstate)
 {
 	/* Reset ack counter. */
-	atomic_set(&thread_ack, atomic_read(&num_threads));
+	atomic_set(&thread_ack, num_threads);
 	smp_wmb();
 	state = newstate;
 }
@@ -70,7 +72,11 @@ static int stop_cpu(struct stop_machine_data *smdata)
 	enum stopmachine_state curstate = STOPMACHINE_NONE;
 	int uninitialized_var(ret);
 
-	cpu_set(smp_processor_id(), prepared_cpus);
+	/* If we've been shoved off the normal CPU, abort. */
+	if (cpu_test_and_set(smp_processor_id(), prepared_cpus)) {
+		atomic_dec(&num_tardy);
+		do_exit(0);
+	}
 
 	/* Simple state machine */
 	do {
@@ -95,7 +101,6 @@ static int stop_cpu(struct stop_machine_data *smdata)
 		}
 	} while (curstate != STOPMACHINE_EXIT);
 
-	atomic_dec(&num_threads);
 	local_irq_enable();
 	do_exit(0);
 }
@@ -106,6 +111,41 @@ static int chill(void *unused)
 	return 0;
 }
 
+static bool fixup_timeout(struct task_struct **threads, const cpumask_t *cpus)
+{
+	int i;
+	bool ret = !!stopmachine_force ? true : false;
+
+	printk(KERN_CRIT "stopmachine: Failed to stop machine in "
+			"time(%ldms).\n", stopmachine_timeout);
+
+	for_each_online_cpu(i) {
+		if (i != smp_processor_id()
+		    && !cpu_test_and_set(i, prepared_cpus)) {
+
+			printk(KERN_CRIT "stopmachine: cpu#%d seems to be "
+					"stuck.\n", i);
+
+			/* Unbind thread */
+			set_cpus_allowed(threads[i], cpu_online_map);
+			atomic_inc(&num_tardy);
+			num_threads--;
+
+			/*
+			 * If we wanted to run on a particular CPU, and that's
+			 * the one which is stuck, we cannot continue.
+			 */
+			if (stopmachine_force)
+				ret &= (!cpus || !cpu_isset(i, *cpus));
+		}
+	}
+
+	if (ret)
+		printk(KERN_CRIT "stopmachine: Force progress ignoring stuck "
+			"CPUs.\n");
+	return ret;
+}
+
 int __stop_machine_run(int (*fn)(void *), void *data, const cpumask_t *cpus)
 {
 	int i, err;
@@ -113,7 +153,7 @@ int __stop_machine_run(int (*fn)(void *), void *data, const cpumask_t *cpus)
 	struct task_struct **threads;
 	unsigned long limit;
 
-	if (atomic_read(&num_threads)) {
+	if (atomic_read(&num_tardy)) {
 		/*
 		 * previous stop_machine was timeout, and still there are some
 		 * unfinished thread (dangling stucked CPU?).
@@ -135,7 +175,7 @@ int __stop_machine_run(int (*fn)(void *), void *data, const cpumask_t *cpus)
 	/* Set up initial state. */
 	mutex_lock(&lock);
 	init_completion(&finished);
-	atomic_set(&num_threads, num_online_cpus());
+	num_threads = num_online_cpus();
 	set_state(STOPMACHINE_PREPARE);
 
 	for_each_online_cpu(i) {
@@ -176,8 +216,14 @@ int __stop_machine_run(int (*fn)(void *), void *data, const cpumask_t *cpus)
 	if (stopmachine_timeout) {
 		limit = jiffies + msecs_to_jiffies(stopmachine_timeout);
 		while (cpus_weight(prepared_cpus) != num_online_cpus() - 1) {
-			if (time_is_before_jiffies(limit))
-				goto timeout;
+			if (time_is_before_jiffies(limit)) {
+				if (!fixup_timeout(threads, cpus)) {
+					/* Let threads go exit */
+					set_state(STOPMACHINE_EXIT);
+					active.fnret = -EBUSY;
+				}
+				break;
+			}
 			cpu_relax();
 		}
 	}
@@ -195,32 +241,10 @@ kill_threads:
 	for_each_online_cpu(i)
 		if (threads[i])
 			kthread_stop(threads[i]);
-	atomic_set(&num_threads, 0);
 	mutex_unlock(&lock);
 
 	kfree(threads);
 	return err;
-
-timeout:
-	printk(KERN_CRIT "stopmachine: Failed to stop machine in time(%lds).\n",
-			stopmachine_timeout);
-	for_each_online_cpu(i) {
-		if (!cpu_isset(i, prepared_cpus) && i != smp_processor_id())
-			printk(KERN_CRIT "stopmachine: cpu#%d seems to be "
-					"stuck.\n", i);
-		/* Unbind threads */
-		set_cpus_allowed(threads[i], cpu_online_map);
-	}
-
-	/* Let threads go exit */
-	set_state(STOPMACHINE_EXIT);
-
-	put_cpu();
-	/* no wait for completion */
-	mutex_unlock(&lock);
-	kfree(threads);
-
-	return -EBUSY;	/* canceled */
 }
 
 int stop_machine_run(int (*fn)(void *), void *data, const cpumask_t *cpus)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d9e9900..5cd72b8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -824,6 +824,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler   = &proc_doulongvec_minmax,
 		.strategy       = &sysctl_intvec,
 	},
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "stopmachine_force",
+		.data           = &stopmachine_force,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
 #endif
 /*
  * NOTE: do not add new entries to this table unless you have read
-- 

--
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
[PATCH] stopmachine: add stopmachine_timeout, Hidetoshi Seto, (Mon Jul 14, 3:52 am)
[PATCH] stopmachine: add stopmachine_timeout v4, Hidetoshi Seto, (Thu Jul 17, 2:12 am)
Re: [PATCH] stopmachine: add stopmachine_timeout v4, Max Krasnyansky, (Thu Jul 17, 3:09 am)
[PATCH] stopmachine: add stopmachine_timeout v2, Hidetoshi Seto, (Wed Jul 16, 12:27 am)
Re: [PATCH] stopmachine: add stopmachine_timeout v2, Jeremy Fitzhardinge, (Wed Jul 16, 6:11 am)
Re: [PATCH] stopmachine: add stopmachine_timeout v2, Hidetoshi Seto, (Wed Jul 16, 11:40 pm)
Re: [PATCH] stopmachine: add stopmachine_timeout v2, Rusty Russell, (Fri Jul 18, 12:18 am)
Re: [PATCH] stopmachine: add stopmachine_timeout v2, Jeremy Fitzhardinge, (Thu Jul 17, 1:37 am)
Re: [PATCH] stopmachine: add stopmachine_timeout v2, Max Krasnyansky, (Wed Jul 16, 2:23 am)
Re: [PATCH] stopmachine: add stopmachine_timeout v2, Hidetoshi Seto, (Wed Jul 16, 2:35 am)
[PATCH] stopmachine: add stopmachine_timeout v3, Hidetoshi Seto, (Wed Jul 16, 2:51 am)
Re: [PATCH] stopmachine: add stopmachine_timeout v3, Peter Zijlstra, (Wed Jul 16, 3:33 am)
Re: [PATCH] stopmachine: add stopmachine_timeout v3, Hidetoshi Seto, (Wed Jul 16, 4:12 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Christian Borntraeger, (Mon Jul 14, 7:51 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Rusty Russell, (Mon Jul 14, 8:34 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Jeremy Fitzhardinge, (Mon Jul 14, 2:56 pm)
Re: [PATCH] stopmachine: add stopmachine_timeout, Heiko Carstens, (Mon Jul 14, 5:20 pm)
Re: [PATCH] stopmachine: add stopmachine_timeout, Hidetoshi Seto, (Mon Jul 14, 10:24 pm)
Re: [PATCH] stopmachine: add stopmachine_timeout, Max Krasnyansky, (Mon Jul 14, 10:37 pm)
Re: [PATCH] stopmachine: add stopmachine_timeout, Max Krasnyansky, (Mon Jul 14, 10:24 pm)
Re: [PATCH] stopmachine: add stopmachine_timeout, Rusty Russell, (Tue Jul 15, 4:09 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Christian Borntraeger, (Wed Jul 16, 5:15 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Max Krasnyansky, (Tue Jul 15, 4:51 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Heiko Carstens, (Tue Jul 15, 4:39 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Heiko Carstens, (Tue Jul 15, 2:09 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Rusty Russell, (Mon Jul 14, 9:14 pm)
Re: [PATCH] stopmachine: add stopmachine_timeout, Rusty Russell, (Mon Jul 14, 6:43 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Hidetoshi Seto, (Mon Jul 14, 9:11 pm)
Re: [PATCH] stopmachine: add stopmachine_timeout, Rusty Russell, (Tue Jul 15, 3:50 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Hidetoshi Seto, (Wed Jul 16, 12:05 am)
Re: [PATCH] stopmachine: add stopmachine_timeout, Rusty Russell, (Sun Jul 20, 5:45 am)
[PATCH] stopmachine: allow force progress on timeout, Hidetoshi Seto, (Mon Jul 21, 11:28 pm)
Re: [PATCH] stopmachine: add stopmachine_timeout, Hidetoshi Seto, (Mon Jul 14, 4:19 am)