Re: RCU hang on cpu re-hotplug with 2.6.27rc8

Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
From: Paul E. McKenney
Date: Wednesday, October 8, 2008 - 6:33 pm

On Tue, Oct 07, 2008 at 02:22:15PM -0700, Paul E. McKenney wrote:

The attached patch (similar to one in -tip, but set up for mainline and
tweaked to make stall-checking on by default) should get you a stack
trace of any CPUs holding up RCU grace periods for more than about
three seconds.

On the off-chance that this helps.

							Thanx, Paul

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 4ab8436..cab055b 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -40,6 +40,10 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK	(3 * HZ)  /* for rcp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK	(30 * HZ) /* for rcp->jiffies_stall */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
@@ -52,6 +56,11 @@ struct rcu_ctrlblk {
 	spinlock_t	lock	____cacheline_internodealigned_in_smp;
 	cpumask_t	cpumask; /* CPUs that need to switch in order    */
 				 /* for current batch to proceed.        */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	unsigned long gp_start;	 /* Time at which GP started in jiffies. */
+	unsigned long jiffies_stall;
+				 /* Time at which to check for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 } ____cacheline_internodealigned_in_smp;
 
 /* Is batch a before batch b ? */
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index aad93cd..a299876 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -118,6 +118,87 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
 }
 #endif
 
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+	rcp->gp_start = jiffies; /* stamp GP start, then arm stall check */
+	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+
+	/* Only let one CPU complain about others per time interval. */
+	/* Bail unless the deadline passed and a GP is still in progress. */
+	spin_lock_irqsave(&rcp->lock, flags);
+	delta = jiffies - rcp->jiffies_stall;
+	if (delta < 2 || rcp->cur == rcp->completed) {
+		spin_unlock_irqrestore(&rcp->lock, flags);
+		return;
+	}
+	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rcp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for_each_possible_cpu(cpu) {
+		if (cpu_isset(cpu, rcp->cpumask))
+			printk(" %d", cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rcp->gp_start));
+}
+
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+
+	/* Report this CPU, the current jiffies, and jiffies since GP start. */
+	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+	       smp_processor_id(), jiffies, jiffies - rcp->gp_start);
+	dump_stack();
+	/* Push the next check out if the deadline is still in the past. */
+	spin_lock_irqsave(&rcp->lock, flags);
+	if ((long)(jiffies - rcp->jiffies_stall) >= 0)
+		rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rcp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	long delta;
+
+	delta = jiffies - rcp->jiffies_stall;
+	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+		/* Deadline passed and this CPU is still in rcp->cpumask. */
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rcp);
+
+	} else if (rcp->cur != rcp->completed && delta >= 2) {
+
+		/* They had two jiffies to dump stack, so complain. */
+		print_other_cpu_stall(rcp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Stall detector configured out: both hooks compile to nothing.  Their
+ * signatures must match the enabled versions, which take only rcp.
+ */
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) { }
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp) { }
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -285,6 +366,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 		 */
 		smp_wmb();
 		rcp->cur++;
+		record_gp_stall_check_time(rcp);
 
 		/*
 		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -468,6 +550,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
+	/* Check for CPU stalls, if enabled. */
+	check_cpu_stall(rcp);
+
 	/* This cpu has pending rcu entries and the grace period
 	 * for them has completed.
 	 */
@@ -558,6 +643,9 @@ void rcu_check_callbacks(int cpu, int user)
 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 	memset(rdp, 0, sizeof(*rdp));
 	rdp->curtail = &rdp->curlist;
 	rdp->nxttail = &rdp->nxtlist;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0b50481..9fee969 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
+config RCU_CPU_STALL_DETECTOR
+	bool "Check for stalled CPUs delaying RCU grace periods"
+	depends on CLASSIC_RCU
+	default y
+	help
+	  This option causes RCU to printk information on which
+	  CPUs are delaying the current grace period, but only when
+	  the grace period extends for excessive time periods.
+
+	  Say Y if you want RCU to perform such checks.
+
+	  Say N if you are unsure.
+
 config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL
--
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
scheduler hang on cpu re-hotplug with 2.6.27rc8, Andi Kleen, (Mon Oct 6, 7:12 am)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Andi Kleen, (Mon Oct 6, 4:28 pm)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Paul E. McKenney, (Mon Oct 6, 8:08 pm)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Andi Kleen, (Tue Oct 7, 12:15 am)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Paul E. McKenney, (Tue Oct 7, 8:26 am)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Andi Kleen, (Tue Oct 7, 8:49 am)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Paul E. McKenney, (Tue Oct 7, 9:34 am)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Andi Kleen, (Tue Oct 7, 2:09 pm)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Paul E. McKenney, (Tue Oct 7, 2:22 pm)
[PATCH] rudimentary tracing for Classic RCU, Paul E. McKenney, (Wed Oct 8, 6:08 pm)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Paul E. McKenney, (Wed Oct 8, 6:33 pm)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Andi Kleen, (Wed Oct 8, 9:56 pm)
Re: [PATCH] rudimentary tracing for Classic RCU, Lai Jiangshan, (Wed Oct 8, 11:20 pm)
Re: [PATCH] rudimentary tracing for Classic RCU, Andi Kleen, (Wed Oct 8, 11:55 pm)
Re: [PATCH] rudimentary tracing for Classic RCU, Lai Jiangshan, (Thu Oct 9, 12:05 am)
Re: [PATCH] rudimentary tracing for Classic RCU, KOSAKI Motohiro, (Thu Oct 9, 12:14 am)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Thomas Gleixner, (Thu Oct 9, 12:24 am)
Re: [PATCH] rudimentary tracing for Classic RCU, Lai Jiangshan, (Thu Oct 9, 12:26 am)
Re: [PATCH] rudimentary tracing for Classic RCU, Andi Kleen, (Thu Oct 9, 1:06 am)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Andi Kleen, (Thu Oct 9, 1:22 am)
Re: [PATCH] rudimentary tracing for Classic RCU, Frédéric Weisbecker, (Thu Oct 9, 3:23 am)
Re: [PATCH] rudimentary tracing for Classic RCU, Andi Kleen, (Thu Oct 9, 3:53 am)
Re: [PATCH] rudimentary tracing for Classic RCU, Frédéric Weisbecker, (Thu Oct 9, 4:44 am)
Re: RCU hang on cpu re-hotplug with 2.6.27rc8, Paul E. McKenney, (Thu Oct 9, 4:44 am)
Re: [PATCH] rudimentary tracing for Classic RCU, Paul E. McKenney, (Thu Oct 9, 4:50 am)
Re: [PATCH] rudimentary tracing for Classic RCU, Paul E. McKenney, (Thu Oct 9, 4:50 am)
Re: [PATCH] rudimentary tracing for Classic RCU, Paul E. McKenney, (Thu Oct 9, 4:54 am)
Re: [PATCH] rudimentary tracing for Classic RCU, Frédéric Weisbecker, (Thu Oct 9, 6:01 am)
[PATCH] v2 rudimentary tracing for Classic RCU, Paul E. McKenney, (Thu Oct 9, 8:44 pm)
Re: [PATCH] rudimentary tracing for Classic RCU, Paul E. McKenney, (Fri Oct 10, 4:48 am)
[PATCH] v3 rudimentary tracing for Classic RCU, Paul E. McKenney, (Mon Oct 13, 4:09 pm)
Re: [PATCH] v3 rudimentary tracing for Classic RCU, Lai Jiangshan, (Mon Oct 13, 8:53 pm)
Re: [PATCH] v3 rudimentary tracing for Classic RCU, Paul E. McKenney, (Tue Oct 14, 7:35 am)
Re: [PATCH] v3 rudimentary tracing for Classic RCU, Lai Jiangshan, (Thu Oct 23, 4:12 am)
Re: [PATCH] v3 rudimentary tracing for Classic RCU, Paul E. McKenney, (Sun Oct 26, 2:59 pm)
Re: [PATCH] v3 rudimentary tracing for Classic RCU, Paul E. McKenney, (Mon Oct 27, 2:50 pm)
Re: [PATCH] v3 rudimentary tracing for Classic RCU, Paul E. McKenney, (Mon Oct 27, 4:57 pm)
Re: [PATCH] v3 rudimentary tracing for Classic RCU, Paul E. McKenney, (Tue Oct 28, 6:16 pm)
Re: [PATCH] v3 rudimentary tracing for Classic RCU, Lai Jiangshan, (Tue Oct 28, 6:31 pm)
Re: [PATCH] v3 rudimentary tracing for Classic RCU, Paul E. McKenney, (Thu Oct 30, 8:52 am)