Re: [rfc] direct IO submission and completion scalability issues

!MAILaRCHIVE_VOTE_RePLACE
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
To: Siddha, Suresh B <suresh.b.siddha@...>
Cc: <linux-kernel@...>, <arjan@...>, <mingo@...>, <ak@...>, <jens.axboe@...>, <James.Bottomley@...>, <andrea@...>, <clameter@...>, <akpm@...>, <andrew.vasquez@...>, <willy@...>, Zach Brown <zach.brown@...>
Date: Sunday, February 3, 2008 - 5:52 am

On Fri, Jul 27, 2007 at 06:21:28PM -0700, Suresh B wrote:

Hi guys,

Just had another idea for how we might do this: migrate the completions
out to the submitting CPUs rather than migrate submission into the
completing CPU.

I've got a basic patch that passes some stress testing. It seems fairly
simple to do at the block layer, and the bulk of the patch involves
introducing a scalable smp_call_function for it.

Now it could be optimised more by looking at batching up IPIs or
optimising the call function path or even migrating the completion event
at a different level...

However, this is a first cut. It actually seems like it might be taking
slightly more CPU to process block IO (~0.2%)... however, this is on my
dual core system that shares an LLC (last-level cache), which means that
there are very few cache benefits to the migration, but non-zero overhead.
So on multisocket systems it will hopefully get into positive territory.

---

Index: linux-2.6/arch/x86/kernel/smp_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smp_64.c
+++ linux-2.6/arch/x86/kernel/smp_64.c
@@ -321,6 +321,99 @@ void unlock_ipi_call_lock(void)
 	spin_unlock_irq(&call_lock);
 }
 
+struct call_single_data {
+	struct list_head list;
+	void (*func) (void *info);
+	void *info;
+	int wait;
+};
+
+struct call_single_queue {
+	spinlock_t lock;
+	struct list_head list;
+};
+static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
+
+int __cpuinit init_smp_call(void)
+{
+	int i;
+
+	for_each_cpu_mask(i, cpu_possible_map) {
+		spin_lock_init(&per_cpu(call_single_queue, i).lock);
+		INIT_LIST_HEAD(&per_cpu(call_single_queue, i).list);
+	}
+	return 0;
+}
+core_initcall(init_smp_call);
+
+/*
+ * Queue func(info) for execution on a single target CPU and, if its queue
+ * was empty, send that CPU a CALL_FUNCTION_SINGLE_VECTOR IPI to process it.
+ */
+int smp_call_function_fast(int cpu, void (*func)(void *), void *info,
+				    int wait)
+{
+	struct call_single_data *data;
+	struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
+	cpumask_t mask = cpumask_of_cpu(cpu);
+	int ipi;
+
+	data = kmalloc(sizeof(struct call_single_data), GFP_ATOMIC);
+	data->func = func;
+	data->info = info;
+	data->wait = wait;
+
+	spin_lock_irq(&dst->lock);
+	ipi = list_empty(&dst->list);
+	list_add_tail(&data->list, &dst->list);
+	spin_unlock_irq(&dst->lock);
+
+	if (ipi)
+		send_IPI_mask(mask, CALL_FUNCTION_SINGLE_VECTOR);
+
+	if (wait) {
+		/* Wait for response */
+		while (data->wait)
+			cpu_relax();
+		kfree(data);
+	}
+
+	return 0;
+}
+
+asmlinkage void smp_call_function_fast_interrupt(void)
+{
+	struct call_single_queue *q;
+	unsigned long flags;
+	LIST_HEAD(list);
+
+	ack_APIC_irq();
+
+	q = &__get_cpu_var(call_single_queue);
+	spin_lock_irqsave(&q->lock, flags);
+	list_replace_init(&q->list, &list);
+	spin_unlock_irqrestore(&q->lock, flags);
+
+	exit_idle();
+	irq_enter();
+	while (!list_empty(&list)) {
+		struct call_single_data *data;
+
+		data = list_entry(list.next, struct call_single_data, list);
+		list_del(&data->list);
+
+		data->func(data->info);
+		if (data->wait) {
+			smp_mb();
+			data->wait = 0;
+		} else {
+			kfree(data);
+		}
+	}
+	add_pda(irq_call_count, 1);
+	irq_exit();
+}
+
 /*
  * this function sends a 'generic call function' IPI to all other CPU
  * of the system defined in the mask.
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c
+++ linux-2.6/block/blk-core.c
@@ -1604,6 +1604,13 @@ static int __end_that_request_first(stru
 	return 1;
 }
 
+static void blk_done_softirq_other(void *data)
+{
+	struct request *rq = data;
+
+	blk_complete_request(rq);
+}
+
 /*
  * splice the completion data to a local structure and hand off to
  * process_completion_queue() to complete the requests
@@ -1622,7 +1629,15 @@ static void blk_done_softirq(struct soft
 
 		rq = list_entry(local_list.next, struct request, donelist);
 		list_del_init(&rq->donelist);
-		rq->q->softirq_done_fn(rq);
+		if (rq->submission_cpu != smp_processor_id()) {
+			/*
+			 * Could batch up IPIs here, but we should measure how
+			 * often blk_done_softirq gets a large batch...
+			 */
+			smp_call_function_fast(rq->submission_cpu,
+						blk_done_softirq_other, rq, 0);
+		} else
+			rq->q->softirq_done_fn(rq);
 	}
 }
 
Index: linux-2.6/include/asm-x86/hw_irq_64.h
===================================================================
--- linux-2.6.orig/include/asm-x86/hw_irq_64.h
+++ linux-2.6/include/asm-x86/hw_irq_64.h
@@ -68,8 +68,7 @@
 #define ERROR_APIC_VECTOR	0xfe
 #define RESCHEDULE_VECTOR	0xfd
 #define CALL_FUNCTION_VECTOR	0xfc
-/* fb free - please don't readd KDB here because it's useless
-   (hint - think what a NMI bit does to a vector) */
+#define CALL_FUNCTION_SINGLE_VECTOR	0xfb
 #define THERMAL_APIC_VECTOR	0xfa
 #define THRESHOLD_APIC_VECTOR   0xf9
 /* f8 free */
@@ -102,6 +101,7 @@ void spurious_interrupt(void);
 void error_interrupt(void);
 void reschedule_interrupt(void);
 void call_function_interrupt(void);
+void call_function_fast_interrupt(void);
 void irq_move_cleanup_interrupt(void);
 void invalidate_interrupt0(void);
 void invalidate_interrupt1(void);
Index: linux-2.6/include/linux/smp.h
===================================================================
--- linux-2.6.orig/include/linux/smp.h
+++ linux-2.6/include/linux/smp.h
@@ -53,6 +53,7 @@ extern void smp_cpus_done(unsigned int m
  * Call a function on all other processors
  */
 int smp_call_function(void(*func)(void *info), void *info, int retry, int wait);
+int smp_call_function_fast(int cpuid, void(*func)(void *info), void *info, int wait);
 
 int smp_call_function_single(int cpuid, void (*func) (void *info), void *info,
 				int retry, int wait);
@@ -92,6 +93,11 @@ static inline int up_smp_call_function(v
 }
 #define smp_call_function(func, info, retry, wait) \
 			(up_smp_call_function(func, info))
+static inline int smp_call_function_fast(int cpuid, void(*func)(void *info), void *info, int wait)
+{
+	return 0;
+}
+
 #define on_each_cpu(func,info,retry,wait)	\
 	({					\
 		local_irq_disable();		\
Index: linux-2.6/block/elevator.c
===================================================================
--- linux-2.6.orig/block/elevator.c
+++ linux-2.6/block/elevator.c
@@ -648,6 +648,8 @@ void elv_insert(struct request_queue *q,
 void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		       int plug)
 {
+	rq->submission_cpu = smp_processor_id();
+
 	if (q->ordcolor)
 		rq->cmd_flags |= REQ_ORDERED_COLOR;
 
Index: linux-2.6/include/linux/blkdev.h
===================================================================
--- linux-2.6.orig/include/linux/blkdev.h
+++ linux-2.6/include/linux/blkdev.h
@@ -208,6 +208,8 @@ struct request {
 
 	int ref_count;
 
+	int submission_cpu;
+
 	/*
 	 * when request is used as a packet command carrier
 	 */
Index: linux-2.6/arch/x86/kernel/entry_64.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/entry_64.S
+++ linux-2.6/arch/x86/kernel/entry_64.S
@@ -696,6 +696,9 @@ END(invalidate_interrupt\num)
 ENTRY(call_function_interrupt)
 	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
 END(call_function_interrupt)
+ENTRY(call_function_fast_interrupt)
+	apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_fast_interrupt
+END(call_function_fast_interrupt)
 ENTRY(irq_move_cleanup_interrupt)
 	apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
 END(irq_move_cleanup_interrupt)
Index: linux-2.6/arch/x86/kernel/i8259_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/i8259_64.c
+++ linux-2.6/arch/x86/kernel/i8259_64.c
@@ -493,6 +493,7 @@ void __init native_init_IRQ(void)
 
 	/* IPI for generic function call */
 	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+	set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_fast_interrupt);
 
 	/* Low priority IPI to cleanup after moving an irq */
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
Index: linux-2.6/include/asm-x86/mach-default/entry_arch.h
===================================================================
--- linux-2.6.orig/include/asm-x86/mach-default/entry_arch.h
+++ linux-2.6/include/asm-x86/mach-default/entry_arch.h
@@ -13,6 +13,7 @@
 BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
 BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
+BUILD_INTERRUPT(call_function_fast_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 #endif
 
 /*
--
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
[rfc] direct IO submission and completion scalability issues, Siddha, Suresh B, (Fri Jul 27, 9:21 pm)
Re: [rfc] direct IO submission and completion scalability is..., Nick Piggin, (Sun Feb 3, 5:52 am)
Re: [rfc] direct IO submission and completion scalability is..., Christoph Lameter, (Mon Jul 30, 2:20 pm)