[PATCH] Implement slub fastpath in terms of freebase and freeoffset

!MAILaRCHIVE_VOTE_RePLACE
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]
To: Christoph Lameter <clameter@...>
Cc: Eric Dumazet <dada1@...>, Pekka Enberg <penberg@...>, Torsten Kaiser <just.for.lkml@...>, Ingo Molnar <mingo@...>, Linus Torvalds <torvalds@...>, Linux Kernel Mailing List <linux-kernel@...>
Date: Thursday, February 28, 2008 - 1:55 am

* Christoph Lameter (clameter@sgi.com) wrote:

Yep, I have an experimental patch ready. It would need some thorough
testing though. I seems to run fine on an x86_32 with and without
preemption (therefore with and without the slub cmpxchg_local fastpath).

You should first revert commit 00e962c5408b9f2d0bebd2308673fe982cb9a5fe
(this is the slub fastpath revert itself, currently in 2.6.25-rc3)

And then apply :


Implement slub fastpath in terms of freebase and freeoffset

It allows the cmpxchg_local to detect object re-use by keeping a counter in the
freeoffset MSBs.

freeoffset & PAGE_MASK is the offset of the freelist element within the page.
freeoffset & ~PAGE_MASK is the counter to detect object re-use.
freebase is the address of the page in this the object is located. It is a
multiple of PAGE_SIZE.

Whenever an object is freed in the cpu slab cache, the counter is incremented.
Whenever the alloc/free slow paths are modifying the freeoffset or freebase, the
freeoffset counter is also incremented. It is used to make sure we know if
freebase has been modified in an interrupt nested over the fast path.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Christoph Lameter <clameter@sgi.com>
---
 include/linux/slub_def.h |   35 +++++++++++++++++++
 mm/slub.c                |   84 +++++++++++++++++++++++++++++++++--------------
 2 files changed, 93 insertions(+), 26 deletions(-)

Index: linux-2.6-lttng/include/linux/slub_def.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/slub_def.h	2008-02-27 20:40:48.000000000 -0500
+++ linux-2.6-lttng/include/linux/slub_def.h	2008-02-28 00:17:28.000000000 -0500
@@ -32,7 +32,18 @@ enum stat_item {
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
-	void **freelist;	/* Pointer to first free per cpu object */
+	unsigned long freeoffset;	/*
+					 * Offset of the first free per cpu
+				 	 * object. The MSBs are used as a
+					 * counter which must be incremented
+					 * each time an object is freed and each
+					 * time freebase is changed.
+					 */
+	unsigned long freebase;	/*
+				 * Pointer to the page address
+				 * containing the first free per cpu
+				 * object.
+				 */
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
 	unsigned int offset;	/* Freepointer offset (in word units) */
@@ -42,6 +53,28 @@ struct kmem_cache_cpu {
 #endif
 };
 
+/*
+ * Used when interrupts are disabled, so no memory barriers are needed.
+ */
+static inline void **get_freelist_ptr(struct kmem_cache_cpu *c)
+{
+	return (void **)(c->freebase | (c->freeoffset & PAGE_MASK));
+}
+
+/*
+ * Updates the freebase and freeoffset concurrently with slub fastpath.
+ * We can nest over the fastpath as an interrupt. Therefore, all the freebase
+ * and freeoffset modifications will appear to have happened atomically from the
+ * fastpath point of view. (those are per-cpu variables)
+ */
+static inline void set_freelist_ptr(struct kmem_cache_cpu *c, void **freelist)
+{
+	c->freebase = (unsigned long)freelist & ~PAGE_MASK;
+	c->freeoffset += PAGE_SIZE;
+	c->freeoffset &= ~PAGE_MASK;
+	c->freeoffset |= (unsigned long)freelist & PAGE_MASK;
+}
+
 struct kmem_cache_node {
 	spinlock_t list_lock;	/* Protect partial list and nr_partial */
 	unsigned long nr_partial;
Index: linux-2.6-lttng/mm/slub.c
===================================================================
--- linux-2.6-lttng.orig/mm/slub.c	2008-02-27 20:41:14.000000000 -0500
+++ linux-2.6-lttng/mm/slub.c	2008-02-28 00:20:06.000000000 -0500
@@ -305,7 +305,7 @@ static inline struct kmem_cache_cpu *get
  * Note that SLUB relies on page_mapping returning NULL for pages with bit 0
  * in the mapping set.
  */
-static inline int is_end(void *addr)
+static inline int is_end(long addr)
 {
 	return (unsigned long)addr & PAGE_MAPPING_ANON;
 }
@@ -1411,7 +1411,7 @@ static void deactivate_slab(struct kmem_
 	struct page *page = c->page;
 	int tail = 1;
 
-	if (c->freelist)
+	if (get_freelist_ptr(c))
 		stat(c, DEACTIVATE_REMOTE_FREES);
 	/*
 	 * Merge cpu freelist into freelist. Typically we get here
@@ -1422,14 +1422,14 @@ static void deactivate_slab(struct kmem_
 	 * be called for a debug slab. Then c->freelist may contain
 	 * a dummy pointer.
 	 */
-	while (unlikely(!is_end(c->freelist))) {
+	while (unlikely(!is_end(c->freeoffset))) {
 		void **object;
 
 		tail = 0;	/* Hot objects. Put the slab first */
 
 		/* Retrieve object from cpu_freelist */
-		object = c->freelist;
-		c->freelist = c->freelist[c->offset];
+		object = get_freelist_ptr(c);
+		set_freelist_ptr(c, object[c->offset]);
 
 		/* And put onto the regular freelist */
 		object[c->offset] = page->freelist;
@@ -1534,7 +1534,7 @@ load_freelist:
 		goto debug;
 
 	object = c->page->freelist;
-	c->freelist = object[c->offset];
+	set_freelist_ptr(c, object[c->offset]);
 	c->page->inuse = s->objects;
 	c->page->freelist = c->page->end;
 	c->node = page_to_nid(c->page);
@@ -1636,28 +1636,45 @@ static __always_inline void *slab_alloc(
  */
 
 #ifdef SLUB_FASTPATH
+	unsigned long freeoffset, freebase;
+
 	c = get_cpu_slab(s, raw_smp_processor_id());
 	do {
-		object = c->freelist;
-		if (unlikely(is_end(object) || !node_match(c, node))) {
+		freeoffset = c->freeoffset;
+		/*
+		 * If freebase is changed, freeoffset _must_ have its
+		 * counter incremented.
+		 * read freeoffset before freebase (wrt interrupts).
+		 */
+		barrier();
+		freebase = c->freebase;
+		if (unlikely(is_end(freeoffset) || !node_match(c, node))) {
 			object = __slab_alloc(s, gfpflags, node, addr, c);
 			break;
 		}
+		object = (void **)(freebase | (freeoffset & PAGE_MASK));
 		stat(c, ALLOC_FASTPATH);
-	} while (cmpxchg_local(&c->freelist, object, object[c->offset])
-								!= object);
+		/* No need to increment the MSB counter here, because only
+		 * object free will lead to counter re-use. */
+	} while (cmpxchg_local(&c->freeoffset, freeoffset,
+		(freeoffset + (c->offset * sizeof(void *)))) != freeoffset);
 #else
 	unsigned long flags;
 
 	local_irq_save(flags);
 	c = get_cpu_slab(s, smp_processor_id());
-	if (unlikely(is_end(c->freelist) || !node_match(c, node)))
+	if (unlikely(is_end(c->freeoffset) || !node_match(c, node)))
 
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 
 	else {
-		object = c->freelist;
-		c->freelist = object[c->offset];
+		object = get_freelist_ptr(c);
+		/*
+		 * No need to change freebase or increment freeoffset of
+		 * PAGE_SIZE, so we simply update the freeoffset LSB here.
+		 */
+		c->freeoffset &= ~PAGE_MASK;
+		c->freeoffset |= (unsigned long)object[c->offset] & PAGE_MASK;
 		stat(c, ALLOC_FASTPATH);
 	}
 	local_irq_restore(flags);
@@ -1779,17 +1796,24 @@ static __always_inline void slab_free(st
 	struct kmem_cache_cpu *c;
 
 #ifdef SLUB_FASTPATH
-	void **freelist;
+	unsigned long freeoffset, freebase, newoffset;
 
 	c = get_cpu_slab(s, raw_smp_processor_id());
 	debug_check_no_locks_freed(object, s->objsize);
 	do {
-		freelist = c->freelist;
+		freeoffset = c->freeoffset;
+		/*
+		 * If freebase is changed, freeoffset _must_ have its
+		 * counter incremented.
+		 * read freeoffset before freebase (wrt interrupts).
+		 */
+		barrier();
+		freebase = c->freebase;
 		barrier();
 		/*
 		 * If the compiler would reorder the retrieval of c->page to
-		 * come before c->freelist then an interrupt could
-		 * change the cpu slab before we retrieve c->freelist. We
+		 * come before c->freeoffset and freebase then an interrupt
+		 * could change the cpu slab before we retrieve c->freelist. We
 		 * could be matching on a page no longer active and put the
 		 * object onto the freelist of the wrong slab.
 		 *
@@ -1801,9 +1825,14 @@ static __always_inline void slab_free(st
 			__slab_free(s, page, x, addr, c->offset);
 			break;
 		}
-		object[c->offset] = freelist;
+		object[c->offset] = (void **)(freebase |
+					(freeoffset & PAGE_MASK));
 		stat(c, FREE_FASTPATH);
-	} while (cmpxchg_local(&c->freelist, freelist, object) != freelist);
+		newoffset = freeoffset + PAGE_SIZE;
+		newoffset &= ~PAGE_MASK;
+		newoffset |= (unsigned long)object & PAGE_MASK;
+	} while (cmpxchg_local(&c->freeoffset, freeoffset, newoffset)
+			!= freeoffset);
 #else
 	unsigned long flags;
 
@@ -1811,8 +1840,13 @@ static __always_inline void slab_free(st
 	debug_check_no_locks_freed(object, s->objsize);
 	c = get_cpu_slab(s, smp_processor_id());
 	if (likely(page == c->page && c->node >= 0)) {
-		object[c->offset] = c->freelist;
-		c->freelist = object;
+		object[c->offset] = get_freelist_ptr(c);
+		/*
+		 * No need to update freebase here, so simply update freeoffset.
+		 */
+		c->freeoffset += PAGE_SIZE;
+		c->freeoffset &= ~PAGE_MASK;
+		c->freeoffset |= (unsigned long)object & PAGE_MASK;
 		stat(c, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr, c->offset);
@@ -1995,7 +2029,8 @@ static void init_kmem_cache_cpu(struct k
 			struct kmem_cache_cpu *c)
 {
 	c->page = NULL;
-	c->freelist = (void *)PAGE_MAPPING_ANON;
+	c->freeoffset = PAGE_MAPPING_ANON;
+	c->freebase = 0;
 	c->node = 0;
 	c->offset = s->offset / sizeof(void *);
 	c->objsize = s->objsize;
@@ -2042,8 +2077,7 @@ static struct kmem_cache_cpu *alloc_kmem
 	struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
 
 	if (c)
-		per_cpu(kmem_cache_cpu_free, cpu) =
-				(void *)c->freelist;
+		per_cpu(kmem_cache_cpu_free, cpu) = (void *)get_freelist_ptr(c);
 	else {
 		/* Table overflow: So allocate ourselves */
 		c = kmalloc_node(
@@ -2064,7 +2098,7 @@ static void free_kmem_cache_cpu(struct k
 		kfree(c);
 		return;
 	}
-	c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
+	set_freelist_ptr(c, (void *)per_cpu(kmem_cache_cpu_free, cpu));
 	per_cpu(kmem_cache_cpu_free, cpu) = c;
 }
 

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68
--
Previous message: [thread] [date] [author]
Next message: [thread] [date] [author]

Messages in current thread:
Linux 2.6.25-rc2, Linus Torvalds, (Fri Feb 15, 5:23 pm)
Re: Linux 2.6.25-rc2, Torsten Kaiser, (Sat Feb 16, 5:38 pm)
Re: Linux 2.6.25-rc2, Ingo Molnar, (Tue Feb 19, 2:11 am)
Re: Linux 2.6.25-rc2, Torsten Kaiser, (Tue Feb 19, 2:54 am)
Re: Linux 2.6.25-rc2, Pekka Enberg, (Tue Feb 19, 3:21 am)
Re: Linux 2.6.25-rc2, Mathieu Desnoyers, (Tue Feb 19, 10:02 am)
Re: Linux 2.6.25-rc2, Torsten Kaiser, (Tue Feb 19, 2:39 pm)
Re: Linux 2.6.25-rc2, Eric Dumazet, (Tue Feb 19, 12:27 pm)
Re: Linux 2.6.25-rc2, Christoph Lameter, (Wed Feb 27, 7:32 pm)
Re: Linux 2.6.25-rc2, Mathieu Desnoyers, (Tue Feb 19, 4:03 pm)
Re: Linux 2.6.25-rc2, Christoph Lameter, (Wed Feb 27, 7:34 pm)
[PATCH] Implement slub fastpath in terms of freebase and fre..., Mathieu Desnoyers, (Thu Feb 28, 1:55 am)
Re: [PATCH] Implement slub fastpath in terms of freebase and..., Christoph Lameter, (Thu Feb 28, 3:08 pm)
Re: [PATCH] Implement slub fastpath in terms of freebase and..., Mathieu Desnoyers, (Thu Feb 28, 7:25 pm)
Re: [PATCH] Implement slub fastpath in terms of freebase and..., Christoph Lameter, (Thu Feb 28, 8:57 pm)
Re: [PATCH] Implement slub fastpath in terms of freebase and..., Mathieu Desnoyers, (Thu Feb 28, 9:56 pm)
Re: [PATCH] Implement slub fastpath in terms of freebase and..., Christoph Lameter, (Thu Feb 28, 10:12 pm)
Re: [PATCH] Implement slub fastpath in terms of freebase and..., Mathieu Desnoyers, (Thu Feb 28, 11:32 pm)
Re: [PATCH] Implement slub fastpath in terms of freebase and..., Christoph Lameter, (Fri Feb 29, 1:11 am)
[PATCH] Slub Freeoffset check overflow, Mathieu Desnoyers, (Fri Feb 29, 9:28 am)
[PATCH] Slub Freeoffset check overflow (updated), Mathieu Desnoyers, (Tue Mar 4, 2:17 am)
Re: [PATCH] Slub Freeoffset check overflow (updated), Christoph Lameter, (Tue Mar 4, 3:15 am)
Re: [PATCH] Implement slub fastpath in terms of freebase and..., Mathieu Desnoyers, (Fri Feb 29, 9:03 am)
Re: [PATCH] Implement slub fastpath in terms of freebase and..., Christoph Lameter, (Fri Feb 29, 3:57 pm)
Re: Linux 2.6.25-rc2, Linus Torvalds, (Tue Feb 19, 12:38 pm)
Re: Linux 2.6.25-rc2, Pekka Enberg, (Tue Feb 19, 10:21 am)
Re: Linux 2.6.25-rc2, Mathieu Desnoyers, (Tue Feb 19, 4:08 pm)
Re: Linux 2.6.25-rc2, Christoph Lameter, (Wed Feb 27, 7:32 pm)
Re: Linux 2.6.25-rc2, Andrew Morton, (Wed Feb 27, 9:57 pm)
Re: Linux 2.6.25-rc2, Jiri Kosina, (Thu Feb 28, 7:13 am)
Re: Linux 2.6.25-rc2, Ingo Molnar, (Thu Feb 28, 4:14 am)
Re: Linux 2.6.25-rc2, Alan Cox, (Thu Feb 28, 7:15 am)
Re: Linux 2.6.25-rc2, Christoph Lameter, (Wed Feb 27, 10:43 pm)
Re: Linux 2.6.25-rc2, Pekka Enberg, (Tue Feb 19, 10:38 am)
Re: Linux 2.6.25-rc2, Linus Torvalds, (Tue Feb 19, 12:20 pm)
Re: Linux 2.6.25-rc2, Torsten Kaiser, (Tue Feb 19, 3:27 pm)
Re: Linux 2.6.25-rc2, Ingo Molnar, (Tue Feb 19, 12:45 pm)
Re: Linux 2.6.25-rc2, Ingo Molnar, (Tue Feb 19, 12:48 pm)
Re: Linux 2.6.25-rc2, Ingo Molnar, (Tue Feb 19, 10:55 am)
Re: Linux 2.6.25-rc2, Pekka Enberg, (Tue Feb 19, 11:52 am)
Re: Linux 2.6.25-rc2, Zhang, Yanmin, (Tue Feb 19, 8:36 pm)
Re: Linux 2.6.25-rc2, Zhang, Yanmin, (Tue Feb 19, 10:08 pm)
Re: Linux 2.6.25-rc2, Zhang, Yanmin, (Wed Feb 20, 2:53 am)
Re: Linux 2.6.25-rc2, Pekka Enberg, (Wed Feb 20, 3:10 am)
Re: Linux 2.6.25-rc2, Ingo Molnar, (Tue Feb 19, 10:57 am)
Re: Linux 2.6.25-rc2, Pekka Enberg, (Tue Feb 19, 11:54 am)
Re: Linux 2.6.25-rc2, Ingo Molnar, (Tue Feb 19, 6:27 am)
Re: Linux 2.6.25-rc2, Mathieu Desnoyers, (Tue Feb 19, 9:02 am)
Re: Linux 2.6.25-rc2, Ingo Molnar, (Tue Feb 19, 10:00 am)
Re: Linux 2.6.25-rc2, Pekka Enberg, (Tue Feb 19, 6:45 am)
Re: Linux 2.6.25-rc2, Linus Torvalds, (Mon Feb 18, 7:54 pm)
Re: Linux 2.6.25-rc2, Torsten Kaiser, (Tue Feb 19, 2:44 am)
Re: Linux 2.6.25-rc2, Rafael J. Wysocki, (Sun Feb 17, 4:25 pm)
Re: Linux 2.6.25-rc2, Torsten Kaiser, (Sun Feb 17, 5:32 pm)
Linux 2.6.25-rc2 regression: LVM cannot find volume group, Tilman Schmidt, (Sat Feb 16, 3:14 pm)
Re: Linux 2.6.25-rc2 regression: LVM cannot find volume group, Alasdair G Kergon, (Mon Feb 18, 9:53 pm)
Re: Linux 2.6.25-rc2, Jan Engelhardt, (Sat Feb 16, 12:52 pm)
Re: [BUG] Linux 2.6.25-rc2 - Regression from 2.6.24-rc1-git1..., Rafael J. Wysocki, (Sun Feb 17, 4:08 pm)
Re: [BUG] Linux 2.6.25-rc2 - Regression from 2.6.24-rc1-git1..., KAMEZAWA Hiroyuki, (Tue Feb 19, 4:04 am)
Re: [BUG] Linux 2.6.25-rc2 - Regression from 2.6.24-rc1-git1..., KAMEZAWA Hiroyuki, (Tue Feb 19, 5:02 am)
Re: [BUG] Linux 2.6.25-rc2 - Regression from 2.6.24-rc1-git1..., KAMEZAWA Hiroyuki, (Tue Feb 19, 4:47 am)
Re: [BUG] Linux 2.6.25-rc2 - Regression from 2.6.24-rc1-git1..., KAMEZAWA Hiroyuki, (Tue Feb 19, 5:07 am)
[BUG] Linux 2.6.25-rc2 - Kernel Ooops while running dbench, Kamalesh Babulal, (Sat Feb 16, 1:44 am)
Re: Linux 2.6.25-rc2, Rafael J. Wysocki, (Fri Feb 15, 10:08 pm)