This patch-set makes part of the mm a lot more preemptible. It converts i_mmap_lock and anon_vma->lock to mutexes and makes mmu_gather fully preemptible. The main motivation was making mm_take_all_locks() preemptible, since it appears people are nesting hundreds of spinlocks there. The side-effects are that can finally make mmu_gather preemptible, something which lots of people have wanted to do for a long time. It also gets us anon_vma refcounting, which seems to result in a nice cleanup of the anon_vma lifetime rules wrt KSM and compaction. This patch-set it build and boot-tested on x86_64 (a previous version was also tested on Dave's Niagra2 machines, and I suppose s390 did too when Martin provided the conversion patch for his arch). There are no known architectures left unconverted, although some arch code never did see a compiler (superh and ia64 come to mind, I'll try and update my toolchains next week). Yanmin ran the last posting through the comprehensive Intel test farm and didn't find any regressions. ( Not included in this posting are the 4 Sparc64 patches that implement gup_fast, those can be applied separately after this series gets anywhere. ) Full series (including the Sparc64 gup_fast bits) also available in -git form from (against Linus' tree as of about an hour ago): git://git.kernel.org/pub/scm/linux/kernel/git/peterz/linux-2.6-mmu_preempt.git mmu_preempt Do people feel its ready to get added to -next? --
This patch changes the anon_vma refcount to be 0 when the object is
free. It does this by adding 1 ref to being in use in the anon_vma
structure (iow. the anon_vma->head list is not empty).
This allows a simpler release scheme without having to check both the
refcount and the list as well as avoids taking a ref for each entry
on the list.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/rmap.h | 11 +++++--
mm/ksm.c | 4 --
mm/rmap.c | 79 ++++++++++++++++++---------------------------------
3 files changed, 38 insertions(+), 56 deletions(-)
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h
+++ linux-2.6/include/linux/rmap.h
@@ -73,7 +73,13 @@ static inline void get_anon_vma(struct a
atomic_inc(&anon_vma->refcount);
}
-void put_anon_vma(struct anon_vma *);
+void __put_anon_vma(struct anon_vma *anon_vma);
+
+static inline void put_anon_vma(struct anon_vma *anon_vma)
+{
+ if (atomic_dec_and_test(&anon_vma->refcount))
+ __put_anon_vma(anon_vma);
+}
static inline struct anon_vma *page_anon_vma(struct page *page)
{
@@ -116,7 +122,6 @@ void unlink_anon_vmas(struct vm_area_str
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
void __anon_vma_link(struct vm_area_struct *);
-void anon_vma_free(struct anon_vma *);
static inline void anon_vma_merge(struct vm_area_struct *vma,
struct vm_area_struct *next)
@@ -125,6 +130,8 @@ static inline void anon_vma_merge(struct
unlink_anon_vmas(next);
}
+struct anon_vma *page_get_anon_vma(struct page *page);
+
/*
* rmap interfaces called when adding or removing pte of page
*/
Index: linux-2.6/mm/ksm.c
===================================================================
--- linux-2.6.orig/mm/ksm.c
+++ linux-2.6/mm/ksm.c
@@ -309,9 +309,7 @@ static void ...FWIW, Acked-by: Pekka Enberg <penberg@kernel.org> --
Convert page_lock_anon_vma() over to use refcounts. This is
done for each of convertion of anon_vma from spinlock to mutex.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
mm/rmap.c | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -332,9 +332,9 @@ void __init anon_vma_init(void)
* that the anon_vma pointer from page->mapping is valid if there is a
* mapcount, we can dereference the anon_vma after observing those.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_get_anon_vma(struct page *page)
{
- struct anon_vma *anon_vma;
+ struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
rcu_read_lock();
@@ -345,17 +345,28 @@ struct anon_vma *page_lock_anon_vma(stru
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- anon_vma_lock(anon_vma);
- return anon_vma;
+ if (!atomic_inc_not_zero(&anon_vma->refcount))
+ anon_vma = NULL;
out:
rcu_read_unlock();
- return NULL;
+
+ return anon_vma;
+}
+
+struct anon_vma *page_lock_anon_vma(struct page *page)
+{
+ struct anon_vma *anon_vma = page_get_anon_vma(page);
+
+ if (anon_vma)
+ anon_vma_lock(anon_vma);
+
+ return anon_vma;
}
void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
anon_vma_unlock(anon_vma);
- rcu_read_unlock();
+ put_anon_vma(anon_vma);
}
/*
--
Fix up the sh mmu_gahter code to conform to the new API.
Cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/sh/include/asm/tlb.h | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
Index: linux-2.6/arch/sh/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/sh/include/asm/tlb.h
+++ linux-2.6/arch/sh/include/asm/tlb.h
@@ -23,8 +23,6 @@ struct mmu_gather {
unsigned long start, end;
};
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
-
static inline void init_tlb_gather(struct mmu_gather *tlb)
{
tlb->start = TASK_SIZE;
@@ -36,17 +34,13 @@ static inline void init_tlb_gather(struc
}
}
-static inline struct mmu_gather *
-tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
+static inline void
+tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush)
{
- struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
-
tlb->mm = mm;
tlb->fullmm = full_mm_flush;
init_tlb_gather(tlb);
-
- return tlb;
}
static inline void
@@ -57,8 +51,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
/* keep the page table cache within bounds */
check_pgt_cache();
-
- put_cpu_var(mmu_gathers);
}
static inline void
--
Provide the mutex_lock_nest_lock() annotation.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/lockdep.h | 3 +++
include/linux/mutex.h | 9 +++++++++
kernel/mutex.c | 25 +++++++++++++++++--------
3 files changed, 29 insertions(+), 8 deletions(-)
Index: linux-2.6/include/linux/lockdep.h
===================================================================
--- linux-2.6.orig/include/linux/lockdep.h
+++ linux-2.6/include/linux/lockdep.h
@@ -492,12 +492,15 @@ static inline void print_irqtrace_events
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# ifdef CONFIG_PROVE_LOCKING
# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, NULL, i)
+# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i)
# else
# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, NULL, i)
+# define mutex_acquire_nest(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i)
# endif
# define mutex_release(l, n, i) lock_release(l, n, i)
#else
# define mutex_acquire(l, s, t, i) do { } while (0)
+# define mutex_acquire_nest(l, s, t, n, i) do { } while (0)
# define mutex_release(l, n, i) do { } while (0)
#endif
Index: linux-2.6/include/linux/mutex.h
===================================================================
--- linux-2.6.orig/include/linux/mutex.h
+++ linux-2.6/include/linux/mutex.h
@@ -124,6 +124,7 @@ static inline int mutex_is_locked(struct
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
unsigned int subclass);
extern int __must_check mutex_lock_killable_nested(struct mutex *lock,
@@ -132,6 +133,13 @@ extern int __must_check mutex_lock_killa
#define mutex_lock(lock) mutex_lock_nested(lock, 0)
#define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0)
...Rework the sparc mmu_gather usage to conform to the new world order :-)
Sparc mmu_gather does two things:
- tracks vaddrs to unhash
- tracks pages to free
Split these two things like powerpc has done and keep the vaddrs
in per-cpu data structures and flush them on context switch.
The remaining bits can then use the generic mmu_gather.
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/sparc/include/asm/pgalloc_64.h | 3 +
arch/sparc/include/asm/tlb_64.h | 91 ++---------------------------------
arch/sparc/include/asm/tlbflush_64.h | 12 +++-
arch/sparc/mm/tlb.c | 42 +++++++++-------
arch/sparc/mm/tsb.c | 15 +++--
5 files changed, 51 insertions(+), 112 deletions(-)
Index: linux-2.6/arch/sparc/include/asm/pgalloc_64.h
===================================================================
--- linux-2.6.orig/arch/sparc/include/asm/pgalloc_64.h
+++ linux-2.6/arch/sparc/include/asm/pgalloc_64.h
@@ -78,4 +78,7 @@ static inline void check_pgt_cache(void)
quicklist_trim(0, NULL, 25, 16);
}
+#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte)
+#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd)
+
#endif /* _SPARC64_PGALLOC_H */
Index: linux-2.6/arch/sparc/include/asm/tlb_64.h
===================================================================
--- linux-2.6.orig/arch/sparc/include/asm/tlb_64.h
+++ linux-2.6/arch/sparc/include/asm/tlb_64.h
@@ -7,66 +7,11 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
-#define TLB_BATCH_NR 192
-
-/*
- * For UP we don't need to worry about TLB flush
- * and page free order so much..
- */
-#ifdef CONFIG_SMP
- #define FREE_PTE_NR 506
- #define tlb_fast_mode(bp) ((bp)->pages_nr == ~0U)
-#else
- #define FREE_PTE_NR 1
- #define tlb_fast_mode(bp) 1
-#endif
-
-struct mmu_gather {
- struct mm_struct *mm;
- unsigned int pages_nr;
- unsigned int need_flush;
- unsigned int ...Usable for lock-breaks and such.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/mutex.h | 5 +++++
1 file changed, 5 insertions(+)
Index: linux-2.6/include/linux/mutex.h
===================================================================
--- linux-2.6.orig/include/linux/mutex.h
+++ linux-2.6/include/linux/mutex.h
@@ -118,6 +118,11 @@ static inline int mutex_is_locked(struct
return atomic_read(&lock->count) != 1;
}
+static inline int mutex_is_contended(struct mutex *lock)
+{
+ return atomic_read(&lock->count) < 0;
+}
+
/*
* See kernel/mutex.c for detailed documentation of these APIs.
* Also see Documentation/mutex-design.txt.
--
A slightly more verbose comment to go along with the trickery in
page_lock_anon_vma().
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mel Gorman <mel@csn.ul.ie>
LKML-Reference: <1271158226.4807.1107.camel@twins>
---
mm/rmap.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -311,8 +311,22 @@ void __init anon_vma_init(void)
}
/*
- * Getting a lock on a stable anon_vma from a page off the LRU is
- * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+ * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
+ *
+ * Since there is no serialization what so ever against page_remove_rmap()
+ * the best this function can do is return a locked anon_vma that might
+ * have been relevant to this page.
+ *
+ * The page might have been remapped to a different anon_vma or the anon_vma
+ * returned may already be freed (and even reused).
+ *
+ * All users of this function must be very careful when walking the anon_vma
+ * chain and verify that the page in question is indeed mapped in it
+ * [ something equivalent to page_mapped_in_vma() ].
+ *
+ * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
+ * that the anon_vma pointer from page->mapping is valid if there is a
+ * mapcount, we can dereference the anon_vma after observing those.
*/
struct anon_vma *page_lock_anon_vma(struct page *page)
{
--
Make mmu_gather preemptible by using a small on stack list and use
an option allocation to speed things up.
Preemptible mmu_gather is desired in general and usable once
i_mmap_lock becomes a mutex. Doing it before the mutex conversion
saves us from having to rework the code by moving the mmu_gather
bits inside the i_mmap_lock.
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
fs/exec.c | 10 ++++-----
include/asm-generic/tlb.h | 51 +++++++++++++++++++++++++++++-----------------
include/linux/mm.h | 2 -
mm/memory.c | 27 +++++-------------------
mm/mmap.c | 16 +++++++-------
5 files changed, 53 insertions(+), 53 deletions(-)
Index: linux-2.6/fs/exec.c
===================================================================
--- linux-2.6.orig/fs/exec.c
+++ linux-2.6/fs/exec.c
@@ -504,7 +504,7 @@ static int shift_arg_pages(struct vm_are
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
BUG_ON(new_start > new_end);
@@ -530,12 +530,12 @@ static int shift_arg_pages(struct vm_are
return -ENOMEM;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
- free_pgd_range(tlb, new_end, old_end, new_end,
+ free_pgd_range(&tlb, new_end, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
} else {
/*
@@ -544,10 +544,10 @@ static int shift_arg_pages(struct vm_are
* have constraints on va-space that make this illegal ...Instead of using a single batch (the small on-stack, or an allocated
page), try and extend the batch every time it runs out and only flush
once either the extend fails or we're done.
Requested-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/asm-generic/tlb.h | 122 ++++++++++++++++++++++++++++++----------------
1 file changed, 82 insertions(+), 40 deletions(-)
Index: linux-2.6/include/asm-generic/tlb.h
===================================================================
--- linux-2.6.orig/include/asm-generic/tlb.h
+++ linux-2.6/include/asm-generic/tlb.h
@@ -17,16 +17,6 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
-/*
- * For UP we don't need to worry about TLB flush
- * and page free order so much..
- */
-#ifdef CONFIG_SMP
- #define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)
-#else
- #define tlb_fast_mode(tlb) 1
-#endif
-
#ifdef HAVE_ARCH_RCU_TABLE_FREE
/*
* Semi RCU freeing of the page directories.
@@ -70,31 +60,66 @@ extern void tlb_remove_table(struct mmu_
#endif
+struct mmu_gather_batch {
+ struct mmu_gather_batch *next;
+ unsigned int nr;
+ unsigned int max;
+ struct page *pages[0];
+};
+
+#define MAX_GATHER_BATCH \
+ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))
+
/* struct mmu_gather is an opaque type used by the mm code for passing around
* any data needed by arch specific code for tlb_remove_page.
*/
struct mmu_gather {
struct mm_struct *mm;
- unsigned int nr; /* set to ~0U means fast mode */
- unsigned int max; /* nr < max */
- unsigned int need_flush;/* Really unmapped some ptes? */
- unsigned int fullmm; /* non-zero means full mm flush */
- struct page **pages;
- struct page *local[8];
+ unsigned int need_flush : 1, /* Did free PTEs */
+ fast_mode : 1; /* No batching */
+ unsigned int fullmm; /* Flush full mm */
+
+ struct mmu_gather_batch *active;
+ struct mmu_gather_batch local;
+ struct page *__pages[8];
...Fix up powerpc to the new mmu_gather stuffs.
PPC has an extra batching queue to RCU free the actual pagetable
allocations, use the ARCH extentions for that for now.
For the ppc64_tlb_batch, which tracks the vaddrs to unhash from the
hardware hash-table, keep using per-cpu arrays but flush on context
switch and use a TLF bit to track the laxy_mmu state.
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/powerpc/include/asm/pgalloc.h | 4 ++--
arch/powerpc/include/asm/thread_info.h | 2 ++
arch/powerpc/include/asm/tlb.h | 10 ++++++++++
arch/powerpc/kernel/process.c | 19 +++++++++++++++++++
arch/powerpc/mm/pgtable.c | 14 ++++----------
arch/powerpc/mm/tlb_hash32.c | 2 +-
arch/powerpc/mm/tlb_hash64.c | 12 +++++++-----
arch/powerpc/mm/tlb_nohash.c | 2 +-
8 files changed, 46 insertions(+), 19 deletions(-)
Index: linux-2.6/arch/powerpc/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/powerpc/include/asm/tlb.h
+++ linux-2.6/arch/powerpc/include/asm/tlb.h
@@ -28,6 +28,16 @@
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
+#define HAVE_ARCH_MMU_GATHER 1
+
+struct pte_freelist_batch;
+
+struct arch_mmu_gather {
+ struct pte_freelist_batch *batch;
+};
+
+#define ARCH_MMU_GATHER_INIT (struct arch_mmu_gather){ .batch = NULL, }
+
extern void tlb_flush(struct mmu_gather *tlb);
/* Get the generic bits... */
Index: linux-2.6/arch/powerpc/kernel/process.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/process.c
+++ linux-2.6/arch/powerpc/kernel/process.c
@@ -393,6 +393,9 @@ struct task_struct *__switch_to(struct t
struct thread_struct *new_thread, *old_thread;
unsigned long flags;
struct task_struct *last;
+#ifdef CONFIG_PPC64
+ struct ...Unfortunately, I think this is broken... Here, you are coming out of _switch() which will have swapped the stack and non-volatile registers to the state they were in when the new task was originally switched-out. Thus "new" which is a local variable (either on stack or in a non-volatile register) will now refer to whatever was the next task back then. I suppose that's what's causing the similar patch you have in -rt to fail btw. This could be fixed easily by using "current" instead. However, that doesn't seem necessary at all, at least not for !-rt, or unless you broke something that I would need to look at very closely then :-) IE. Enable/disable the batch only within "lazy_mmu_mode" sections. We do that in large part because we do not want non-flushed pages to exist outside of the pte spinlock. The reason is that if we let that happen, a small possibility exist for our MMU hash page handling to try to insert a duplicate entry for a given PTE into the hash table, which is basically fatal. Thus, we only exist during that lazy period, which means with a lock held. Hence we can't schedule and the changes you do regarding get/put_cpu_var are unnecessary. Another "trick" here btw is that fork() is currently not using a batch, but with our technique, we do get batching there too. So unless something else is broken that makes the above not true anymore, which would be a concern, most of the changes you did to the flush batch are unnecessary for your preemptible mmu_gather on non-rt kernels. Of course, with -rt and the pte lock becoming a mutex, all of your changes do become necessary (and I suppose that's where they come from). Now, those changes won't technically hurt on a non-rt kernel, tho they will add a tiny bit of overhead. I'll see if I can measure it. Cheers, Ben. --
And here's a fix. Using current_thread_info() everywhere is better
anyways, you are not supposed to touch the local_flags within anything
but "current" so we may as well make it explicit.
Feel free to fold that into your -rt patch and this one if you decide
not to drop that whole part.
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 6ee4b09..6e8b6de 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -517,12 +517,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
batch = &__get_cpu_var(ppc64_tlb_batch);
if (batch->active) {
- task_thread_info(prev)->local_flags |= _TLF_LAZY_MMU;
+ current_thread_info()->local_flags |= _TLF_LAZY_MMU;
if (batch->index)
__flush_tlb_pending(batch);
batch->active = 0;
}
-#endif
+#endif /* CONFIG_PPC64 */
local_irq_save(flags);
@@ -539,12 +539,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
last = _switch(old_thread, new_thread);
#ifdef CONFIG_PPC64
- if (task_thread_info(new)->local_flags & _TLF_LAZY_MMU) {
- task_thread_info(new)->local_flags &= ~_TLF_LAZY_MMU;
+ if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
+ current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
batch = &__get_cpu_var(ppc64_tlb_batch);
batch->active = 1;
}
-#endif
+#endif /* CONFIG_PPC64 */
local_irq_restore(flags);
--
Indeed, those parts are only strictly needed for -rt, I was hoping the tiny bit of overhead wouldn't be too bad and we could keep it. If not I guess we should put those two bits in __switch_to() under Thanks! And yeah I'd prefer to keep it to reduce Thomas' burden if possible. --
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Adapt the stand-alone s390 mmu_gather implementation to the new
preemptible mmu_gather interface.
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100409105800.0d0f93ac@mschwide.boeblingen.de.ibm.com>
---
arch/s390/include/asm/tlb.h | 43 +++++++++++++++++++++++++------------------
1 file changed, 25 insertions(+), 18 deletions(-)
Index: linux-2.6/arch/s390/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/s390/include/asm/tlb.h
+++ linux-2.6/arch/s390/include/asm/tlb.h
@@ -28,44 +28,50 @@
#include <asm/smp.h>
#include <asm/tlbflush.h>
-#ifndef CONFIG_SMP
-#define TLB_NR_PTRS 1
-#else
-#define TLB_NR_PTRS 508
-#endif
-
struct mmu_gather {
struct mm_struct *mm;
unsigned int fullmm;
unsigned int nr_ptes;
unsigned int nr_pxds;
- void *array[TLB_NR_PTRS];
+ unsigned int max;
+ void **array;
+ void *local[8];
};
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
-
-static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm,
- unsigned int full_mm_flush)
+static inline void __tlb_alloc_pages(struct mmu_gather *tlb)
{
- struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+ unsigned long addr = __get_free_pages(GFP_ATOMIC, 0);
+
+ if (addr) {
+ tlb->array = (void *) addr;
+ tlb->max = PAGE_SIZE / sizeof(void *);
+ }
+}
+static inline void tlb_gather_mmu(struct mmu_gather *tlb,
+ struct mm_struct *mm,
+ unsigned int full_mm_flush)
+{
tlb->mm = mm;
+ tlb->max = ARRAY_SIZE(tlb->local);
+ tlb->array = tlb->local;
tlb->fullmm = full_mm_flush;
- tlb->nr_ptes = 0;
- tlb->nr_pxds = TLB_NR_PTRS;
if (tlb->fullmm)
__tlb_flush_mm(mm);
- return tlb;
+ else
+ __tlb_alloc_pages(tlb);
+ tlb->nr_ptes = 0;
+ tlb->nr_pxds = tlb->max;
}
static inline void tlb_flush_mmu(struct mmu_gather *tlb,
unsigned long ...Fix up the ia64 mmu_gather code to conform to the new API.
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/ia64/include/asm/tlb.h | 38 +++++++++++++++++++++++++-------------
1 file changed, 25 insertions(+), 13 deletions(-)
Index: linux-2.6/arch/ia64/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/ia64/include/asm/tlb.h
+++ linux-2.6/arch/ia64/include/asm/tlb.h
@@ -47,21 +47,21 @@
#include <asm/machvec.h>
#ifdef CONFIG_SMP
-# define FREE_PTE_NR 2048
# define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)
#else
-# define FREE_PTE_NR 0
# define tlb_fast_mode(tlb) (1)
#endif
struct mmu_gather {
struct mm_struct *mm;
unsigned int nr; /* == ~0U => fast mode */
+ unsigned int max;
unsigned char fullmm; /* non-zero means full mm flush */
unsigned char need_flush; /* really unmapped some PTEs? */
unsigned long start_addr;
unsigned long end_addr;
- struct page *pages[FREE_PTE_NR];
+ struct page **pages;
+ struct page *local[8];
};
struct ia64_tr_entry {
@@ -90,9 +90,6 @@ extern struct ia64_tr_entry *ia64_idtrs[
#define RR_RID_MASK 0x00000000ffffff00L
#define RR_TO_RID(val) ((val >> 8) & 0xffffff)
-/* Users of the generic TLB shootdown code must declare this storage space. */
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
-
/*
* Flush the TLB for address range START to END and, if not in fast mode, release the
* freed pages that where gathered up to this point.
@@ -147,15 +144,25 @@ ia64_tlb_flush_mmu (struct mmu_gather *t
}
}
-/*
- * Return a pointer to an initialized struct mmu_gather.
- */
-static inline struct mmu_gather *
-tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
+static inline void __tlb_alloc_pages(struct mmu_gather *tlb)
+{
+ unsigned long addr = __get_free_pages(GFP_ATOMIC, 0);
+
+ if (addr) {
+ tlb->pages = (void *)addr;
+ tlb->max = PAGE_SIZE / ...Got a fancy ia64 compiler now, which informed me I fouled up, fixed up
patch below.
Will update the git tree shortly.
---
Subject: ia64: Preemptible mmu_gather
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed Jun 02 13:26:02 CEST 2010
Fix up the ia64 mmu_gather code to conform to the new API.
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
arch/ia64/include/asm/tlb.h | 39 ++++++++++++++++++++++++---------------
1 file changed, 24 insertions(+), 15 deletions(-)
Index: linux-2.6/arch/ia64/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/ia64/include/asm/tlb.h
+++ linux-2.6/arch/ia64/include/asm/tlb.h
@@ -47,21 +47,21 @@
#include <asm/machvec.h>
#ifdef CONFIG_SMP
-# define FREE_PTE_NR 2048
# define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)
#else
-# define FREE_PTE_NR 0
# define tlb_fast_mode(tlb) (1)
#endif
struct mmu_gather {
struct mm_struct *mm;
unsigned int nr; /* == ~0U => fast mode */
+ unsigned int max;
unsigned char fullmm; /* non-zero means full mm flush */
unsigned char need_flush; /* really unmapped some PTEs? */
unsigned long start_addr;
unsigned long end_addr;
- struct page *pages[FREE_PTE_NR];
+ struct page **pages;
+ struct page *local[8];
};
struct ia64_tr_entry {
@@ -90,9 +90,6 @@ extern struct ia64_tr_entry *ia64_idtrs[
#define RR_RID_MASK 0x00000000ffffff00L
#define RR_TO_RID(val) ((val >> 8) & 0xffffff)
-/* Users of the generic TLB shootdown code must declare this storage space. */
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
-
/*
* Flush the TLB for address range START to END and, if not in fast mode, release the
* freed pages that where gathered up to this point.
@@ -147,15 +144,23 @@ ia64_tlb_flush_mmu (struct mmu_gather *t
}
}
-/*
- * Return a pointer to an initialized struct mmu_gather.
- */
-static inline ...Optimize the page_lock_anon_vma() fast path to be one LOCKed op,
instead of two.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 61 insertions(+), 4 deletions(-)
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -353,20 +353,69 @@ out:
return anon_vma;
}
+/*
+ * Similar to page_get_anon_vma() except it locks the anon_vma.
+ *
+ * Its a little more complex as it tries to keep the fast path to a single
+ * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
+ * reference like with page_get_anon_vma() and then block on the mutex.
+ */
struct anon_vma *page_lock_anon_vma(struct page *page)
{
- struct anon_vma *anon_vma = page_get_anon_vma(page);
+ struct anon_vma *anon_vma = NULL;
+ unsigned long anon_mapping;
+
+ rcu_read_lock();
+ anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ goto out;
+ if (!page_mapped(page))
+ goto out;
+
+ anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ if (mutex_trylock(&anon_vma->root->lock)) {
+ /*
+ * If we observe a !0 refcount, then holding the lock ensures
+ * the anon_vma will not go away, see __put_anon_vma().
+ */
+ if (!atomic_read(&anon_vma->refcount)) {
+ anon_vma_unlock(anon_vma);
+ anon_vma = NULL;
+ }
+ goto out;
+ }
+
+ /* trylock failed, we got to sleep */
+ if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ anon_vma = NULL;
+ goto out;
+ }
+
+ /* we pinned the anon_vma, its safe to sleep */
+ rcu_read_unlock();
+ anon_vma_lock(anon_vma);
+
+ if (atomic_dec_and_test(&anon_vma->refcount)) {
+ /*
+ * Oops, we held the last refcount, release the lock
+ * and bail -- can't simply use put_anon_vma() because
+ * we'll deadlock on the anon_vma_lock() ...PowerPC relies on IRQ-disable to guard against RCU quiecent states,
use the appropriate RCU call version.
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/powerpc/mm/pgtable.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: linux-2.6/arch/powerpc/mm/pgtable.c
===================================================================
--- linux-2.6.orig/arch/powerpc/mm/pgtable.c
+++ linux-2.6/arch/powerpc/mm/pgtable.c
@@ -92,7 +92,7 @@ static void pte_free_rcu_callback(struct
static void pte_free_submit(struct pte_freelist_batch *batch)
{
- call_rcu(&batch->rcu, pte_free_rcu_callback);
+ call_rcu_sched(&batch->rcu, pte_free_rcu_callback);
}
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
--
So as per discussions with Paul Mck, that should be correct, thus --
Fix up the um mmu_gather code to conform to the new API.
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/um/include/asm/tlb.h | 16 ++--------------
1 file changed, 2 insertions(+), 14 deletions(-)
Index: linux-2.6/arch/um/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/um/include/asm/tlb.h
+++ linux-2.6/arch/um/include/asm/tlb.h
@@ -22,9 +22,6 @@ struct mmu_gather {
unsigned int fullmm; /* non-zero means full mm flush */
};
-/* Users of the generic TLB shootdown code must declare this storage space. */
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
-
static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
unsigned long address)
{
@@ -47,20 +44,13 @@ static inline void init_tlb_gather(struc
}
}
-/* tlb_gather_mmu
- * Return a pointer to an initialized struct mmu_gather.
- */
-static inline struct mmu_gather *
-tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
+static inline void
+tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush)
{
- struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
-
tlb->mm = mm;
tlb->fullmm = full_mm_flush;
init_tlb_gather(tlb);
-
- return tlb;
}
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
@@ -87,8 +77,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
/* keep the page table cache within bounds */
check_pgt_cache();
-
- put_cpu_var(mmu_gathers);
}
/* tlb_remove_page
--
Sorry folks, got Nick's old email in there (changed for this reply). --
I get 'as' segfaults during a "make -j128" of the kernel on sparc64 with these changes. I'll try to track it down. --
Hrm, I let it run: while :; do make O=foo/ clean; make O=foo/ randconfig; make O=foo/ -j48; done on my 24 cpu machine with this for like an hour, I'll let it run more tomorrow to see if I can reproduce this. Thanks for looking at this. --
How come? I tried this: aapi205:/usr/src/linus/tmp# git pull Already up-to-date. ... and don't see an update. Maybe I don't do something right. If there is newer code I should receive it, am I correct? I use this link: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git -- Janusz Korwin-Mikke: "Generalnie rzecz biorąc, ja jestem w jednej dziedzinie za równouprawnieniem, a mianowicie: żeby w małżeństwie było 50% mężczyzn i 50% kobiet." NP: Peter Green Splinter Group - Turn Your Love Away NB: 2.6.36-rc2-next-20100827 --
That's Linus' tree, the patches aren't in there quite yet, try: git remote add mmu_preempt git://git.kernel.org/pub/scm/linux/kernel/git/peterz/linux-2.6-mmu_preempt.git git remote update git checkout mmu_preempt/mmu_preempt or git checkout -b mmu_preempt origin/master git pull git://git.kernel.org/pub/scm/linux/kernel/git/peterz/linux-2.6-mmu_preempt.git mmu_preempt --
Which is better? Thanks, I'll try. Regards, Piotr Hosowicz -- Wzór Einsteina-Pitagorasa: E = mc2 = m(a2 + b2) NP: Peter Green Splinter Group - Big Change Is Gonna Come NB: 2.6.36-rc2-next-20100827 --
Fully depends on how you manage your git tree, the first will keep giving you the same tree while the second will try and merge my changes into whatever you got from Linus' tree, which keeps moving fwd and might thus start failing (at which point I'll have to either rebase or backmerge Linus' changes). --
I'll risk the second way. I almost do not manage the tree, I just copy my local copy to tmp dir, then git pull from Linus, if there are changes I build the kernel. Regards, Piotr Hosowicz -- Z cyklu "Uroki demokracji", czyli pytania i odpowiedzi w teledurniejach: - W którym kraju znajduje się Mount Everest? - Hm, to nie Szkocja, prawda? NP: Peter Green Splinter Group - I Can't Help Myself NB: 2.6.36-rc2-next-20100827 --
I chose the second path, kernel built OK, but then I had to build NVidia module and I started Xorg. I logged in and then the screen froze for a while, then Kde background went black and I had to hard reset the PC, bezcause couldn't switch to VT. I attach my Xorg log. -- - E.T. phone home. - powiedział E.T. - Dzwoni do centrali po wskazówki - domyślił się Stirlitz. NP: Patrick O'Hearn - Chance NB: 2.6.36-rc2-git5
OK. Regards, Piotr Hosowicz -- "Pirotechnicy zdetonowali na warszawskim Okęciu podejrzane ładunki, znalezione przy porzuconym motocyklu. Wszystko wskazuje na to, że był to fałszywy alarm". Z polskiego na nasze: "rozp* komuś motor". NP: Patrick O'Hearn - Night Becomes Her NB: 2.6.36-rc2-git5 --
Now I had done it the second way you described and everything seems to be working ok. aapi205:~# uname -r 2.6.36-rc3-20100831-1507-pz+ If there will be any problems I'll write here. Regards, Piotr Hosowicz -- Z cyklu "Uroki demokracji", czyli pytania i odpowiedzi w teledurniejach: - Który aktor amerykański jest mężem Nicole Kidman? - Forrest Gump. NP: Peter Green Splinter Group - You'll Be Sorry Someday NB: 2.6.36-rc3-20100831-1507-pz+ --
Sorry, I made a mistake - I had built it the first way. Regards and sorry for the noise, Piotr Hosowicz -- Janusz Korwin-Mikke: "Idiota z dyplomem to taki sam idiota, jak przedtem - tylko z pretensjami." NP: Peter Green Splinter Group - Tribal Dance NB: 2.6.36-rc3-20100831-1507-pz+ --
Still works OK and I perceive that it is better, Peter. Do you know when Linus will be back from Brazil? Regards, Piotr Hosowicz -- Z cyklu "Uroki demokracji", czyli pytania i odpowiedzi w teledurniejach: - Ile kół ma trójkołowiec? - Dwa. NP: Peter Green Splinter Group - Indians NB: 2.6.36-rc3-20100831-1507-pz+ --
Hi Peter,
Is there data somewhere that shows where this helps and how much?
Pekka
--
Yanmin didn't publish any data, but the main point of the series is to not take hundreds of nested spinlocks. Not regressing is a fine state. In theory the preemptible mmu could end up doing less TLB invalidates for large unmaps and thus gain some performance there. --
Peter: We tested your tree base on 2.6.36-rc3 kernel. The testing covered on Core2 2P, NHM-EP/WSM-EP machines, no clear performance regression found on your patch compare to 36-rc3, and also no clear improvements on our benchmarks. The benchmarks are listed in the following website, and plus some of FFSB/FIO scenarios. http://kernel-perf.sourceforge.net/ Regards! --
Thanks, much appreciated!! Hopefully we'll get sparc64 fixed up soon and can talk about merging this. --
