Hi, This (still incomplete) patch-set makes part of the mm a lot more preemptible. It converts i_mmap_lock and anon_vma->lock to mutexes. On the way there it also makes mmu_gather preemptible. The main motivation was making mm_take_all_locks() preemptible, since it appears people are nesting hundreds of spinlocks there. The side-effects are that we can finally make mmu_gather preemptible, something which lots of people have wanted to do for a long time. It also gets us anon_vma refcounting which seems to be wanted by KSM as well as Mel's compaction work. This patch-set seems to build and boot on my x86_64 machines and even builds a kernel. I've also attempted powerpc and sparc, which I've compile tested with their respective defconfigs, remaining are (afaikt the rest uses the generic tlb bits): - s390 - ia64 - arm - superh - um From those, s390 and ia64 look 'interesting', arm and superh seem very similar and should be relatively easy (-rt has a patchlet for arm iirc). What kind of performance tests would people have me run on this to satisfy their need for numbers? I've done a kernel build on x86_64 and if anything that was slightly faster with these patches, but it was well within the noise levels so it might be heat noise I'm looking at ;-) --
Straight fwd conversion of i_mmap_lock and anon_vma->lock to mutexes.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/x86/mm/hugetlbpage.c | 4 ++--
fs/hugetlbfs/inode.c | 4 ++--
fs/inode.c | 2 +-
include/linux/fs.h | 2 +-
include/linux/mm.h | 2 +-
include/linux/rmap.h | 10 +++++-----
kernel/fork.c | 4 ++--
mm/filemap_xip.c | 4 ++--
mm/fremap.c | 4 ++--
mm/hugetlb.c | 12 ++++++------
mm/ksm.c | 16 ++++++++--------
mm/memory-failure.c | 4 ++--
mm/memory.c | 14 +++++++-------
mm/mmap.c | 20 ++++++++++----------
mm/mremap.c | 4 ++--
mm/rmap.c | 42 +++++++++++++++++++++---------------------
16 files changed, 74 insertions(+), 74 deletions(-)
Index: linux-2.6/arch/x86/mm/hugetlbpage.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/hugetlbpage.c
+++ linux-2.6/arch/x86/mm/hugetlbpage.c
@@ -73,7 +73,7 @@ static void huge_pmd_share(struct mm_str
if (!vma_shareable(vma, addr))
return;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -98,7 +98,7 @@ static void huge_pmd_share(struct mm_str
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
out:
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_lock);
}
/*
Index: linux-2.6/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.orig/fs/hugetlbfs/inode.c
+++ linux-2.6/fs/hugetlbfs/inode.c
@@ -429,10 +429,10 @@ static int hugetlb_vmtruncate(struct ino
pgoff = offset >> PAGE_SHIFT;
i_size_write(inode, offset);
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_lock);
if ...Fix up powerpc to the new mmu_gather stuffs.
PPC has an extra batching queue to RCU free the actual pagetable
allocations, use the ARCH extentions for that for now.
For the ppc64_tlb_batch, which tracks the vaddrs to unhash from the
hardware hash-table, keep using per-cpu arrays but flush on context
switch and use a TIF bit to track the laxy_mmu state.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
arch/powerpc/include/asm/pgalloc.h | 4 +--
arch/powerpc/include/asm/thread_info.h | 2 +
arch/powerpc/include/asm/tlb.h | 10 +++++++++
arch/powerpc/include/asm/tlbflush.h | 16 ++++++++++-----
arch/powerpc/kernel/process.c | 18 +++++++++++++++++
arch/powerpc/mm/pgtable.c | 34 +++++++++++++++++++++++----------
arch/powerpc/mm/tlb_hash32.c | 2 -
arch/powerpc/mm/tlb_hash64.c | 12 ++++++-----
arch/powerpc/mm/tlb_nohash.c | 2 -
9 files changed, 76 insertions(+), 24 deletions(-)
Index: linux-2.6/arch/powerpc/include/asm/tlb.h
===================================================================
--- linux-2.6.orig/arch/powerpc/include/asm/tlb.h
+++ linux-2.6/arch/powerpc/include/asm/tlb.h
@@ -28,6 +28,16 @@
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
+#define HAVE_ARCH_MMU_GATHER 1
+
+struct pte_freelist_batch;
+
+struct arch_mmu_gather {
+ struct pte_freelist_batch *batch;
+};
+
+#define ARCH_MMU_GATHER_INIT (struct arch_mmu_gather){ .batch = NULL, }
+
extern void tlb_flush(struct mmu_gather *tlb);
/* Get the generic bits... */
Index: linux-2.6/arch/powerpc/include/asm/tlbflush.h
===================================================================
--- linux-2.6.orig/arch/powerpc/include/asm/tlbflush.h
+++ linux-2.6/arch/powerpc/include/asm/tlbflush.h
@@ -108,18 +108,24 @@ extern void hpte_need_flush(struct mm_st
static inline void ...Hm. Pity powerpc can't just use tlb flush gathering for this batching, (which is what it was designed for). Then it could avoid these tricks. What's preventing this? Adding a tlb gather for COW case in copy_page_range? --
I'm not quite sure what about that, didn't fully investigate it, just
wanted to get something working for now.
Of of the things is that both power and sparc need more than the struct
page we normally gather.
I did think of making the mmu_gather have something like
struct mmu_page {
struct page *page;
#ifdef HAVE_ARCH_TLB_VADDR
unsigned long vaddr;
#endif
};
struct mmu_gather {
...
unsigned int nr;
struct mmu_page *pages;
};
and doing that vaddr collection right along with it in the same batch.
I think that that would work, Ben, Dave?
--
No it's not your problem, but just perhaps a good add-on to your Well you could also have a per-arch struct for this, which they can --
Ah, right you are.
Maybe something like:
#ifndef tlb_fill_page
struct mmu_page {
struct page *page;
};
#define tlb_fill_page(tlb, pte, addr) do { } while (0)
#endif
Which can be used from the tlb_remove_tlb_entry() implementation to
gather both the pte and addr. The only complication seems to be that
tlb_remove_tlb_entry() and tlb_remove_page() aren't always matched.
Also, it looks like both power and sparc drive the pte/vaddr flush from
the pte_update/set_pte_at like functions. Which means its not synced to
the mmu_gather at all.
So I'm not at all sure its feasible to link these, but I'll keep it in
mind.
--
Well, ours aren't struct pages. IE. There's fundamentally 3 things that we are trying to batch here :-) 1- The original mmu_gather: batching the freeing of the actual user pages, so that the TLB flush can be delayed/gathered, plus there might be some micro-improvement in passing the page list to the allocator for freeing all at omce. This is thus purely a batch of struct pages. 2- The batching of the TLB flushes (or hash invalidates in the ppc case) proper, which needs the addition of the vaddr for things like sparc and powerpc since we don't just invalidate the whole bloody thing unlike x86 :-) On powerpc, we actually need more, we need the actual PTE content since it also contains tracking information relative to where things have been put in the hash table. 3- The batching of the freeing of the page table structure, which we want to delay more than batch, ie, the goal here is to delay that freeing using RCU until everybody has stopped walking them. This does rely on RCU grace period being "interrupt safe", ie, there's no rcu_read_lock() in the low level TLB or hash miss code, but that code runs with interrupts off. Now, 2. has a problem I described earlier, which is that we must not have the possibility of introducing a duplicate in the hash table, thus it must not be possible to put a new PTE in until the previous one has been flushed or bad things would happen. This is why powerpc doesn't use the mmu_gather the way it was originally intended to do both 1. and 2. but really only for 1., while for 2. we use a small batch that only exist between lazy_mmu_enter/exit, since those are always fully enclosed by a pte lock section. 3. As you have noticed, relies on the irq stuff. Plus there seem to be a dubious optimization here with mm_users. Might be worth sorting that out. However, it's a very different goal than 1. and 2. in the sense that batching proper is a minor issue, what we want is synchronization with walkers, and that batching is a way to lower the cost ...
We must flush before the pte_lock is released. If not, we end up with this funny situation: - PTE is read-only, hash contains a translation for it - PTE gets cleared & added to the batch, hash not flushed yet - PTE lock released, maybe even VMA fully removed - Other CPU takes a write fault, puts in a new PTE - Hash ends up with duplicates of the vaddr -> arch violation Now we could get out of that one, I suppose, if we had some kind of way to force flush any batch pertaining to a given mm before a new valid PTE can be written, but that doesn't sound such a trivial thing to do. Any better idea ? Cheers, Ben. --
On Thu, 2010-04-08 at 21:17 +0200, Peter Zijlstra wrote: Can index be > 0 if active == 0 ? I though not, which means you don't need to add a test on active, do you ? I'm also pondering whether we should just stick something in the task/thread struct and avoid that whole per-cpu manipulation including the stuff in process.c in fact. Use ti->local_flags so you can do it non-atomically, which is a lot iirc, we are synchronizing with CPUs walking page tables in their hash or TLB miss code, which is lockless. The mm_users test is a -little- bit dubious indeed. It may have to be mm_users < 2 && mm == current->active_mm, ie, we know for sure nobody else is currently walking those page tables ... Tho even than is fishy nowadays. We -can- walk page tables on behave of another process. In fact, we do it in the Cell SPU code for faulting page table entries as a result of SPEs taking faults for example. So I'm starting to suspect that this mm_users optimisation is bogus. We -do- want to optimize out the case where there is no user left Yeah well ... I'm not that familiar with RCU anymore. Back when I wrote that, iirc, I would more or less be safe against a CPU that doesn't schedule, but things may have well changed. We are trying to be safe against another CPU walking page tables in the asm lockless hash miss or TLB miss code. Note that sparc64 has a similar issue. This is highly optimized asm code that -cannot- call into things When this was written, generic batch only dealt with the actual pages, not the page tables, and as you may have noticed, our page tables on powerpc aren't struct page*, they come from dedicated slab caches and are of variable sizes. Cheers, --
True I guess, but like this we avoid a write, doesn't really matter I Can do, I can add batching similar to the generic code to the Heh, that seems like a real good way to waste a lot of memory fast ;-) --
Right, so Paul has been working hard to remove certain implementation artifact from RCU, such as the preempt-disable == rcu_read_lock thing. Now, even Preemptible RCU has IRQ-disabled == rcu_read_lock, simply because the RCU grace period state machine is driven from an interrupt. But there is no such requirement on RCU at all, so in the interest of removing assumptions and code validating we're trying to remove such things. --
Can't you fix that by having the SPE code take a reference on these mm_structs they're playing with? Poking at one without a ref seems fishy anyway. --
From: Peter Zijlstra <a.p.zijlstra@chello.nl> Did a build and test boot of this on my 128-cpu Niagara2 box, seems to work basically fine. --
From: David Miller <davem@davemloft.net> Here comes a set of 4 patches which build on top of your work by: 1) Killing quicklists on sparc64 2) Using the generic RCU page table liberation code on sparc64 3) Implement pte_special() et al. on sparc64 4) Implement get_user_pages_fast() on sparc64 Please add them to your patch set. If you change the RCU generic code enabler CPP define to be controlled via Kconfig (as we discussed on IRC) it should be easy to propagate that change into patch #2 here. Thanks! --
What's the straight-line performance impact of all this? And how about concurrency, I wonder. mutexes of course are double the atomics, and you've added a refcount which is two more again for those paths using it. Page faults are very important. We unfortunately have some databases doing a significant amount of mmap/munmap activity too. I'd like to see microbenchmark numbers for each of those (both anon and file backed for page faults). kbuild does quite a few pages faults, that would be an easy thing to Is it because you're reducing the number of TLB flushes, or what (kbuild isn't multi threaded so on x86 TLB flushes should be really fast anyway). --
You think this would affect the mmap/munmap times in any significant OK, I'll dig out that fault test used in the whole mmap_sem/rwsem thread I'll try and get some perf stat runs to get some insight into this. But the numbers were: time make O=defconfig -j48 bzImage (5x, cache hot) without: avg: 39.2018s +- 0.3407 with: avg: 38.9886s +- 0.1814 --
They're actually not _too_ heavy because they just setup and tear down vmas. No flushing or faulting required (well, in order to make any _use_ of them you need flushing and faulting of course). I have some microbenchmarks like this and the page fault test I Well that's interesting. Nice if it is an improvement not just some anomoly. I'd start by looking at TLB flushes maybe? For testing, it would be nice to make the flush sizes equal so you get more of a comparison of the straight line code. Other than this, I don't have a good suggestion of what to test. I mean, how far can you go? :) Some threaded workloads would probably be a good idea, though. Java? --
If you don't mind, please. If you send them my way I'll run them on my machines too. --
Hi Peter,
On Thu, 08 Apr 2010 21:17:37 +0200
Yes, that is a good thing. With the preemptible mmu_gather s390 will
use less IDTEs (that is the instruction that flushes all TLBs for a
To get the 'interesting' TLB flushing on s390 working again you need
this patch:
--
[PATCH] s390: preemptible mmu_gather
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Adapt the stand-alone s390 mmu_gather implementation to the new
preemptible mmu_gather interface.
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
arch/s390/include/asm/tlb.h | 43 +++++++++++++++++++++++++------------------
1 file changed, 25 insertions(+), 18 deletions(-)
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -28,45 +28,51 @@
#include <asm/smp.h>
#include <asm/tlbflush.h>
-#ifndef CONFIG_SMP
-#define TLB_NR_PTRS 1
-#else
-#define TLB_NR_PTRS 508
-#endif
-
struct mmu_gather {
struct mm_struct *mm;
unsigned int fullmm;
unsigned int nr_ptes;
unsigned int nr_pxds;
- void *array[TLB_NR_PTRS];
+ unsigned int max;
+ void **array;
+ void *local[8];
};
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
-
-static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm,
- unsigned int full_mm_flush)
+static inline void __tlb_alloc_pages(struct mmu_gather *tlb)
{
- struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
+ unsigned long addr = __get_free_pages(GFP_ATOMIC, 0);
+
+ if (addr) {
+ tlb->array = (void *) addr;
+ tlb->max = PAGE_SIZE / sizeof(void *);
+ }
+}
+static inline void tlb_gather_mmu(struct mmu_gather *tlb,
+ struct mm_struct *mm,
+ unsigned int full_mm_flush)
+{
tlb->mm = mm;
+ tlb->max = ARRAY_SIZE(tlb->local);
+ tlb->array = tlb->local;
tlb->fullmm = full_mm_flush || (num_online_cpus() == 1) ||
(atomic_read(&mm->mm_users) <= 1 && mm == current->active_mm);
- tlb->nr_ptes = 0;
- tlb->nr_pxds = TLB_NR_PTRS;
if (tlb->fullmm)
__tlb_flush_mm(mm);
- return ...Have you tried compiling this for NOMMU? David --
No, I will eventually, I'm sure there's something to fix. --
