Hi Ingo, This is a great big pile of x86 unification and Xen bugfix patches. They build and boot for me on 64-bit and 32-bit (PAE and non-PAE). Patches are based on x86.git#testing as of this morning. The overview: - a couple of Xen bugfixes, which are 2.6.24 and 2.6.25 material - a bunch of x86 cleanups and unifications, mostly around pgalloc - some Xen fixes and improvements: - unify PAE/non-PAE pagetable handling - implement sysenter where applicable Thanks, J --
Common definitions for 3-level pagetable functions.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/init_64.c | 5 -----
arch/x86/mm/pgtable.c | 8 ++++++++
arch/x86/mm/pgtable_32.c | 10 ----------
include/asm-x86/pgalloc.h | 33 +++++++++++++++++++++++++++++++++
include/asm-x86/pgalloc_32.h | 32 --------------------------------
include/asm-x86/pgalloc_64.h | 24 ------------------------
6 files changed, 41 insertions(+), 71 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -848,11 +848,6 @@
}
#endif
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-{
- tlb_remove_page(tlb, virt_to_page(pmd));
-}
-
void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
tlb_remove_page(tlb, virt_to_page(pud));
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -27,6 +27,14 @@
paravirt_release_pt(page_to_pfn(pte));
tlb_remove_page(tlb, pte);
}
+
+#if PAGETABLE_LEVELS > 2
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pmd));
+}
+#endif /* PAGETABLE_LEVELS > 2 */
#ifdef CONFIG_X86_64
static inline void pgd_list_add(pgd_t *pgd)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -173,16 +173,6 @@
__VMALLOC_RESERVE += reserve;
}
-#ifdef CONFIG_X86_PAE
-
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-{
- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pmd));
-}
-
-#endif
-
int pmd_bad(pmd_t pmd)
{
WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -40,6 +40,39 ...Convert Xen pagetable handling to use appropriate *val_t types.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/xen/mmu.c | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -214,35 +214,35 @@
xen_set_pmd(pmdp, __pmd(0));
}
-unsigned long long xen_pte_val(pte_t pte)
+pteval_t xen_pte_val(pte_t pte)
{
- unsigned long long ret = 0;
+ pteval_t ret = 0;
if (pte.pte_low) {
- ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+ ret = ((pteval_t)pte.pte_high << 32) | pte.pte_low;
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
}
return ret;
}
-unsigned long long xen_pmd_val(pmd_t pmd)
+pmdval_t xen_pmd_val(pmd_t pmd)
{
- unsigned long long ret = pmd.pmd;
+ pmdval_t ret = pmd.pmd;
if (ret)
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
return ret;
}
-unsigned long long xen_pgd_val(pgd_t pgd)
+pgdval_t xen_pgd_val(pgd_t pgd)
{
- unsigned long long ret = pgd.pgd;
+ pgdval_t ret = pgd.pgd;
if (ret)
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
return ret;
}
-pte_t xen_make_pte(unsigned long long pte)
+pte_t xen_make_pte(pteval_t pte)
{
if (pte & _PAGE_PRESENT) {
pte = phys_to_machine(XPADDR(pte)).maddr;
@@ -252,7 +252,7 @@
return (pte_t){ .pte = pte };
}
-pmd_t xen_make_pmd(unsigned long long pmd)
+pmd_t xen_make_pmd(pmdval_t pmd)
{
if (pmd & 1)
pmd = phys_to_machine(XPADDR(pmd)).maddr;
@@ -260,7 +260,7 @@
return (pmd_t){ pmd };
}
-pgd_t xen_make_pgd(unsigned long long pgd)
+pgd_t xen_make_pgd(pgdval_t pgd)
{
if (pgd & _PAGE_PRESENT)
pgd = phys_to_machine(XPADDR(pgd)).maddr;
@@ -273,9 +273,9 @@
*ptep = pte;
}
-unsigned long xen_pte_val(pte_t pte)
+pteval_t xen_pte_val(pte_t pte)
{
- unsigned long ret = pte.pte_low;
+ pteval_t ret = pte.pte_low;
if (ret & _PAGE_PRESENT)
...Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/kernel/paravirt.c | 2 ++
arch/x86/mm/pgtable.c | 1 +
include/asm-x86/paravirt.h | 11 +++++++++++
include/asm-x86/pgalloc.h | 3 +++
4 files changed, 17 insertions(+)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -385,8 +385,10 @@
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
.alloc_pmd_clone = paravirt_nop,
+ .alloc_pud = paravirt_nop,
.release_pte = paravirt_nop,
.release_pmd = paravirt_nop,
+ .release_pud = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -38,6 +38,7 @@
#if PAGETABLE_LEVELS > 3
void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
#endif /* PAGETABLE_LEVELS > 3 */
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -223,8 +223,10 @@
void (*alloc_pte)(struct mm_struct *mm, u32 pfn);
void (*alloc_pmd)(struct mm_struct *mm, u32 pfn);
void (*alloc_pmd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
+ void (*alloc_pud)(struct mm_struct *mm, u32 pfn);
void (*release_pte)(u32 pfn);
void (*release_pmd)(u32 pfn);
+ void (*release_pud)(u32 pfn);
/* Pagetable manipulation functions */
void (*set_pte)(pte_t *ptep, pte_t pteval);
@@ -918,6 +920,15 @@
PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
+}
+static inline void paravirt_release_pud(unsigned pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
+}
+
#ifdef CONFIG_HIGHPTE
static inline void ...64-bit Xen supports sysenter for 32-bit guests, so support its use. (sysenter is faster than int $0x80 in 32-on-64.) sysexit is still not supported, so we fake it up using iret. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> --- arch/x86/kernel/entry_32.S | 18 +++++++++++++- arch/x86/xen/enlighten.c | 3 -- arch/x86/xen/setup.c | 21 ++++++++++++++++ arch/x86/xen/smp.c | 1 arch/x86/xen/xen-asm.S | 56 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 3 ++ 6 files changed, 99 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1024,6 +1024,13 @@ ENDPROC(kernel_thread_helper) #ifdef CONFIG_XEN +/* Xen doesn't set %esp to be precisely what the normal sysenter + entrypoint expects, so fix it up before using the normal path. */ +ENTRY(xen_sysenter_target) + RING0_INT_FRAME + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + ENTRY(xen_hypervisor_callback) CFI_STARTPROC pushl $0 @@ -1043,8 +1050,17 @@ jae 1f call xen_iret_crit_fixup + jmp 2f -1: mov %esp, %eax +1: cmpl $xen_sysexit_start_crit,%eax + jb 2f + cmpl $xen_sysexit_end_crit,%eax + jae 2f + + jmp xen_sysexit_crit_fixup + +ENTRY(xen_do_upcall) +2: mov %esp, %eax call xen_evtchn_do_upcall jmp ret_from_intr CFI_ENDPROC diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -155,7 +155,6 @@ if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ - (1 << X86_FEATURE_SEP) | /* disable SEP */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" @@ -981,7 +980,7 @@ .read_pmc = native_read_pmc, .iret = xen_iret, - .irq_enable_syscall_ret = ...
Mask MCE/MCA out of cpu caps. Its harmless to leave them there, but it does prevent the kernel from starting an unnecessary thread. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> --- arch/x86/xen/enlighten.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -155,6 +155,8 @@ if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ + (1 << X86_FEATURE_MCE) | /* disable MCE */ + (1 << X86_FEATURE_MCA) | /* disable MCA */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" --
I looked over these and everything looks fine to me. I didn't test them though, since I'm not working on a git tree right now, but be sure to do a whole slew of compile testing - those demacroizations can trip up in random places hidden under config options like HIGHPTE. Zach Acked-by: Zachary Amsden <zach@vmware.com> --
I've been doing regular randconfig builds to shake out those kinds of
things, though I haven't put Ingo's infrastructure in place to actually
try to generate bootable kernels from randconfigs.
J
--
FWIW I tested in a PAE Xen guest on 32 and 64 bit h/v and it was fine including running ddcprobe which was my most recent issue. I can't see why this would fix it though. I thought for a second when you said it booted on 64 bit you meant 64 bit Xen guest, do you know how that stuff is going? -- Ian Campbell Evil isn't all bad. --
I'm actually having problem booting 32-on-64 hvm guests. Specifically
the Fedora installer crashes fairly early on:
Freeing unused kernel memory: 280k freed
Write protecting the kernel read-only data: 844k
input: ImExPS/2 Generic Explorer Mouse as /class/input/input2
[9;0][8]
Greetings.
anaconda installer init version 11.3.0.50 starting
mounting /proc filesystem... done
creating /dev filesystem... done
mounting /dev/pts (unix98 pty) filesystem... done
mounting /sys filesystem... done
anaconda installer init version 11.3.0.50 using a serial console
trying to remount root filesystem read write... done
mounting /tmp as ramfs... done
running install...
running /sbin/loader
loader received SIGSEGV! Backtrace:
[0x804a030]
[0x110420]
[0x816a7c8]
[0x81697a5]
[0x805b626]
[0x805b72b]
[0x805c153]
[0x804af81]
[0x8175004]
[0x8048151]
install exited abnormally [1/1]
sending termination signals...done
sending kill signals...done
disabling swap...
unmounting filesystems...
/proc done
/dev/pts done
/sys done
/tmp/ramfs done
you may safely reboot your system
Last I heard, they got something booting, but it wasn't very clean. I
need to resync with Eduardo. A lot of these changes were intended to
make that effort easier, but it would be nice to confirm ;) Doesn't
look like http://git.et.redhat.com/?p=xen-pvops-64.git;a=summary has
changed much lately.
J
--
-- Ian Campbell Intel CPUs are not defective, they just act that way. -- Henry Spencer --
It's a first for me. I just wanted a test-env for these x86 trees.
J
--
is this a bug that has leaked into current upstream -git as well, or is it an x86.git/testing issue? Ingo --
I'm assuming its a Xen bug actually. It happens with the distro kernel
on the installers as well as x86.git. Googling around shows similar
symptoms on laptops with problematic BIOSes, so that points at a
possible problem.
J
--
thanks Jeremy, i've queued them all up and the current x86/latest branch should have them. They looked robust so far in my testing. Ingo --
found a build bug on 32-bit & PAE - pud_populate() assumed too much about types: In file included from arch/x86/mm/pgtable.c:3: include/asm/pgalloc.h: In function 'pud_populate': include/asm/pgalloc.h:93: error: dereferencing pointer to incomplete type i've uninlined it and has trickled that through your series. Ingo --
I build 32-bit PAE all the time. I guess the difference is your build is !PARAVIRT? --
yes, randconfig triggered it rather quickly. Bad config attached. (it now builds fine) Ingo
kernel crash for sale ;-)
bisected it down to:
| commit aaa7d17e66bc0990d7bee8db5d37e12ae087e206
| Author: Jeremy Fitzhardinge <jeremy@goop.org>
| Date: Mon Mar 17 16:37:16 2008 -0700
|
| x86: only enable interrupts when kernel state has been set up
the bug is that you leak irqs off state into C code... which crashes in
the block layer.
Crash log below. For now i've disabled this patch. I suspect you can
reproduce this by running a DEBUG_PAGEALLOC+PROVE_LOCKING kernel.
Ingo
------------>
[ 66.336389] PM: Adding info for No Bus:vcsa1
[ 66.344927] device: 'vcs1': device_unregister
[ 66.348200] PM: Removing info for No Bus:vcs1
[ 66.353463] device: 'vcs1': device_create_release
[ 66.356305] device: 'vcsa1': device_unregister
[ 66.360244] PM: Removing info for No Bus:vcsa1
[ 66.365345] device: 'vcsa1': device_create_release
[ 66.392837] BUG: sleeping function called from invalid context at include/linux/pagemap.h:169
[ 66.396173] in_atomic():0, irqs_disabled():1
[ 66.396173] 1 lock held by init/1772:
[ 66.396173] #0: (&mm->mmap_sem){....}, at: [<c01166f3>] do_page_fault+0x93/0x7c0
[ 66.396173] Pid: 1772, comm: init Not tainted 2.6.25-rc6-x86-latest.git #11
[ 66.396173] [<c011f692>] __might_sleep+0xc2/0xe0
[ 66.396173] [<c0162c26>] __do_fault+0x3f6/0x490
[ 66.396173] [<c01641d1>] handle_mm_fault+0x151/0x570
[ 66.396173] [<c013b046>] ? down_read_trylock+0x56/0x60
[ 66.396173] [<c0116751>] do_page_fault+0xf1/0x7c0
[ 66.396173] [<c0116660>] ? do_page_fault+0x0/0x7c0
[ 66.396173] [<c082fd42>] ? error_code+0x6a/0x70
[ 66.396173] [<c030f772>] ? __put_user_4+0x12/0x18
[ 66.396173] [<c0116660>] ? do_page_fault+0x0/0x7c0
[ 66.396173] [<c082fd42>] error_code+0x6a/0x70
[ 66.396173] =======================
[ 66.397045] device: 'vcs1': device_add
[ 66.400497] PM: Adding info for No Bus:vcs1
[ 66.411267] device: 'vcsa1': device_add
[ 66.412445] PM: Adding info for No Bus:vcsa1
[ 66.481137] ...Hm, OK. I can't see how it can avoid enabling interrupts on that path,
but I'll look more closely. What's the .config. 32-bit, obviously.
J
--
config attached. randconfig generated, so it can have random surprises enabled :) Ingo
Thanks. What was the workload? Did it just happen, or were you doing something specific? BTW, that other pud_populate config you sent fails to link for me: LD .tmp_vmlinux1 arch/x86/kernel/built-in.o: In function `MP_processor_info': mpparse_32.c:(.cpuinit.text+0x32b2): undefined reference to `x86_cpu_to_apicid_early_ptr' mpparse_32.c:(.cpuinit.text+0x32c4): undefined reference to `x86_bios_cpu_apicid_early_ptr' mpparse_32.c:(.cpuinit.text+0x3363): undefined reference to `per_cpu__x86_cpu_to_apicid' mpparse_32.c:(.cpuinit.text+0x336e): undefined reference to `per_cpu__x86_bios_cpu_apicid' arch/x86/mach-generic/built-in.o: In function `cpu_present_to_apicid': summit.c:(.text+0xaf): undefined reference to `per_cpu__x86_bios_cpu_apicid' arch/x86/mach-generic/built-in.o: In function `cpu_present_to_apicid': bigsmp.c:(.text+0x47f): undefined reference to `per_cpu__x86_bios_cpu_apicid' arch/x86/mach-generic/built-in.o: In function `init_apic_ldr': bigsmp.c:(.text+0x6cc): undefined reference to `per_cpu__x86_bios_cpu_apicid' arch/x86/mach-generic/built-in.o: In function `cpu_present_to_apicid': es7000.c:(.text+0x774): undefined reference to `per_cpu__x86_bios_cpu_apicid' arch/x86/mach-generic/built-in.o:es7000.c:(.text+0x906): more undefined references to `per_cpu__x86_bios_cpu_apicid' follow make[2]: *** [.tmp_vmlinux1] Error 1 J --
"git-remote update" should solve that for you. Ingo --
Haven't managed to repro it yet, but DEBUG_PAGE_ALLOC turned up another
case of doing tlb flushes with preempt on:
BUG: using smp_processor_id() in preemptible [00000000] code: init/1
caller is xen_flush_tlb+0xe/0xa2
Pid: 1, comm: init Not tainted 2.6.25-rc6-x86-latest.git #281
[<c0234eed>] debug_smp_processor_id+0x99/0xb0
[<c0103ca0>] xen_flush_tlb+0xe/0xa2
[<c011d46b>] kernel_map_pages+0x104/0x125
[<c0154f45>] get_page_from_freelist+0x32f/0x433
[<c01444c4>] ? lock_acquire+0x90/0x9d
[<c01550bd>] __alloc_pages+0x5e/0x2a5
[<c015c5b3>] do_wp_page+0x2c9/0x653
[<c017953f>] ? do_lookup+0x4f/0x146
[<c0103e11>] ? xen_restore_fl+0x2e/0x52
[<c01444c4>] ? lock_acquire+0x90/0x9d
[<c015f720>] ? handle_mm_fault+0xa37/0xbb8
[<c015f7a4>] handle_mm_fault+0xabb/0xbb8
[<c0179332>] ? path_put+0x20/0x23
[<c0103e11>] ? xen_restore_fl+0x2e/0x52
[<c0103e11>] ? xen_restore_fl+0x2e/0x52
[<c045699d>] ? do_page_fault+0x460/0x952
[<c013b80d>] ? down_read_trylock+0x37/0x41
[<c0456a51>] do_page_fault+0x514/0x952
[<c0232116>] ? copy_to_user+0x2a/0x34
[<c017e3ce>] ? sys_select+0x13e/0x164
[<c045653d>] ? do_page_fault+0x0/0x952
[<c0454eaa>] error_code+0x72/0x78
This code seems a bit fast and loose anyway, since it only does a local
TLB flush, which means that other CPUs can still accessed free memory if
they still have a stale TLB for it. Having the tlb flush happen on a
random CPU doesn't really affect things much.
J
--
turns out that i have another bug which causes a screaming IRQ#0 that floods the box at 70K irqs/sec. That is what triggered that crash in your patch most likely. so either figure out the bug by review or try turning your IRQ#0 into a screaming one for debug purposes. (hint: turning irq#0's trigger mode from 'edge' to 'level' in mpparse.c or ioapic.c does wonders to that end ;-) Ingo --
Hm. I think the problem was that the patch changed the ordering so that
the %ebp fault test was happening with interrupts disabled, meaning that
any fault-in would happen with interrupts disabled.
Could you tell me if this revised patch still provokes a problem?
Thanks,
J
Subject: x86: only enable interrupts when kernel state has been set up
The sysenter path tries to enable interrupts immediately. Unfortunately
this doesn't work in a paravirt environment, because not enough kernel
state has been set up at that point (namely, pointing %fs to the kernel
percpu data segment). To fix this, defer ENABLE_INTERRUPTS until after
the kernel state has been set up.
Unfortunately this means that we're running with interrupts disabled
for a while without calling the IRQ tracing code, but that can't be
called without setting up %fs either.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/kernel/entry_32.S | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
===================================================================
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -290,10 +290,10 @@ ENTRY(ia32_sysenter_target)
movl TSS_sysenter_sp0(%esp),%esp
ENTRY(sysenter_past_esp)
/*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs and here we enable it straight after entry:
+ * Interrupts are disabled here, but we can't trace it until
+ * enough kernel state to call TRACE_IRQS_OFF can be called - but
+ * we immediately enable interrupts at that point anyway.
*/
- ENABLE_INTERRUPTS(CLBR_NONE)
pushl $(__USER_DS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ss, 0*/
@@ -314,6 +314,11 @@ ENTRY(sysenter_past_esp)
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
/*
* Load the potential sixth argument from user stack.
* Careful about security.
@@ -321,14 ...it's looking good so far - queued it up. Ingo --
it just crashed again, crashlog and config attached. This time it didnt need an irq flood. Ingo
Hm. I presume it happens immediately/consistently. Is it something
particular about this .config? Do other configs work for you?
Thanks,
J
--
it's a generic distro config this time around. It crashed on the first attempt. Ingo --
I haven't had a chance to try this on real hardware yet, but could you
confirm that booting with "nosep" avoids the problem? If not, then some
deep voodoo is going on...
Thanks,
J
--
This patch applied on top of everything else is sufficient to fix the problem for me: Subject: x86: fix build problem in pud_populate without CONFIG_PARAVIRT asm/paravirt.h ends up including linux/sched.h, which pud_populate needs for its reference to current. Specifically include linux/sched.h for it. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> --- include/asm-x86/pgalloc.h | 1 + 1 file changed, 1 insertion(+) =================================================================== --- a/include/asm-x86/pgalloc.h +++ b/include/asm-x86/pgalloc.h @@ -4,6 +4,7 @@ #include <linux/threads.h> #include <linux/mm.h> /* for struct page */ #include <linux/pagemap.h> +#include <linux/sched.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> --
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/init_64.c | 5 ----
arch/x86/mm/pgtable.c | 7 +++++
include/asm-x86/pgalloc.h | 52 +++++++++++++++++++++++++++++++++++++-----
include/asm-x86/pgalloc_32.h | 24 -------------------
include/asm-x86/pgalloc_64.h | 29 -----------------------
5 files changed, 53 insertions(+), 64 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -847,8 +847,3 @@
return 0;
}
#endif
-
-void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
-{
- tlb_remove_page(tlb, virt_to_page(pud));
-}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -34,6 +34,13 @@
paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pmd));
}
+
+#if PAGETABLE_LEVELS > 3
+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
#ifdef CONFIG_X86_64
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -71,12 +71,52 @@
}
extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+#ifdef CONFIG_X86_PAE
+static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+ paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+ /* Note: almost everything apart from _PAGE_PRESENT is
+ reserved at the pmd (PDPT) level. */
+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ */
+ if (mm == ...Add a common arch/x86/mm/pgtable.c file for common pagetable functions.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/Makefile | 2
arch/x86/mm/pgtable.c | 239 ++++++++++++++++++++++++++++++++++++++++++
arch/x86/mm/pgtable_32.c | 187 --------------------------------
include/asm-x86/pgalloc.h | 18 +++
include/asm-x86/pgalloc_32.h | 11 -
include/asm-x86/pgalloc_64.h | 67 -----------
6 files changed, 258 insertions(+), 266 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,4 +1,4 @@
-obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o
+obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o pgtable.o mmap.o
obj-$(CONFIG_X86_32) += pgtable_32.o
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,239 @@
+#include <linux/mm.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+ if (pte)
+ pgtable_page_ctor(pte);
+ return pte;
+}
+
+#ifdef CONFIG_X86_64
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+ unsigned long flags;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_add(&page->lru, &pgd_list);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+ unsigned long flags;
+
+ spin_lock_irqsave(&pgd_lock, ...Turn paravirt stubs into inline functions, so that the arguments are
still typechecked.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
include/asm-x86/pgalloc.h | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -8,13 +8,14 @@
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
-#define paravirt_alloc_pte(mm, pfn) do { } while (0)
-#define paravirt_alloc_pmd(mm, pfn) do { } while (0)
-#define paravirt_alloc_pmd_clone(pfn, clonepfn, start, count) do { } while (0)
-#define paravirt_alloc_pud(mm, pfn) do { } while (0)
-#define paravirt_release_pte(pfn) do { } while (0)
-#define paravirt_release_pmd(pfn) do { } while (0)
-#define paravirt_release_pud(pfn) do { } while (0)
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count) {}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_release_pte(unsigned long pfn) {}
+static inline void paravirt_release_pmd(unsigned long pfn) {}
+static inline void paravirt_release_pud(unsigned long pfn) {}
#endif
/*
--
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/pgtable.c | 12 ++++++++++++
include/asm-x86/pgtable.h | 10 ++--------
2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -277,3 +277,15 @@
return ret;
}
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ int young;
+
+ young = ptep_test_and_clear_young(vma, address, ptep);
+ if (young)
+ flush_tlb_page(vma, address);
+
+ return young;
+}
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -308,14 +308,8 @@
unsigned long addr, pte_t *ptep);
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(vma, address, ptep) \
-({ \
- int __young; \
- __young = ptep_test_and_clear_young((vma), (address), (ptep)); \
- if (__young) \
- flush_tlb_page(vma, address); \
- __young; \
-})
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
--
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/xen/xen-asm.S | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -411,7 +411,7 @@ irq_return: INTERRUPT_RETURN .section .fixup,"ax" -iret_exc: +ENTRY(iret_exc) pushl $0 # no error code pushl $do_iret_error jmp error_code diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -184,8 +184,12 @@ region is OK. */ je xen_hypervisor_callback - iret +1: iret xen_iret_end_crit: +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous hyper_iret: /* put this out of line since its very rarely used */ --
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/pgtable.c | 16 ++++++++++++++++
include/asm-x86/pgtable.h | 13 +++----------
2 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,5 +1,6 @@
#include <linux/mm.h>
#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
#include <asm/tlb.h>
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
@@ -246,3 +247,18 @@
free_page((unsigned long)pgd);
}
#endif
+
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+
+ if (changed && dirty) {
+ *ptep = entry;
+ pte_update_defer(vma->vm_mm, address, ptep);
+ flush_tlb_page(vma, address);
+ }
+
+ return changed;
+}
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -299,16 +299,9 @@
* bit at the same time.
*/
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
-({ \
- int __changed = !pte_same(*(ptep), entry); \
- if (__changed && dirty) { \
- *ptep = entry; \
- pte_update_defer((vma)->vm_mm, (address), (ptep)); \
- flush_tlb_page(vma, address); \
- } \
- __changed; \
-})
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
--
All pagetables need fundamentally the same setup and destruction, so
just use the same code for everything.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Andi Kleen <ak@suse.de>
---
arch/x86/mm/pgtable.c | 59 +++++++++---------------------------------
include/asm-x86/pgtable.h | 16 +++++++++++
include/asm-x86/pgtable_32.h | 15 ----------
include/asm-x86/pgtable_64.h | 2 -
4 files changed, 30 insertions(+), 62 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -59,50 +59,6 @@
list_del(&page->lru);
}
-#ifdef CONFIG_X86_64
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- unsigned boundary;
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- unsigned long flags;
- if (!pgd)
- return NULL;
- spin_lock_irqsave(&pgd_lock, flags);
- pgd_list_add(pgd);
- spin_unlock_irqrestore(&pgd_lock, flags);
- /*
- * Copy kernel pointers in from init.
- * Could keep a freelist or slab cache of those because the kernel
- * part never changes.
- */
- boundary = pgd_index(__PAGE_OFFSET);
- memset(pgd, 0, boundary * sizeof(pgd_t));
- memcpy(pgd + boundary,
- init_level4_pgt + boundary,
- (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
- return pgd;
-}
-
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- unsigned long flags;
- BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
- spin_lock_irqsave(&pgd_lock, flags);
- pgd_list_del(pgd);
- spin_unlock_irqrestore(&pgd_lock, flags);
- free_page((unsigned long)pgd);
-}
-#else
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized ...Common definitions for 2-level pagetable functions.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/init_64.c | 6 ------
arch/x86/mm/pgtable.c | 7 +++++++
arch/x86/mm/pgtable_32.c | 7 -------
include/asm-x86/pgalloc.h | 16 ++++++++++++++++
include/asm-x86/pgalloc_32.h | 18 ------------------
include/asm-x86/pgalloc_64.h | 16 ----------------
6 files changed, 23 insertions(+), 47 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -848,12 +848,6 @@
}
#endif
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-{
- pgtable_page_dtor(pte);
- tlb_remove_page(tlb, pte);
-}
-
void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
tlb_remove_page(tlb, virt_to_page(pmd));
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -19,6 +19,13 @@
if (pte)
pgtable_page_ctor(pte);
return pte;
+}
+
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ paravirt_release_pt(page_to_pfn(pte));
+ tlb_remove_page(tlb, pte);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -173,13 +173,6 @@
__VMALLOC_RESERVE += reserve;
}
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-{
- pgtable_page_dtor(pte);
- paravirt_release_pt(page_to_pfn(pte));
- tlb_remove_page(tlb, pte);
-}
-
#ifdef CONFIG_X86_PAE
void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -24,6 +24,22 @@
extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+/* Should ...pte_t always contains a "pte" field for the whole pte value, so make
use of it.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/xen/mmu.c | 26 ++++++++++++--------------
1 file changed, 12 insertions(+), 14 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -216,12 +216,10 @@
pteval_t xen_pte_val(pte_t pte)
{
- pteval_t ret = 0;
+ pteval_t ret = pte.pte;
- if (pte.pte_low) {
- ret = ((pteval_t)pte.pte_high << 32) | pte.pte_low;
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- }
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
}
@@ -229,16 +227,16 @@
pmdval_t xen_pmd_val(pmd_t pmd)
{
pmdval_t ret = pmd.pmd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
}
pgdval_t xen_pgd_val(pgd_t pgd)
{
pgdval_t ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
}
@@ -254,7 +252,7 @@
pmd_t xen_make_pmd(pmdval_t pmd)
{
- if (pmd & 1)
+ if (pmd & _PAGE_PRESENT)
pmd = phys_to_machine(XPADDR(pmd)).maddr;
return (pmd_t){ pmd };
@@ -275,7 +273,7 @@
pteval_t xen_pte_val(pte_t pte)
{
- pteval_t ret = pte.pte_low;
+ pteval_t ret = pte.pte;
if (ret & _PAGE_PRESENT)
ret = machine_to_phys(XMADDR(ret)).paddr;
@@ -286,8 +284,8 @@
pgdval_t xen_pgd_val(pgd_t pgd)
{
pteval_t ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
}
--
Xen supports the notion of a debug interrupt which can be triggered
from the console. For now this is implemented to show pending events,
masks and each CPU's pending event set.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/xen/events.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
arch/x86/xen/smp.c | 19 ++++++++++++++-----
arch/x86/xen/xen-ops.h | 3 +++
3 files changed, 64 insertions(+), 5 deletions(-)
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
--- a/arch/x86/xen/events.c
+++ b/arch/x86/xen/events.c
@@ -455,6 +455,53 @@
notify_remote_via_irq(irq);
}
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
+{
+ struct shared_info *sh = HYPERVISOR_shared_info;
+ int cpu = smp_processor_id();
+ int i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(debug_lock);
+
+ spin_lock_irqsave(&debug_lock, flags);
+
+ printk("vcpu %d\n ", cpu);
+
+ for_each_online_cpu(i) {
+ struct vcpu_info *v = per_cpu(xen_vcpu, i);
+ printk("%d: masked=%d pending=%d event_sel %08lx\n ", i,
+ (get_irq_regs() && i == cpu) ? !(get_irq_regs()->flags & X86_EFLAGS_IF) : v->evtchn_upcall_mask,
+ v->evtchn_upcall_pending,
+ v->evtchn_pending_sel);
+ }
+ printk("pending:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_pending[i],
+ i % 8 == 0 ? "\n " : " ");
+ printk("\nmasks:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nunmasked:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\npending list:\n");
+ for(i = 0; i < NR_EVENT_CHANNELS; i++) {
+ if (sync_test_bit(i, sh->evtchn_pending)) {
+ printk(" %d: event %d -> irq %d\n",
+ cpu_evtchn[i], ...We need to set up the shared_info pointer once we've mapped the real
shared_info into its fixmap slot. That needs to happen once the general
pagetable setup has been done. Previously, the UP shared_info was set
up one in xen_start_kernel, but that was left pointing to the dummy
shared info. Unfortunately there's no really good place to do a later
setup of the shared_info in UP, so just do it once the pagetable setup
has been done.
[ Stable: needed in 2.6.24.x ]
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Stable Kernel <stable@kernel.org>
---
arch/x86/xen/enlighten.c | 51 +++++++++++++++++++++++++---------------------
1 file changed, 28 insertions(+), 23 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -104,6 +104,7 @@
int err;
struct vcpu_info *vcpup;
+ BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info);
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
if (!have_vcpu_info_placement)
@@ -806,6 +807,31 @@
PFN_DOWN(__pa(xen_start_info->pt_base)));
}
+static __init void setup_shared_info(void)
+{
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+ /*
+ * Create a mapping for the shared info page.
+ * Should be set_fixmap(), but shared_info is a machine
+ * address with no corresponding pseudo-phys address.
+ */
+ set_pte_mfn(addr,
+ PFN_DOWN(xen_start_info->shared_info),
+ PAGE_KERNEL);
+
+ HYPERVISOR_shared_info = (struct shared_info *)addr;
+ } else
+ HYPERVISOR_shared_info =
+ (struct shared_info *)__va(xen_start_info->shared_info);
+
+#ifndef CONFIG_SMP
+ /* In UP this is as good a place as any to set up shared info */
+ xen_setup_vcpu_info_placement();
+#endif
+}
+
static __init void xen_pagetable_setup_done(pgd_t *base)
{
/* This will work as long as patching hasn't happened yet
@@ ...Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
include/xen/page.h | 16 ++--------------
1 file changed, 2 insertions(+), 14 deletions(-)
diff --git a/include/xen/page.h b/include/xen/page.h
--- a/include/xen/page.h
+++ b/include/xen/page.h
@@ -8,27 +8,15 @@
#include <xen/features.h>
-#ifdef CONFIG_X86_PAE
/* Xen machine address */
typedef struct xmaddr {
- unsigned long long maddr;
+ phys_addr_t maddr;
} xmaddr_t;
/* Xen pseudo-physical address */
typedef struct xpaddr {
- unsigned long long paddr;
+ phys_addr_t paddr;
} xpaddr_t;
-#else
-/* Xen machine address */
-typedef struct xmaddr {
- unsigned long maddr;
-} xmaddr_t;
-
-/* Xen pseudo-physical address */
-typedef struct xpaddr {
- unsigned long paddr;
-} xpaddr_t;
-#endif
#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
--
Rename (alloc|release)_(pt|pd) to pte/pmd to explicitly match the name
of the appropriate pagetable level structure.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/kernel/paravirt.c | 10 +++++-----
arch/x86/kernel/vmi_32.c | 20 ++++++++++----------
arch/x86/mm/init_32.c | 6 +++---
arch/x86/mm/ioremap.c | 2 +-
arch/x86/mm/pageattr.c | 2 +-
arch/x86/mm/pgtable.c | 16 ++++++++--------
arch/x86/xen/enlighten.c | 30 +++++++++++++++---------------
include/asm-x86/paravirt.h | 32 ++++++++++++++++----------------
include/asm-x86/pgalloc.h | 18 +++++++++---------
9 files changed, 68 insertions(+), 68 deletions(-)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -382,11 +382,11 @@
.flush_tlb_single = native_flush_tlb_single,
.flush_tlb_others = native_flush_tlb_others,
- .alloc_pt = paravirt_nop,
- .alloc_pd = paravirt_nop,
- .alloc_pd_clone = paravirt_nop,
- .release_pt = paravirt_nop,
- .release_pd = paravirt_nop,
+ .alloc_pte = paravirt_nop,
+ .alloc_pmd = paravirt_nop,
+ .alloc_pmd_clone = paravirt_nop,
+ .release_pte = paravirt_nop,
+ .release_pmd = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -392,13 +392,13 @@
}
#endif
-static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
-static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
{
/*
* This call comes in very early, before mem_map is setup.
@@ -409,20 +409,20 @@
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
...Make KERNEL_PGD_PTRS common, as previously it was only being defined for 32-bit. There are a couple of follow-on changes from this: - KERNEL_PGD_PTRS was being defined in terms of USER_PGD_PTRS. The definition of USER_PGD_PTRS doesn't really make much sense on x86-64, since it can have two different user address-space configurations. I renamed USER_PGD_PTRS to KERNEL_PGD_BOUNDARY, which is meaningful for all of 32/32, 32/64 and 64/64 process configurations. - USER_PTRS_PER_PGD was also defined and was being used for similar purposes. Converting its users to KERNEL_PGD_BOUNDARY left it completely unused, and so I removed it. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Cc: Andi Kleen <ak@suse.de> Cc: Zach Amsden <zach@vmware.com> Cc: Ingo Molnar <mingo@elte.hu> --- arch/x86/kernel/reboot.c | 4 ++-- arch/x86/kernel/smpboot_32.c | 4 ++-- arch/x86/kernel/vmi_32.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 4 ++-- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/pgtable.c | 12 ++++++------ include/asm-x86/pgtable.h | 4 +++- include/asm-x86/pgtable_32.h | 3 --- 8 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -8,6 +8,7 @@ #include <asm/apic.h> #include <asm/desc.h> #include <asm/hpet.h> +#include <asm/pgtable.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> @@ -15,7 +16,6 @@ # include <linux/dmi.h> # include <linux/ctype.h> # include <linux/mc146818rtc.h> -# include <asm/pgtable.h> #else # include <asm/iommu.h> #endif @@ -266,7 +266,7 @@ /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. */ - memcpy(swapper_pg_dir, swapper_pg_dir + ...
If an event comes in while events are currently being processed, then
just increment the counter and have the outer event loop reprocess the
pending events. This prevents unbounded recursion on heavy event
loads (of course massive event storms will cause infinite loops).
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/xen/events.c | 46 ++++++++++++++++++++++++++++++----------------
1 file changed, 30 insertions(+), 16 deletions(-)
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
--- a/arch/x86/xen/events.c
+++ b/arch/x86/xen/events.c
@@ -517,29 +517,43 @@
int cpu = get_cpu();
struct shared_info *s = HYPERVISOR_shared_info;
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
- unsigned long pending_words;
+ static DEFINE_PER_CPU(unsigned, nesting_count);
+ unsigned count;
- vcpu_info->evtchn_upcall_pending = 0;
+ do {
+ unsigned long pending_words;
- /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
- pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
- while (pending_words != 0) {
- unsigned long pending_bits;
- int word_idx = __ffs(pending_words);
- pending_words &= ~(1UL << word_idx);
+ vcpu_info->evtchn_upcall_pending = 0;
- while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
- int bit_idx = __ffs(pending_bits);
- int port = (word_idx * BITS_PER_LONG) + bit_idx;
- int irq = evtchn_to_irq[port];
+ if (__get_cpu_var(nesting_count)++)
+ goto out;
- if (irq != -1) {
- regs->orig_ax = ~irq;
- do_IRQ(regs);
+ /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+ pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+ while (pending_words != 0) {
+ unsigned long pending_bits;
+ int word_idx = __ffs(pending_words);
+ pending_words &= ~(1UL << word_idx);
+
+ while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + ...Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/pgtable.c | 28 +++++++---------------------
1 file changed, 7 insertions(+), 21 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -43,34 +43,31 @@
#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
-#ifdef CONFIG_X86_64
static inline void pgd_list_add(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
- unsigned long flags;
- spin_lock_irqsave(&pgd_lock, flags);
list_add(&page->lru, &pgd_list);
- spin_unlock_irqrestore(&pgd_lock, flags);
}
static inline void pgd_list_del(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
- unsigned long flags;
- spin_lock_irqsave(&pgd_lock, flags);
list_del(&page->lru);
- spin_unlock_irqrestore(&pgd_lock, flags);
}
+#ifdef CONFIG_X86_64
pgd_t *pgd_alloc(struct mm_struct *mm)
{
unsigned boundary;
pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ unsigned long flags;
if (!pgd)
return NULL;
+ spin_lock_irqsave(&pgd_lock, flags);
pgd_list_add(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
/*
* Copy kernel pointers in from init.
* Could keep a freelist or slab cache of those because the kernel
@@ -86,8 +83,11 @@
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
+ unsigned long flags;
BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
+ spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
free_page((unsigned long)pgd);
}
#else
@@ -101,20 +101,6 @@
* vmalloc faults work because attached pagetables are never freed.
* -- wli
*/
-static inline void pgd_list_add(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_add(&page->lru, &pgd_list);
-}
-
-static inline void pgd_list_del(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_del(&page->lru);
-}
-
#define ...The sysenter path tries to enable interrupts immediately. Unfortunately this doesn't work in a paravirt environment, because not enough kernel state has been set up at that point (namely, pointing %fs to the kernel percpu data segment). To fix this, defer ENABLE_INTERRUPTS until after the kernel state has been set up. Unfortunately this means that we're running with interrupts disabled for a while without calling the IRQ tracing code, but that can't be called without setting up %fs either. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> --- arch/x86/kernel/entry_32.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -290,10 +290,10 @@ movl TSS_sysenter_sp0(%esp),%esp ENTRY(sysenter_past_esp) /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs and here we enable it straight after entry: + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. */ - ENABLE_INTERRUPTS(CLBR_NONE) pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -329,6 +329,7 @@ pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -545,6 +546,7 @@ pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) GET_THREAD_INFO(%ebp) movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace --
We can fold the essentially common pte functions together now.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/xen/mmu.c | 125 ++++++++++++++++++----------------------------------
1 file changed, 44 insertions(+), 81 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -171,6 +171,49 @@
xen_set_pte(ptep, pteval);
}
+pteval_t xen_pte_val(pte_t pte)
+{
+ pteval_t ret = pte.pte;
+
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+
+ return ret;
+}
+
+pgdval_t xen_pgd_val(pgd_t pgd)
+{
+ pgdval_t ret = pgd.pgd;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return ret;
+}
+
+pte_t xen_make_pte(pteval_t pte)
+{
+ if (pte & _PAGE_PRESENT) {
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+ pte &= ~(_PAGE_PCD | _PAGE_PWT);
+ }
+
+ return (pte_t){ .pte = pte };
+}
+
+pgd_t xen_make_pgd(pgdval_t pgd)
+{
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
+}
+
+pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ pmdval_t ret = native_pmd_val(pmd);
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return ret;
+}
#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
@@ -214,97 +257,17 @@
xen_set_pmd(pmdp, __pmd(0));
}
-pteval_t xen_pte_val(pte_t pte)
-{
- pteval_t ret = pte.pte;
-
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
-
- return ret;
-}
-
-pmdval_t xen_pmd_val(pmd_t pmd)
-{
- pmdval_t ret = pmd.pmd;
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
- return ret;
-}
-
-pgdval_t xen_pgd_val(pgd_t pgd)
-{
- pgdval_t ret = pgd.pgd;
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
- return ret;
-}
-
-pte_t xen_make_pte(pteval_t ...retrigger_dynirq() was incomplete, and didn't properly set the event
to be pending again. It doesn't seem to actually get used.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/xen/events.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
--- a/arch/x86/xen/events.c
+++ b/arch/x86/xen/events.c
@@ -601,10 +601,16 @@
static int retrigger_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
+ struct shared_info *sh = HYPERVISOR_shared_info;
int ret = 0;
if (VALID_EVTCHN(evtchn)) {
- set_evtchn(evtchn);
+ int masked;
+
+ masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask);
+ sync_set_bit(evtchn, sh->evtchn_pending);
+ if (!masked)
+ unmask_evtchn(evtchn);
ret = 1;
}
--
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
include/asm-x86/xen/hypercall.h | 6 ++
include/asm-x86/xen/interface.h | 4 +
include/xen/interface/callback.h | 102 ++++++++++++++++++++++++++++++++++++++
3 files changed, 112 insertions(+)
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -161,6 +161,12 @@
return _hypercall4(int, set_callbacks,
event_selector, event_address,
failsafe_selector, failsafe_address);
+}
+
+static inline int
+HYPERVISOR_callback_op(int cmd, void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
}
static inline int
diff --git a/include/asm-x86/xen/interface.h b/include/asm-x86/xen/interface.h
--- a/include/asm-x86/xen/interface.h
+++ b/include/asm-x86/xen/interface.h
@@ -171,6 +171,10 @@
unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
};
+struct xen_callback {
+ unsigned long cs;
+ unsigned long eip;
+};
#endif /* !__ASSEMBLY__ */
/*
diff --git a/include/xen/interface/callback.h b/include/xen/interface/callback.h
new file mode 100644
--- /dev/null
+++ b/include/xen/interface/callback.h
@@ -0,0 +1,102 @@
+/******************************************************************************
+ * callback.h
+ *
+ * Register guest OS callbacks with Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ ...Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/pageattr.c | 2 --
include/asm-x86/pgalloc.h | 10 ++++++++++
include/asm-x86/pgalloc_32.h | 10 ----------
3 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -479,9 +479,7 @@
goto out_unlock;
pbase = (pte_t *)page_address(base);
-#ifdef CONFIG_X86_32
paravirt_alloc_pt(&init_mm, page_to_pfn(base));
-#endif
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
#ifdef CONFIG_X86_64
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -4,6 +4,16 @@
#include <linux/threads.h>
#include <linux/mm.h> /* for struct page */
#include <linux/pagemap.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_alloc_pt(mm, pfn) do { } while (0)
+#define paravirt_alloc_pd(mm, pfn) do { } while (0)
+#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
+#define paravirt_release_pt(pfn) do { } while (0)
+#define paravirt_release_pd(pfn) do { } while (0)
+#endif
/*
* Allocate and free page tables.
diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h
--- a/include/asm-x86/pgalloc_32.h
+++ b/include/asm-x86/pgalloc_32.h
@@ -1,15 +1,5 @@
#ifndef _I386_PGALLOC_H
#define _I386_PGALLOC_H
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define paravirt_alloc_pt(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
-#define paravirt_release_pt(pfn) do { } while (0)
-#define paravirt_release_pd(pfn) do { } while (0)
-#endif
static inline void pmd_populate_kernel(struct mm_struct *mm,
pmd_t *pmd, pte_t *pte)
--
Use jmp rather than call for the iret fixup, so its consistent with
the sysexit fixup, and it simplifies the stack (which is already
complex).
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/kernel/entry_32.S | 3 +--
arch/x86/xen/xen-asm.S | 22 +++++++++-------------
2 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1049,8 +1049,7 @@
cmpl $xen_iret_end_crit,%eax
jae 1f
- call xen_iret_crit_fixup
- jmp 2f
+ jmp xen_iret_crit_fixup
1: cmpl $xen_sysexit_start_crit,%eax
jb 2f
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -223,9 +223,7 @@
ds } SAVE_ALL state
eax }
: :
- ebx }
- ----------------
- return addr <- esp
+ ebx }<- esp
----------------
In order to deliver the nested exception properly, we need to shift
@@ -240,10 +238,8 @@
it's usermode state which we eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
- /* offsets +4 for return address */
-
/*
- Paranoia: Make sure we're really coming from userspace.
+ Paranoia: Make sure we're really coming from kernel space.
One could imagine a case where userspace jumps into the
critical range address, but just before the CPU delivers a GP,
it decides to deliver an interrupt instead. Unlikely?
@@ -252,32 +248,32 @@
jump instruction itself, not the destination, but some virtual
environments get this wrong.
*/
- movl PT_CS+4(%esp), %ecx
+ movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
je 2f
- lea PT_ORIG_EAX+4(%esp), %esi
- lea PT_EFLAGS+4(%esp), %edi
+ lea PT_ORIG_EAX(%esp), %esi
+ lea PT_EFLAGS(%esp), %edi
/* If eip is before iret_restore_end then stack
hasn't been restored yet. */
cmp $iret_restore_end, ...xen_irq_enable_direct and xen_sysexit were using "andw $0x00ff,
XEN_vcpu_info_pending(vcpu)" to unmask events and test for pending ones
in one instuction.
Unfortunately, the pending flag must be modified with a locked operation
since it can be set by another CPU, and the unlocked form of this
operation was causing the pending flag to get lost, allowing the processor
to return to usermode with pending events and ultimately deadlock.
The simple fix would be to make it a locked operation, but that's rather
costly and unnecessary. The fix here is to split the mask-clearing and
pending-testing into two instructions; the interrupt window between
them is of no concern because either way pending or new events will
be processed.
This should fix lingering bugs in using direct vcpu structure access too.
[ Stable: needed in 2.6.24.x ]
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Stable <stable@kernel.org>
---
arch/x86/xen/enlighten.c | 2 +-
arch/x86/xen/xen-asm.S | 9 +++++++--
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -96,7 +96,7 @@
*
* 0: not available, 1: available
*/
-static int have_vcpu_info_placement = 0;
+static int have_vcpu_info_placement = 1;
static void __init xen_vcpu_setup(int cpu)
{
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -33,12 +33,17 @@
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
- /* Clear mask and test pending */
- andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ /* Unmask events */
+ movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. ...Xen's pte operations on mfns can be unified like the kernel's pfn operations.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
include/xen/page.h | 30 +++++++++++++++---------------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/include/xen/page.h b/include/xen/page.h
--- a/include/xen/page.h
+++ b/include/xen/page.h
@@ -125,37 +125,37 @@
#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v))))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
-#ifdef CONFIG_X86_PAE
-#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
- (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
+static inline unsigned long pte_mfn(pte_t pte)
+{
+ return (pte.pte & ~_PAGE_NX) >> PAGE_SHIFT;
+}
static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
pte_t pte;
- pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) |
- (pgprot_val(pgprot) >> 32);
- pte.pte_high &= (__supported_pte_mask >> 32);
- pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
- pte.pte_low &= __supported_pte_mask;
+ pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
+ (pgprot_val(pgprot) & __supported_pte_mask);
return pte;
}
-static inline unsigned long long pte_val_ma(pte_t x)
+static inline pteval_t pte_val_ma(pte_t pte)
{
- return x.pte;
+ return pte.pte;
}
+
+static inline pte_t __pte_ma(pteval_t x)
+{
+ return (pte_t) { .pte = x };
+}
+
+#ifdef CONFIG_X86_PAE
#define pmd_val_ma(v) ((v).pmd)
#define pud_val_ma(v) ((v).pgd.pgd)
-#define __pte_ma(x) ((pte_t) { .pte = (x) })
#define __pmd_ma(x) ((pmd_t) { (x) } )
#else /* !X86_PAE */
-#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
-#define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
-#define pte_val_ma(x) ((x).pte)
#define pmd_val_ma(v) ((v).pud.pgd.pgd)
-#define __pte_ma(x) ((pte_t) { (x) } )
#endif /* CONFIG_X86_PAE */
#define pgd_val_ma(x) ((x).pgd)
--
Convert asm-x86/pgalloc_64.h from macros into functions (#include hell
prevents __*_free_tlb from being inline, but they're probably a bit
big to inline anyway).
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/init_64.c | 16 ++++++++++++++++
include/asm-x86/pgalloc_64.h | 33 ++++++++++++++++++---------------
2 files changed, 34 insertions(+), 15 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -847,3 +847,19 @@
return 0;
}
#endif
+
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ tlb_remove_page(tlb, pte);
+}
+
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ tlb_remove_page(tlb, virt_to_page(pmd));
+}
+
+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ tlb_remove_page(tlb, virt_to_page(pud));
+}
diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h
--- a/include/asm-x86/pgalloc_64.h
+++ b/include/asm-x86/pgalloc_64.h
@@ -1,16 +1,24 @@
#ifndef _X86_64_PGALLOC_H
#define _X86_64_PGALLOC_H
-#include <asm/pda.h>
#include <linux/threads.h>
#include <linux/mm.h>
+#include <asm/pda.h>
-#define pmd_populate_kernel(mm, pmd, pte) \
- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
-#define pud_populate(mm, pud, pmd) \
- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)))
-#define pgd_populate(mm, pgd, pud) \
- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)))
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
+{
+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
+}
+
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+}
+
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+}
#define pmd_pgtable(pmd) pmd_page(pmd)
@@ -121,13 +129,8 @@
...Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
arch/x86/mm/pgtable.c | 15 +++++++++++++++
include/asm-x86/pgtable.h | 11 ++---------
2 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -262,3 +262,18 @@
return changed;
}
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int ret = 0;
+
+ if (pte_young(*ptep))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ &ptep->pte);
+
+ if (ret)
+ pte_update(vma->vm_mm, addr, ptep);
+
+ return ret;
+}
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -304,15 +304,8 @@
pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
- int __ret = 0; \
- if (pte_young(*(ptep))) \
- __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
- &(ptep)->pte); \
- if (__ret) \
- pte_update((vma)->vm_mm, addr, ptep); \
- __ret; \
-})
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
#define ptep_clear_flush_young(vma, address, ptep) \
--
