Tracing 'async' and *pfn is useless, since 'async' is always true,
and '*pfn' is always "fault_pfn'
We can trace 'gva' and 'gfn' instead, it can help us to see the
life-cycle of an async_pf
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu.c | 2 +-
include/trace/events/kvm.h | 12 +++++++-----
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 39cc0c6..5275c50 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2627,7 +2627,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
put_page(pfn_to_page(*pfn));
if (!no_apf && can_do_async_pf(vcpu)) {
- trace_kvm_try_async_get_page(async, *pfn);
+ trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(gva, gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 9c2cc6a..30063c6 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -188,18 +188,20 @@ TRACE_EVENT(kvm_age_page,
#ifdef CONFIG_KVM_ASYNC_PF
TRACE_EVENT(
kvm_try_async_get_page,
- TP_PROTO(bool async, u64 pfn),
- TP_ARGS(async, pfn),
+ TP_PROTO(u64 gva, u64 gfn),
+ TP_ARGS(gva, gfn),
TP_STRUCT__entry(
- __field(__u64, pfn)
+ __field(u64, gva)
+ __field(u64, gfn)
),
TP_fast_assign(
- __entry->pfn = (!async) ? pfn : (u64)-1;
+ __entry->gva = gva;
+ __entry->gfn = gfn;
),
- TP_printk("pfn %#llx", __entry->pfn)
+ TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
);
TRACE_EVENT(
--
1.7.0.4
--
Use 'DECLARE_EVENT_CLASS' to cleanup async_pf tracepoints Acked-by: Gleb Natapov <gleb@redhat.com> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> --- include/trace/events/kvm.h | 76 ++++++++++++++++++++----------------------- 1 files changed, 35 insertions(+), 41 deletions(-) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 30063c6..7bec396 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -186,59 +186,71 @@ TRACE_EVENT(kvm_age_page, ); #ifdef CONFIG_KVM_ASYNC_PF -TRACE_EVENT( - kvm_try_async_get_page, +DECLARE_EVENT_CLASS(kvm_async_get_page_class, + TP_PROTO(u64 gva, u64 gfn), + TP_ARGS(gva, gfn), TP_STRUCT__entry( - __field(u64, gva) + __field(__u64, gva) __field(u64, gfn) - ), + ), TP_fast_assign( __entry->gva = gva; __entry->gfn = gfn; - ), + ), TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn) ); -TRACE_EVENT( - kvm_async_pf_not_present, +DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page, + + TP_PROTO(u64 gva, u64 gfn), + + TP_ARGS(gva, gfn) +); + +DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_doublefault, + + TP_PROTO(u64 gva, u64 gfn), + + TP_ARGS(gva, gfn) +); + +DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready, + TP_PROTO(u64 token, u64 gva), + TP_ARGS(token, gva), TP_STRUCT__entry( __field(__u64, token) __field(__u64, gva) - ), + ), TP_fast_assign( __entry->token = token; __entry->gva = gva; - ), + ), + + TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva) - TP_printk("token %#llx gva %#llx not present", __entry->token, - __entry->gva) ); -TRACE_EVENT( - kvm_async_pf_ready, +DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present, + TP_PROTO(u64 token, u64 gva), - TP_ARGS(token, gva), - TP_STRUCT__entry( - __field(__u64, token) - __field(__u64, gva) - ), + TP_ARGS(token, gva) +); ...
Don't search later slots if the slot is empty Acked-by: Gleb Natapov <gleb@redhat.com> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> --- arch/x86/kvm/x86.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2cfdf2d..9b543f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6206,8 +6206,8 @@ static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) u32 key = kvm_async_pf_hash_fn(gfn); for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && - (vcpu->arch.apf.gfns[key] != gfn || - vcpu->arch.apf.gfns[key] == ~0); i++) + (vcpu->arch.apf.gfns[key] != gfn && + vcpu->arch.apf.gfns[key] != ~0); i++) key = kvm_async_pf_next_probe(key); return key; -- 1.7.0.4 --
In current code, it checks async pf completion out of the wait context,
like this:
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
r = vcpu_enter_guest(vcpu);
else {
......
kvm_vcpu_block(vcpu)
^- waiting until 'async_pf.done' is not empty
}
kvm_check_async_pf_completion(vcpu)
^- delete list from async_pf.done
So, if we check aysnc pf completion first, it can be blocked at
kvm_vcpu_block
Fixed by mark the vcpu is unhalted in kvm_check_async_pf_completion()
path
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/x86.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9b543f4..4da8485 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6280,6 +6280,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
vcpu->arch.fault.address = work->arch.token;
kvm_inject_page_fault(vcpu);
}
+ vcpu->arch.apf.halted = false;
}
bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
--
1.7.0.4
--
If it's no need to inject async #PF to PV guest we can handle
more completed apfs at one time, so we can retry guest #PF
as early as possible
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/include/asm/kvm_host.h | 3 ++-
arch/x86/kvm/x86.c | 8 ++++++--
virt/kvm/async_pf.c | 28 ++++++++++++++++------------
3 files changed, 24 insertions(+), 15 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1be0058..c95b3ff 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -818,7 +818,8 @@ bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
-void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+/* return true if we can handle more completed apfs, false otherwise */
+bool kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4da8485..189664a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6265,7 +6265,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
}
}
-void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+bool kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
trace_kvm_async_pf_ready(work->arch.token, work->gva);
@@ -6274,13 +6274,17 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
else
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+ vcpu->arch.apf.halted = false;
+
if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
vcpu->arch.fault.error_code = 0;
vcpu->arch.fault.address = work->arch.token;
kvm_inject_page_fault(vcpu);
+ return false;
}
- vcpu->arch.apf.halted = ...No need to change kvm_arch_async_page_present() to return anything. You
can do while loop like this:
while (!list_empty_careful(&vcpu->async_pf.done) &&
kvm_arch_can_inject_async_page_present(vcpu)) {
}
If kvm_arch_async_page_present() call injects exception
kvm_arch_can_inject_async_page_present() will return false on next
iteration.
--
Gleb.
--
Yeah, it's a better way, i'll fix it, thanks Gleb! --
If it's no need to inject async #PF to PV guest we can handle
more completed apfs at one time, so we can retry guest #PF
as early as possible
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
virt/kvm/async_pf.c | 32 ++++++++++++++++----------------
1 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 60df9e0..100c66e 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -124,24 +124,24 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
{
struct kvm_async_pf *work;
- if (list_empty_careful(&vcpu->async_pf.done) ||
- !kvm_arch_can_inject_async_page_present(vcpu))
- return;
-
- spin_lock(&vcpu->async_pf.lock);
- work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link);
- list_del(&work->link);
- spin_unlock(&vcpu->async_pf.lock);
+ while (!list_empty_careful(&vcpu->async_pf.done) &&
+ kvm_arch_can_inject_async_page_present(vcpu)) {
+ spin_lock(&vcpu->async_pf.lock);
+ work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
+ link);
+ list_del(&work->link);
+ spin_unlock(&vcpu->async_pf.lock);
- if (work->page)
- kvm_arch_async_page_ready(vcpu, work);
- kvm_arch_async_page_present(vcpu, work);
+ if (work->page)
+ kvm_arch_async_page_ready(vcpu, work);
+ kvm_arch_async_page_present(vcpu, work);
- list_del(&work->queue);
- vcpu->async_pf.queued--;
- if (work->page)
- put_page(work->page);
- kmem_cache_free(async_pf_cache, work);
+ list_del(&work->queue);
+ vcpu->async_pf.queued--;
+ if (work->page)
+ put_page(work->page);
+ kmem_cache_free(async_pf_cache, work);
+ }
}
int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
--
1.7.0.4
--
Acked-by: Gleb Natapov <gleb@redhat.com> -- Gleb. --
In kvm_async_pf_wakeup_all(), we add a dummy apf to vcpu->async_pf.done
without holding vcpu->async_pf.lock, it will break if we are handling apfs
at this time.
Also use 'list_empty_careful()' instead of 'list_empty()'
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
virt/kvm/async_pf.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index d57ec92..6ef3373 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -200,7 +200,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
{
struct kvm_async_pf *work;
- if (!list_empty(&vcpu->async_pf.done))
+ if (!list_empty_careful(&vcpu->async_pf.done))
return 0;
work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
@@ -211,7 +211,10 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
get_page(bad_page);
INIT_LIST_HEAD(&work->queue); /* for list_del to work */
+ spin_lock(&vcpu->async_pf.lock);
list_add_tail(&work->link, &vcpu->async_pf.done);
+ spin_unlock(&vcpu->async_pf.lock);
+
vcpu->async_pf.queued++;
return 0;
}
--
1.7.0.4
--
This should never happen to well behaved guest, but malicious guest can do it -- Gleb. --
Don't make a KVM_REQ_UNHALT request after async pf is completed since it
can break guest's 'HLT' instruction.
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/x86.c | 13 ++++++++++---
include/linux/kvm_host.h | 6 ++++++
virt/kvm/kvm_main.c | 9 ++++++++-
3 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 189664a..c57fb38 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6105,13 +6105,20 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ if ((vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
- || !list_empty_careful(&vcpu->async_pf.done)
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
|| vcpu->arch.nmi_pending ||
(kvm_arch_interrupt_allowed(vcpu) &&
- kvm_cpu_has_interrupt(vcpu));
+ kvm_cpu_has_interrupt(vcpu)))
+ return 1;
+
+ if (!list_empty_careful(&vcpu->async_pf.done)) {
+ vcpu->arch.apf.halted = false;
+ return 2;
+ }
+
+ return 0;
}
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 462b982..8def043 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -436,6 +436,12 @@ void kvm_arch_hardware_disable(void *garbage);
int kvm_arch_hardware_setup(void);
void kvm_arch_hardware_unsetup(void);
void kvm_arch_check_processor_compat(void *rtn);
+
+/*
+ * return value: > 0 if the vcpu is runnable, 0 if not.
+ * Especially, if the return value == 1, we should make a
+ * 'KVM_REQ_UNHALT' request.
+ */
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
void kvm_free_physmem(struct kvm *kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dbe1d6a..9ccf105 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1375,14 +1375,21 @@ void mark_page_dirty(struct kvm *kvm, gfn_t ...kvm_arch_vcpu_runnable() shouldn't change vcpu state. I don't like the
way it propagates internal x86 state to kvm_vcpu_block() too.
May be what you are looking for is the patch below? (not tested).
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2cfdf2d..f7aed95 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5295,8 +5295,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
{
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
- vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
+ if (list_empty_careful(&vcpu->async_pf.done))
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
@@ -6279,6 +6280,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
vcpu->arch.fault.error_code = 0;
vcpu->arch.fault.address = work->arch.token;
kvm_inject_page_fault(vcpu);
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
}
--
Gleb.
--
if nmi/interrupt and apfs completed event occur at the same time, we will miss to
Have a stupid question, why we make the vcpu runnable here? Sorry i don't know
kvm pv guest to much. :-(
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2044302..27a712b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5267,6 +5267,12 @@ out:
return r;
}
+static bool kvm_vcpu_leave_halt_state(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.nmi_pending ||
+ (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_has_interrupt(vcpu));
+}
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
@@ -5299,8 +5305,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
{
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
- vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
+ if (kvm_vcpu_leave_halt_state(vcpu))
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
@@ -6113,9 +6120,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
!vcpu->arch.apf.halted)
|| !list_empty_careful(&vcpu->async_pf.done)
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
- || vcpu->arch.nmi_pending ||
- (kvm_arch_interrupt_allowed(vcpu) &&
- kvm_cpu_has_interrupt(vcpu));
+ || kvm_vcpu_leave_halt_state(vcpu);
}
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
--
No, we will not. If nmi/interrupt and apfs completed event occur at the same time kvm_vcpu_block() will exit with KVM_REQ_UNHALT set, but cpu will not be unhalted because of list_empty_careful(&vcpu->async_pf.done) check. Vcpu then will process pending apf completion and enter kvm_vcpu_block() again which will immediately exit because kvm_arch_vcpu_runnable() will return true since there is pending Because kvm_arch_vcpu_runnable() does not check for pending exceptions. Since now we do not unhalt vcpu when apf completion happens we need to unhalt it here. But, as I said, the patch is untested. -- Gleb. --
Thanks for your explanation, but if it has nmi/interrupt pending, kvm_arch_can_inject_async_page_present() always return false in PV guest case, it can not process pending apf completion, so, the vcpu is remain halt state forever? Also, the pv guest can only handle an apf completion at one time, it can not ensure As i know, exception can not let guest exit halt state, only NMI/interruption can do it, yes? :-) --
kvm_event_needs_reinjection() checks for nmi/interrupts that need reinjection (not injection). Those are nmi/interrupts that was injected but injection failed for some reason. For nmi, for instance, kvm_arch_vcpu_runnable() checks vcpu->arch.nmi_pending, but kvm_event_needs_reinjection() checks for vcpu->arch.nmi_injected. NMI moves from nmi_pending to nmi_injected when it is injected into vcpu In case of PV guest it will be woken up by apf completion by On physical HW exception cannot happen while cpu is in halt state, but with PV we define what guest can and cannot expect, so when guest explicitly enables apf it should be able to handle it during halt. -- Gleb. --
Yeah, nmi is handled like this way, but for interrupt: If interruption controller is in userspace (!irqchip_in_kernel(v->kvm)), kvm_arch_vcpu_runnable() checks vcpu->arch.interrupt.pending and kvm_event_needs_reinjection() also checks vcpu->arch.interrupt.pending. Consider this case: - Guest vcpu executes 'HLT' - wakeup the vcpu and inject interrupt and apfs is completed at this time - then the vcpu can't handle apf completion and .done list keeps nonempty. Ah, i see, thanks your very much. --
If irqchip is in userspace apf is disabled (see mmu.c:can_do_async_pf()). The reason for this is that when irqchip_in_kernel(v->kvm) cpu sleeps in userspace during halt, so all event that can cause it to be unhalted should be generated in userspace too. This is also the reason you can't have pit in kernel and irqchip in userpsace. -- Gleb. --
Oh, thank you very much for answering so many questions, and your patch is looks good for me! ;-) --
It is still not tested though :) -- Gleb. --
Tested with full virtualization guests, it works well for me. Thanks! --
Thank you. Will repost properly. -- Gleb. --
There is no need to optimize this. Current code is simpler, less error-prone. Applied 1-6, thanks. --
| Greg KH | Og dreams of kernels |
| Jens Axboe | [PATCH 31/33] Fusion: sg chaining support |
| Arnd Bergmann | Re: |
