Remove 'struct kvm_unsync_walk' since it's not used now
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu.c | 5 -----
1 files changed, 0 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b44380b..a23ca75 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -172,11 +172,6 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-
-struct kvm_unsync_walk {
- int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
-};
-
typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
static struct kmem_cache *pte_chain_cache;
--
1.6.1.2
--
- calculate zapped page number properly in mmu_zap_unsync_children()
- calculate freeed page number properly kvm_mmu_change_mmu_pages()
- restart list walking if have children page zapped
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu.c | 7 ++++---
1 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a23ca75..8f4f781 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1483,8 +1483,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
for_each_sp(pages, sp, parents, i) {
kvm_mmu_zap_page(kvm, sp);
mmu_pages_clear_parents(&parents);
+ zapped++;
}
- zapped += pages.nr;
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -1540,7 +1540,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_zap_page(kvm, page);
+ used_pages -= kvm_mmu_zap_page(kvm, page);
used_pages--;
}
kvm->arch.n_free_mmu_pages = 0;
@@ -1589,7 +1589,8 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
&& !sp->role.invalid) {
pgprintk("%s: zap %lx %x\n",
__func__, gfn, sp->role.word);
- kvm_mmu_zap_page(kvm, sp);
+ if (kvm_mmu_zap_page(kvm, sp))
+ nn = bucket->first;
}
}
}
--
1.6.1.2
--
This looks correct, I don't understand how we work in the first place. I don't understand why this is needed. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
There is the code segment in mmu_unshadow():
|hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
| if (sp->gfn == gfn && !sp->role.direct
| && !sp->role.invalid) {
| pgprintk("%s: zap %lx %x\n",
| __func__, gfn, sp->role.word);
| kvm_mmu_zap_page(kvm, sp);
| }
| }
in the loop, if nn is zapped, hlist_for_each_entry_safe() access nn will
cause crash. and it's checked in other functions, such as kvm_mmu_zap_all(),
kvm_mmu_unprotect_page()...
Thanks,
Xiao
--
hlist_for_each_entry_safe() is supposed to be be safe against removal of the element that is pointed to by the iteration cursor. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
Hi Avi,
If we destroyed the next point, hlist_for_each_entry_safe() is unsafe.
List hlist_for_each_entry_safe()'s code:
|#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \
| for (pos = (head)->first; \
| pos && ({ n = pos->next; 1; }) && \
| ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
| pos = n)
if n is destroyed:
'pos = n, n = pos->next'
then it access n again, it's unsafe/illegal for us.
Xiao
--
But kvm_mmu_zap_page() will only destroy sp == tpos == pos; n points at pos->next already, so it's safe. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
kvm_mmu_zap_page(sp) not only zaps sp but also zaps all sp's unsync children pages, if n is just sp's unsyc child, just at the same hlist and just behind sp, it will crash. :-) --
Ouch. I see now, thanks for explaining. One way to fix it is to make kvm_mmu_zap_page() only zap the page it is given, and use sp->role.invalid on its children. But it's better to fix it now quickly and do the more involved fixes later. Just change the assignment to a 'goto restart;' please, I don't like playing with list_for_each internals. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
I think mmu_zap_unsync_children() should return the number of zapped pages then we can adjust the number of free pages in kvm_mmu_change_mmu_pages(), but pages.nr no only includes the unsync/zapped pages but also includes their parents. Thanks, Xiao --
Oh i see. I think its safer to check for list_empty then to rely on proper accounting there, like __kvm_mmu_free_some_pages does. --
Do you mean that we'd better add WARN_ON(list_empty()) code in kvm_mmu_zap_page()? Thanks, Xiao --
Just break out of the loop if list_empty(&vcpu->kvm->arch.active_mmu_pages). --
- 'vcpu' is not used while mark parent unsync, so remove it
- if it has alread marked unsync, no need to walk it's parent
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu.c | 69 +++++++++++++++++----------------------------------
1 files changed, 23 insertions(+), 46 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8f4f781..5154d70 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -172,7 +172,7 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
@@ -1000,74 +1000,51 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
}
-static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- mmu_parent_walk_fn fn)
+static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
struct kvm_mmu_page *parent_sp;
int i;
- if (!sp->multimapped && sp->parent_pte) {
+ if (!sp->parent_pte)
+ return;
+
+ if (!sp->multimapped) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
+ if (fn(parent_sp, sp->parent_pte))
+ mmu_parent_walk(parent_sp, fn);
return;
}
+
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
+ u64 *spte = pte_chain->parent_ptes[i];
+ if (!spte)
break;
- parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
+ parent_sp = page_header(__pa(spte));
+ if (fn(parent_sp, spte))
+ mmu_parent_walk(parent_sp, ...Please separate these two changes. The optimization looks good. Perhaps it can be done even nicer using mutually recursive functions (mmu_parent_walk calls mmu_parent_walk_fn which calls mmu_parent_walk on the parent), but let's not change too many things at a time. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
OK, i'll separate it in the next version Thanks, Xiao --
Xiao, Did you actually see this codepath as being performance sensitive? I'd prefer to not touch it. --
Actually, i not run benchmarks to contrast the performance before this patch This patch avoids walk all parents and i think this overload is really unnecessary. It has other tricks in this codepath but i not noticed? :-) Thanks, Xiao --
There is also the advantage of code size reduction. -- error compiling committee.c: too many arguments to function --
My point is that there is no point in optimizing something unless its performance sensitive. And as i recall, mmu_unsync_walk was much more sensitive performance wise than parent walking. Actually, gfn_to_memslot seems more important since its also noticeable on EPT/NPT hosts. --
Hi Marcelo,
I think optimizing not only means 'performance' but also means 'smaller code'(maybe 'cleanup'
is more suitable) and 'logic optimize'(do little things), i'm not sure this patch whether can
improve system performance obviously but it optimize the code logic and reduce code size, and
it not harm other code and system performance, right? :-)
Actually, the origin code has a bug, the code segment in mmu_parent_walk():
| if (!sp->multimapped && sp->parent_pte) {
| ......
| return;
| }
| hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
| for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
| ......
| }
Yeah, i also noticed these and i'm looking into these code.
Thanks,
Xiao
--
Marcelo, please ignore this, it not a bug, just my mistake, sorry... --
Right, but this walking code already is compact and stable. Removing the unused code variables/definitions is fine, but i'd prefer to not change Great. --
Usually, OS changes CR4.PGE bit to flush all global page, under this
case, no need reset mmu and just flush tlb
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/x86.c | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fd5c3d3..2aaa6fb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -463,6 +463,15 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+ if (cr4 == old_cr4)
+ return;
+
+ if ((cr4 ^ old_cr4) == X86_CR4_PGE) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_flush_tlb(vcpu);
+ return;
+ }
+
if (cr4 & CR4_RESERVED_BITS) {
kvm_inject_gp(vcpu, 0);
return;
--
1.6.1.2
--
All of which depend on cr4. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
Hi Avi, Thanks for your comments. Oh, destroy_kvm_mmu() is not really destroyed cr3 and we can reload it later form shadow page cache, so, maybe this patch is unnecessary. But, i have a another question here, why we need encode 'cr4 & X86_CR4_PGE' into base_role.cr4_gpe? Why we need allocation different shadow page for global page and no-global page? As i know, global page is not static in TLB, and x86 cpu also may flush them form TLB, maybe we no need treat global page specially... Am i miss something? :-( Xiao --
See 6364a3918cb. It was reverted later due to a problem with the implementation. I'm not sure whether I want to fix the bug and restore that patch, or to drop it altogether and give the guest ownership of You can't read reverted patches? :) -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
I usually use 'get blame' to look into source, and not noticed reverted patches, i'll pay more attention on those. Below code still confused me: | vcpu->arch.mmu.base_role.cr4_pge = (cr4& X86_CR4_PGE)&&!tdp_enabled; And i found the commit 87778d60ee: | KVM: MMU: Segregate mmu pages created with different cr4.pge settings | | Don't allow a vcpu with cr4.pge cleared to use a shadow page created with | cr4.pge set; this might cause a cr3 switch not to sync ptes that have the | global bit set (the global bit has no effect if !cr4.pge). | | This can only occur on smp with different cr4.pge settings for different | vcpus (since a cr4 change will resync the shadow ptes), but there's no | cost to being correct here. In current code, cr3 switch will sync all unsync shadow pages(regardless it's global or not) and this issue not live now, so, do we need also revert this patch? Xiao --
One path is to revert this patch. The other is to restore the optimization that relies on it. I'm not sure which is best. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. --
'multimapped' and 'unsync' in 'struct kvm_mmu_page' are just indication
field, we can use flag bits instand of them
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/include/asm/kvm_host.h | 5 ++-
arch/x86/kvm/mmu.c | 65 ++++++++++++++++++++++++++++-----------
arch/x86/kvm/mmutrace.h | 7 ++--
arch/x86/kvm/paging_tmpl.h | 2 +-
4 files changed, 55 insertions(+), 24 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0c49c88..d463bc6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -202,9 +202,10 @@ struct kvm_mmu_page {
* in this shadow page.
*/
DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */
- bool unsync;
+ #define MMU_PAGE_MULTIMAPPED 0x1 /* More than one parent_pte? */
+ #define MMU_PAGE_UNSYNC 0x2
+ unsigned int flags;
unsigned int unsync_children;
union {
u64 *parent_pte; /* !multimapped */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5154d70..18eceb2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -266,6 +266,36 @@ static int is_last_spte(u64 pte, int level)
return 0;
}
+static bool mmu_page_is_multimapped(struct kvm_mmu_page *sp)
+{
+ return !!(sp->flags & MMU_PAGE_MULTIMAPPED);
+}
+
+static void mmu_page_mark_multimapped(struct kvm_mmu_page *sp)
+{
+ sp->flags |= MMU_PAGE_MULTIMAPPED;
+}
+
+static void mmu_page_clear_multimapped(struct kvm_mmu_page *sp)
+{
+ sp->flags &= ~MMU_PAGE_MULTIMAPPED;
+}
+
+static bool mmu_page_is_unsync(struct kvm_mmu_page *sp)
+{
+ return !!(sp->flags & MMU_PAGE_UNSYNC);
+}
+
+static void mmu_page_mark_unsync(struct kvm_mmu_page *sp)
+{
+ sp->flags |= MMU_PAGE_UNSYNC;
+}
+
+static void mmu_page_clear_unsync(struct kvm_mmu_page *sp)
+{
+ sp->flags &= ...Please just use bool for multimapped. If we grow more than 4 flags, we can use 'bool flag : 1;' to reduce space while retaining readability. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
Yeah, good idea, i'll fix it in the next version Thanks, Xiao --
- chain all unsync shadow pages then we can fetch them quickly
- flush local/remote tlb after all shadow page synced
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu.c | 82 ++++++++++++++++++---------------------
2 files changed, 39 insertions(+), 44 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d463bc6..ae543fb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -207,6 +207,7 @@ struct kvm_mmu_page {
#define MMU_PAGE_UNSYNC 0x2
unsigned int flags;
unsigned int unsync_children;
+ struct list_head unsync_link;
union {
u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 18eceb2..fcb6299 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -177,6 +177,8 @@ typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
+static struct list_head unsync_mmu_page_list =
+ LIST_HEAD_INIT(unsync_mmu_page_list);
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -950,6 +952,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->flags = 0;
sp->parent_pte = parent_pte;
+ INIT_LIST_HEAD(&sp->unsync_link);
--vcpu->kvm->arch.n_free_mmu_pages;
return sp;
}
@@ -1200,12 +1203,14 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!mmu_page_is_unsync(sp));
mmu_page_clear_unsync(sp);
+ list_del(&sp->unsync_link);
--kvm->stat.mmu_unsync;
}
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page ...Will this ever happen? In my experience we usually have a ton of That's counter to the point of unsync. We only want to sync the minimum number of pages - all pages reachable by the new root. The others we want to keep writeable to reduce our fault count. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
Ah... this patch need more cooking, i'll improve it... thanks for you point it out. Xiao --
