The audit can help us to detect the mmu bugs as early as possible, it's also can help us to fix them. It's so useful but the linux distribution is imposable to use it since: - it need enable it by define "AUDIT" macro and compile it - the audit code is very high overload, it lets the guest mostly hung So, this patchset supports to enable/disable it dynamically, it's very low overhead if disable it, and it lowers the audit frequency to assure the guest running. After this patchset, we can enable it by: mount -t debugfs none debugfs echo 1 > debugfs/kvm/mmu-debug disable it by: echo 0 > debugfs/kvm/mmu-debug default, the audit is disabled [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically [PATCH 2/4] KVM: MMU: improve active sp audit [PATCH 3/4] KVM: MMU: improve spte audit [PATCH 4/4] KVM: MMU: lower the aduit frequency --
Add the debugfs file named 'mmu-debug', we can disable/enable mmu audit by
this file:
enable:
echo 1 > debugfs/kvm/mmu-debug
disable:
echo 0 > debugfs/kvm/mmu-debug
This patch not change the logic
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/Kconfig | 6 +
arch/x86/kvm/mmu.c | 250 ++--------------------------------
arch/x86/kvm/mmu_debug.c | 329 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/mmu_debug.h | 12 ++
arch/x86/kvm/mmutrace.h | 19 +++
arch/x86/kvm/paging_tmpl.h | 4 +-
virt/kvm/kvm_main.c | 6 +-
7 files changed, 380 insertions(+), 246 deletions(-)
create mode 100644 arch/x86/kvm/mmu_debug.c
create mode 100644 arch/x86/kvm/mmu_debug.h
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd4..67a941d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,12 @@ config KVM_AMD
To compile this as a module, choose M here: the module
will be called kvm-amd.
+config KVM_MMU_DEBUG
+ bool "Debug KVM MMU"
+ depends on KVM && TRACEPOINTS
+ ---help---
+ This feature allows debug KVM MMU at runtime.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bff4d5..8609249 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -19,6 +19,7 @@
*/
#include "mmu.h"
+#include "mmu_debug.h"
#include "x86.h"
#include "kvm_cache_regs.h"
@@ -51,14 +52,6 @@ bool tdp_enabled = false;
#undef MMU_DEBUG
-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
#ifdef MMU_DEBUG
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
@@ -71,7 +64,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif
...Better as a runtime rw module parameter perhaps? At least it avoids the large debugfs callbacks. Here, of course, you can use print_symbolic() to preserve readability. -- error compiling committee.c: too many arguments to function --
OK Will fix them in the next version. --
Both audit_rmap() and audit_write_protection() need to walk all active sp, so we can do
these checking in a sp walking
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu_debug.c | 80 +++++++++++++++++++++++----------------------
1 files changed, 41 insertions(+), 39 deletions(-)
diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
index d2c0048..812d6dc 100644
--- a/arch/x86/kvm/mmu_debug.c
+++ b/arch/x86/kvm/mmu_debug.c
@@ -70,6 +70,16 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
return;
}
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+ fn(kvm, sp);
+}
+
static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
gva_t va, int level)
{
@@ -180,67 +190,59 @@ void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
mmu_spte_walk(vcpu, inspect_spte_has_rmap);
}
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- struct kvm_mmu_page *sp;
int i;
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- u64 *pt = sp->spt;
-
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ return;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(pt[i]))
- continue;
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_rmap_spte(sp->spt[i]))
+ return;
- inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
- }
+ inspect_spte_has_rmap(kvm, sp->spt + i);
}
- return;
}
-static void audit_rmap(struct kvm_vcpu *vcpu)
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct ...Both audit_mappings() and audit_sptes_have_rmaps() need to walk vcpu's page table, so we can do
these checking in a spte walking
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu_debug.c | 148 +++++++++++++++++++++------------------------
1 files changed, 69 insertions(+), 79 deletions(-)
diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
index 812d6dc..c4ebe6a 100644
--- a/arch/x86/kvm/mmu_debug.c
+++ b/arch/x86/kvm/mmu_debug.c
@@ -24,23 +24,24 @@ static bool mmu_debug;
static const char *audit_msg;
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
- inspect_spte_fn fn)
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn, int level)
{
int i;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(kvm, child, fn);
- } else
- fn(kvm, &sp->spt[i]);
+ u64 *ent = sp->spt;
+
+ fn(vcpu, ent + i, level);
+
+ if (is_shadow_present_pte(ent[i]) &&
+ !is_last_spte(ent[i], level)) {
+ struct kvm_mmu_page *child;
+
+ child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(vcpu, child, fn, level - 1);
}
}
}
@@ -52,19 +53,21 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
+
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
+ __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
return;
}
+
for (i = 0; i < 4; ++i) {
hpa_t root = ...The audit is very high overhead, so we need lower the frequency to assure the guest running
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu_debug.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
index c4ebe6a..bc61b3d 100644
--- a/arch/x86/kvm/mmu_debug.c
+++ b/arch/x86/kvm/mmu_debug.c
@@ -18,6 +18,7 @@
*/
#include <linux/debugfs.h>
+#include <linux/ratelimit.h>
static struct dentry *debugfs_file;
static bool mmu_debug;
@@ -233,6 +234,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+ if (!__ratelimit(&ratelimit_state))
+ return;
+
audit_msg = msg;
audit_all_active_sps(vcpu->kvm);
audit_vcpu_spte(vcpu);
--
1.7.0.4
--
This means we see a bug long after it happened, so we can't correlate it to the cause. It's fine as an option (even the default) but I'd like to be able to audit after every operation. Perhaps a partial audit that only looks at the gfns and vaddrs that were affected in the last operation? I have to admit, it's been a very long time since I last used audit. -- error compiling committee.c: too many arguments to function --
Audit checks all the active shadow pages and all vcpu's page table, so the overload is very high :-) During my test, if enable the aduit, the guest mostly hung, it means the guest not do anything. (Host: Intel(R) Xeon(R) X3430 @ 2.40GHz * 4 + 4G memory GUest: x2VCPU + 1G memory ) I'll set the 'ratelimit' as a module parameter, then if the user's machine is fast enough, the ratelimit can be disabled. --
You're right, I remember that from the last time I used audit many years It's only useful in very special cases - low memory and a very fast reproducer. I think we can live without the parameter, if someone has this special case they can hack the code. -- error compiling committee.c: too many arguments to function --
Add a r/w module parameter named 'mmu_audit', it can control audit enable/disable:
enable:
echo 1 > /sys/module/kvm/parameters/mmu_audit
disable:
echo 0 > /sys/module/kvm/parameters/mmu_audit
This patch not change the logic
V2:
Using r/w module parameter instead of debugfs file
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/Kconfig | 7 +++
arch/x86/kvm/mmu.c | 91 +++++++++++++++++++++++++++++++++++---------
arch/x86/kvm/mmutrace.h | 19 +++++++++
arch/x86/kvm/paging_tmpl.h | 4 +-
4 files changed, 101 insertions(+), 20 deletions(-)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd4..ddc131f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
To compile this as a module, choose M here: the module
will be called kvm-amd.
+config KVM_MMU_AUDIT
+ bool "Audit KVM MMU"
+ depends on KVM && TRACEPOINTS
+ ---help---
+ This option adds a R/W kVM module parameter 'mmu_audit', which allows
+ audit KVM MMU at runtime.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bff4d5..8b750ff 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -49,15 +49,21 @@
*/
bool tdp_enabled = false;
-#undef MMU_DEBUG
+enum {
+ AUDIT_PRE_PAGE_FAULT,
+ AUDIT_POST_PAGE_FAULT,
+ AUDIT_PRE_PTE_WRITE,
+ AUDIT_POST_PTE_WRITE
+};
-#undef AUDIT
+char *audit_point_name[] = {
+ "pre page fault",
+ "post page fault",
+ "pre pte write",
+ "post pte write"
+};
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
+#undef MMU_DEBUG
#ifdef MMU_DEBUG
@@ -71,7 +77,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif
-#if ...Move the audit code from arch/x86/kvm/mmu.c to arch/x86/kvm/mmu_audit.c
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu.c | 279 +-------------------------------------------
arch/x86/kvm/mmu_audit.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 298 insertions(+), 278 deletions(-)
create mode 100644 arch/x86/kvm/mmu_audit.c
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8b750ff..d2dad65 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3490,282 +3490,5 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
#ifdef CONFIG_KVM_MMU_AUDIT
-static const char *audit_msg;
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
- inspect_spte_fn fn)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(kvm, child, fn);
- } else
- fn(kvm, &sp->spt[i]);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- }
- }
- return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
-{
- u64 *pt = __va(page_pte & ...Both audit_rmap() and audit_write_protection() need to walk all active sp, so we can do
these checking in a sp walking
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu_audit.c | 74 +++++++++++++++++++++++----------------------
1 files changed, 38 insertions(+), 36 deletions(-)
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index fb8a461..8becb86 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -65,6 +65,16 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
return;
}
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+ fn(kvm, sp);
+}
+
static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
gva_t va, int level)
{
@@ -175,67 +185,59 @@ void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
mmu_spte_walk(vcpu, inspect_spte_has_rmap);
}
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- struct kvm_mmu_page *sp;
int i;
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- u64 *pt = sp->spt;
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ return;
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_rmap_spte(sp->spt[i]))
continue;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(pt[i]))
- continue;
-
- inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
- }
+ inspect_spte_has_rmap(kvm, sp->spt + i);
}
- return;
}
-static void audit_rmap(struct kvm_vcpu *vcpu)
+void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
struct ...Both audit_mappings() and audit_sptes_have_rmaps() need to walk vcpu's page table, so we can do
these checking in a spte walking
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu_audit.c | 148 +++++++++++++++++++++------------------------
1 files changed, 69 insertions(+), 79 deletions(-)
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 8becb86..3bde186 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,23 +19,24 @@
static const char *audit_msg;
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
- inspect_spte_fn fn)
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn, int level)
{
int i;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(kvm, child, fn);
- } else
- fn(kvm, &sp->spt[i]);
+ u64 *ent = sp->spt;
+
+ fn(vcpu, ent + i, level);
+
+ if (is_shadow_present_pte(ent[i]) &&
+ !is_last_spte(ent[i], level)) {
+ struct kvm_mmu_page *child;
+
+ child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(vcpu, child, fn, level - 1);
}
}
}
@@ -47,21 +48,25 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
+
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
+
sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
+ __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
return;
}
+
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
...The audit is very high overhead, so we need lower the frequency to assure the guest running
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
arch/x86/kvm/mmu_audit.c | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 3bde186..bd2b1be 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -17,6 +17,8 @@
*
*/
+#include <linux/ratelimit.h>
+
static const char *audit_msg;
typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
@@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+ if (!__ratelimit(&ratelimit_state))
+ return;
+
audit_msg = audit_point_name[audit_point];
audit_all_active_sps(vcpu->kvm);
audit_vcpu_spte(vcpu);
--
1.7.0.4
--
Well, as Avi said this makes it difficult to trace back to offender (the audit points are placed around modifications to the shadow page tree for that reason). I've always seen progress from the guest while running with audit enabled (its slow, but its not supposed to be fast anyway). Did you experience a freeze? --
There is a simply test in the guest if it's not rate limit: # time ls anaconda-ks.cfg Documents install.log Music Public Videos Desktop Downloads install.log.syslog Pictures Templates real 1m26.053s user 0m0.311s sys 0m1.813s 'ls' command cost about 1.5 minute, if we run the memory test program, i think the time/delay is unacceptable...... :-( --
Marcelo, would making the ratelimit optional help? personally I think without ratelimit audit is useless, and with ratelimit it is a lot less useful but can still point out problems. But I haven't used it in a while. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. --
Was just wondering whether there was a freeze. Patchset looks good (ratelimit can be disabled manually). --
Ok, applied now. -- error compiling committee.c: too many arguments to function --
