The following patchset allows a host with running virtual machines to be suspended and, on at least a subset of the machines tested, resumed. Note that this is orthogonal to suspending and resuming an individual guest to a file. A side effect of implementing suspend/resume is that cpu hotplug is now supported. This should please the owners of big iron. Andrew, please queue for 2.6.21. -- error compiling committee.c: too many arguments to function -
KVM wants the cpu hotplug notifications, both for cpu hotplug itself, but more
commonly for host suspend/resume.
In order to avoid extensive #ifdefs, provide stubs when CONFIG_CPU_HOTPLUG is
not defined.
In all, we have four cases:
- UP: register and unregister stubbed out
- SMP+hotplug: full register and unregister
- SMP, no hotplug, core: register as __init, unregister stubbed
(cpus are brought up during core initialization)
- SMP, no hotplug, module: register and unregister stubbed out
(cpus cannot be brought up during module lifetime)
Signed-off-by: Avi Kivity <avi@qumranet.com>
Index: linux-2.6/include/linux/cpu.h
===================================================================
--- linux-2.6.orig/include/linux/cpu.h
+++ linux-2.6/include/linux/cpu.h
@@ -49,10 +49,20 @@ struct notifier_block;
#ifdef CONFIG_SMP
/* Need to know about CPUs going up/down? */
-extern int register_cpu_notifier(struct notifier_block *nb);
#ifdef CONFIG_HOTPLUG_CPU
+extern int register_cpu_notifier(struct notifier_block *nb);
extern void unregister_cpu_notifier(struct notifier_block *nb);
#else
+
+#ifndef MODULE
+extern int register_cpu_notifier(struct notifier_block *nb);
+#else
+static inline int register_cpu_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+#endif
+
static inline void unregister_cpu_notifier(struct notifier_block *nb)
{
}
-
This will allow us to iterate over all vcpus and see which cpus they are
running on.
Signed-off-by: Avi Kivity <avi@qumranet.com>
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -41,6 +41,9 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+static spinlock_t kvm_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head vm_list = LIST_HEAD_INIT(vm_list);
+
struct kvm_arch_ops *kvm_arch_ops;
struct kvm_stat kvm_stat;
EXPORT_SYMBOL_GPL(kvm_stat);
@@ -230,9 +233,13 @@ static int kvm_dev_open(struct inode *in
struct kvm_vcpu *vcpu = &kvm->vcpus[i];
mutex_init(&vcpu->mutex);
+ vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->mmu.root_hpa = INVALID_PAGE;
INIT_LIST_HEAD(&vcpu->free_pages);
+ spin_lock(&kvm_lock);
+ list_add(&kvm->vm_list, &vm_list);
+ spin_unlock(&kvm_lock);
}
filp->private_data = kvm;
return 0;
@@ -292,6 +299,9 @@ static int kvm_dev_release(struct inode
{
struct kvm *kvm = filp->private_data;
+ spin_lock(&kvm_lock);
+ list_del(&kvm->vm_list);
+ spin_unlock(&kvm_lock);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
kfree(kvm);
@@ -546,7 +556,6 @@ static int kvm_dev_ioctl_create_vcpu(str
FX_IMAGE_ALIGN);
vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
- vcpu->cpu = -1; /* First load will set up TR */
r = kvm_arch_ops->vcpu_create(vcpu);
if (r < 0)
goto out_free_vcpus;
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -304,6 +304,7 @@ struct kvm {
int memory_config_version;
int busy;
unsigned long rmap_overflow;
+ struct list_head vm_list;
};
struct kvm_stat {
-
Like the inline code it replaces, this function decaches the vmcs from the cpu
it last executed on. in addition:
- vcpu_clear() works if the last cpu is also the cpu we're running on
- it is faster on larger smps by virtue of using smp_call_function_single()
Includes fix from Ingo Molnar.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Avi Kivity <avi@qumranet.com>
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -125,6 +125,15 @@ static void __vcpu_clear(void *arg)
per_cpu(current_vmcs, cpu) = NULL;
}
+static void vcpu_clear(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->cpu != raw_smp_processor_id() && vcpu->cpu != -1)
+ smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, 0, 1);
+ else
+ __vcpu_clear(vcpu);
+ vcpu->launched = 0;
+}
+
static unsigned long vmcs_readl(unsigned long field)
{
unsigned long value;
@@ -202,10 +211,8 @@ static struct kvm_vcpu *vmx_vcpu_load(st
cpu = get_cpu();
- if (vcpu->cpu != cpu) {
- smp_call_function(__vcpu_clear, vcpu, 0, 1);
- vcpu->launched = 0;
- }
+ if (vcpu->cpu != cpu)
+ vcpu_clear(vcpu);
if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) {
u8 error;
-
On hotplug, we execute the hardware extension enable sequence.
On unplug, we decache any vcpus that last ran on the exiting cpu, and
execute the hardware extension disable sequence.
Signed-off-by: Avi Kivity <avi@qumranet.com>
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -34,6 +34,7 @@
#include <linux/highmem.h>
#include <linux/file.h>
#include <asm/desc.h>
+#include <linux/cpu.h>
#include "x86_emulate.h"
#include "segment_descriptor.h"
@@ -2038,6 +2039,64 @@ static struct notifier_block kvm_reboot_
.priority = 0,
};
+/*
+ * Make sure that a cpu that is being hot-unplugged does not have any vcpus
+ * cached on it.
+ */
+static void decache_vcpus_on_cpu(int cpu)
+{
+ struct kvm *vm;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ spin_lock(&kvm_lock);
+ list_for_each_entry(vm, &vm_list, vm_list)
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = &vm->vcpus[i];
+ /*
+ * If the vcpu is locked, then it is running on some
+ * other cpu and therefore it is not cached on the
+ * cpu in question.
+ *
+ * If it's not locked, check the last cpu it executed
+ * on.
+ */
+ if (mutex_trylock(&vcpu->mutex)) {
+ if (vcpu->cpu == cpu) {
+ kvm_arch_ops->vcpu_decache(vcpu);
+ vcpu->cpu = -1;
+ }
+ mutex_unlock(&vcpu->mutex);
+ }
+ }
+ spin_unlock(&kvm_lock);
+}
+
+static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+ void *v)
+{
+ int cpu = (long)v;
+
+ switch (val) {
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ decache_vcpus_on_cpu(cpu);
+ smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
+ NULL, 0, 1);
+ break;
+ case CPU_UP_PREPARE:
+ smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
+ NULL, 0, 1);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kvm_cpu_notifier = ...On Tue, 30 Jan 2007 14:56:16 -0000 The trylock is unpleasing. Perhaps kvm_lock should be a mutex or something? -
this is a special case. The vcpu->mutex acts as a 'this vcpu is running right now' flag as well - hence the trylock signals: is it running right now or not - if it's not running we do not have to 'decache' it. But i agree and i already suggested to Avi to change kvm_lock to be a mutex - but this wont change the trylock. Ingo -
To elaborate a little: replacing mutex_trylock() with mutex_lock() will cause unbounded latency as we wait for the vcpu to be descheduled. In this case, we're only interested in descheduled vcpus, so there's no need to wait. kvm is a bit funny in how it likes to pin cpus. -- error compiling committee.c: too many arguments to function -
Add the necessary callbacks to suspend and resume a host running kvm. This
is just a repeat of the cpu hotplug/unplug work.
Signed-off-by: Avi Kivity <avi@qumranet.com>
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -34,6 +34,7 @@
#include <linux/highmem.h>
#include <linux/file.h>
#include <asm/desc.h>
+#include <linux/sysdev.h>
#include <linux/cpu.h>
#include "x86_emulate.h"
@@ -2116,6 +2117,30 @@ static void kvm_exit_debug(void)
debugfs_remove(debugfs_dir);
}
+static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+{
+ decache_vcpus_on_cpu(raw_smp_processor_id());
+ on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
+ return 0;
+}
+
+static int kvm_resume(struct sys_device *dev)
+{
+ on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
+ return 0;
+}
+
+static struct sysdev_class kvm_sysdev_class = {
+ set_kset_name("kvm"),
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
+static struct sys_device kvm_sysdev = {
+ .id = 0,
+ .cls = &kvm_sysdev_class,
+};
+
hpa_t bad_page_address;
int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
@@ -2148,6 +2173,14 @@ int kvm_init_arch(struct kvm_arch_ops *o
goto out_free_1;
register_reboot_notifier(&kvm_reboot_notifier);
+ r = sysdev_class_register(&kvm_sysdev_class);
+ if (r)
+ goto out_free_2;
+
+ r = sysdev_register(&kvm_sysdev);
+ if (r)
+ goto out_free_3;
+
kvm_chardev_ops.owner = module;
r = misc_register(&kvm_dev);
@@ -2159,6 +2192,10 @@ int kvm_init_arch(struct kvm_arch_ops *o
return r;
out_free:
+ sysdev_unregister(&kvm_sysdev);
+out_free_3:
+ sysdev_class_unregister(&kvm_sysdev_class);
+out_free_2:
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_1:
@@ -2170,8 +2207,10 @@ out_free_1:
void kvm_exit_arch(void)
{
...Hi. Maybe it's just a lack of understanding, but I'm wondering if this patch is necessary - we already hot-unplug secondary cpus prior to suspending devices. Regards, Nigel -
Not with the reoredering patches that are in -mm, though. With these patches, the nonboot CPUs are disabled after "regular" devices. Greetings, Rafael -- If you don't have the time to read, you don't have the time or the tools to write. - Stephen King -
Hi. Ah. Good point. Thanks! :) Nigel -
After that, it means on_each_cpu() again, and the hotplug patch becomes unnecessary (except to support hotplug, of course). Assuming I understand things correctly, which if not at all certain. -- error compiling committee.c: too many arguments to function -
| Greg KH | Og dreams of kernels |
| Jens Axboe | [PATCH 31/33] Fusion: sg chaining support |
| Arnd Bergmann | Re: finding your own dead "CONFIG_" variables |
