From 60e4c6317b8773d987729401aeca9d8c6b61b05f Mon Sep 17 00:00:00 2001
From: Avi Kivity
Date: Wed, 24 Feb 2010 18:11:19 -0300
Subject: [PATCH 1/6] Allocate memory below 4GB as one chunk

Instead of allocating a separate chunk for the first 640KB and another
for 1MB+, allocate one large chunk. This plays well in terms of
alignment and size with large pages.

Signed-off-by: Avi Kivity
Signed-off-by: Marcelo Tosatti
---
 hw/pc.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/hw/pc.c b/hw/pc.c
index 4f6a5228fd..bdc297f717 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -833,18 +833,11 @@ static void pc_init1(ram_addr_t ram_size,
     vmport_init();

     /* allocate RAM */
-    ram_addr = qemu_ram_alloc(0xa0000);
+    ram_addr = qemu_ram_alloc(below_4g_mem_size);
     cpu_register_physical_memory(0, 0xa0000, ram_addr);
-
-    /* Allocate, even though we won't register, so we don't break the
-     * phys_ram_base + PA assumption. This range includes vga (0xa0000 - 0xc0000),
-     * and some bios areas, which will be registered later
-     */
-    ram_addr = qemu_ram_alloc(0x100000 - 0xa0000);
-    ram_addr = qemu_ram_alloc(below_4g_mem_size - 0x100000);
     cpu_register_physical_memory(0x100000, below_4g_mem_size - 0x100000,
-                                 ram_addr);
+                                 ram_addr + 0x100000);

     /* above 4giga memory allocation */
     if (above_4g_mem_size > 0) {
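The payoff is that a single qemu_ram_alloc() now backs both registered
ranges, with the VGA/BIOS hole simply left unregistered inside the chunk.
The resulting allocation logic reads as follows (the comments are
editorial annotations, not part of the patch):

    ram_addr = qemu_ram_alloc(below_4g_mem_size);        /* one chunk     */
    cpu_register_physical_memory(0, 0xa0000, ram_addr);  /* 0..640KB      */
    /* guest 0xa0000-0xfffff (VGA window, BIOS areas) stays unregistered,
     * but still occupies ram_addr+0xa0000..+0xfffff within the chunk, so
     * the chunk's alignment and size suit huge-page backing end to end */
    cpu_register_physical_memory(0x100000,
                                 below_4g_mem_size - 0x100000,
                                 ram_addr + 0x100000);   /* 1MB..4GB      */
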
From c902760fb25f9c490af01e8f6bccaa8dd71cc224 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Mon, 1 Mar 2010 20:25:08 -0300
Subject: [PATCH 2/6] Add option to use file backed guest memory

Port qemu-kvm's -mem-path and -mem-prealloc options. These are useful
for backing guest memory with huge pages via hugetlbfs.

Signed-off-by: Marcelo Tosatti
CC: john cooper
---
 cpu-all.h       |   3 ++
 exec.c          | 117 +++++++++++++++++++++++++++++++++++++++++++++---
 qemu-options.hx |  16 +++++++
 vl.c            |  12 +++++
 4 files changed, 142 insertions(+), 6 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index 8488bfea9f..9823c24bab 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -847,6 +847,9 @@ extern uint8_t *phys_ram_dirty;
 extern ram_addr_t ram_size;
 extern ram_addr_t last_ram_offset;

+extern const char *mem_path;
+extern int mem_prealloc;
+
 /* physical memory access */

 /* MMIO pages are identified by a combination of an IO device index and
diff --git a/exec.c b/exec.c
index 6a3c912b7f..f41518e8a3 100644
--- a/exec.c
+++ b/exec.c
@@ -2529,6 +2529,99 @@ void qemu_flush_coalesced_mmio_buffer(void)
         kvm_flush_coalesced_mmio_buffer();
 }

+#if defined(__linux__) && !defined(TARGET_S390X)
+
+#include <sys/vfs.h>
+
+#define HUGETLBFS_MAGIC       0x958458f6
+
+static long gethugepagesize(const char *path)
+{
+    struct statfs fs;
+    int ret;
+
+    do {
+        ret = statfs(path, &fs);
+    } while (ret != 0 && errno == EINTR);
+
+    if (ret != 0) {
+        perror("statfs");
+        return 0;
+    }
+
+    if (fs.f_type != HUGETLBFS_MAGIC)
+        fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
+
+    return fs.f_bsize;
+}
+
+static void *file_ram_alloc(ram_addr_t memory, const char *path)
+{
+    char *filename;
+    void *area;
+    int fd;
+#ifdef MAP_POPULATE
+    int flags;
+#endif
+    unsigned long hpagesize;
+
+    hpagesize = gethugepagesize(path);
+    if (!hpagesize) {
+        return NULL;
+    }
+
+    if (memory < hpagesize) {
+        return NULL;
+    }
+
+    if (kvm_enabled() && !kvm_has_sync_mmu()) {
+        fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
+        return NULL;
+    }
+
+    if (asprintf(&filename, "%s/qemu_back_mem.XXXXXX", path) == -1) {
+        return NULL;
+    }
+
+    fd = mkstemp(filename);
+    if (fd < 0) {
+        perror("mkstemp");
+        free(filename);
+        return NULL;
+    }
+    unlink(filename);
+    free(filename);
+
+    memory = (memory + hpagesize - 1) & ~(hpagesize - 1);
+
+    /*
+     * ftruncate is not supported by hugetlbfs in older
+     * hosts, so don't bother bailing out on errors.
+     * If anything goes wrong with it under other filesystems,
+     * mmap will fail.
+     */
+    if (ftruncate(fd, memory))
+        perror("ftruncate");
+
+#ifdef MAP_POPULATE
+    /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case
+     * MAP_PRIVATE is requested.  For mem_prealloc we mmap as MAP_SHARED
+     * to sidestep this quirk.
+     */
+    flags = mem_prealloc ? MAP_POPULATE | MAP_SHARED : MAP_PRIVATE;
+    area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0);
+#else
+    area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+#endif
+    if (area == MAP_FAILED) {
+        perror("file_ram_alloc: can't mmap RAM pages");
+        close(fd);
+        return (NULL);
+    }
+    return area;
+}
+#endif
+
 ram_addr_t qemu_ram_alloc(ram_addr_t size)
 {
     RAMBlock *new_block;
@@ -2536,16 +2629,28 @@ ram_addr_t qemu_ram_alloc(ram_addr_t size)
     size = TARGET_PAGE_ALIGN(size);
     new_block = qemu_malloc(sizeof(*new_block));

-#if defined(TARGET_S390X) && defined(CONFIG_KVM)
-    /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */
-    new_block->host = mmap((void*)0x1000000, size, PROT_EXEC|PROT_READ|PROT_WRITE,
-                           MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+    if (mem_path) {
+#if defined (__linux__) && !defined(TARGET_S390X)
+        new_block->host = file_ram_alloc(size, mem_path);
+        if (!new_block->host)
+            exit(1);
 #else
-    new_block->host = qemu_vmalloc(size);
+        fprintf(stderr, "-mem-path option unsupported\n");
+        exit(1);
+#endif
+    } else {
+#if defined(TARGET_S390X) && defined(CONFIG_KVM)
+        /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */
+        new_block->host = mmap((void*)0x1000000, size,
+                               PROT_EXEC|PROT_READ|PROT_WRITE,
+                               MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+#else
+        new_block->host = qemu_vmalloc(size);
 #endif
 #ifdef MADV_MERGEABLE
-    madvise(new_block->host, size, MADV_MERGEABLE);
+        madvise(new_block->host, size, MADV_MERGEABLE);
 #endif
+    }

     new_block->offset = last_ram_offset;
     new_block->length = size;
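file_ram_alloc() boils down to a standard hugetlbfs idiom: create a
temporary file on the mount, unlink it so it vanishes with the last
reference, round the size up to the huge page size, and mmap it. A
self-contained sketch of that idiom (the mount point and sizes are
assumptions for illustration, not taken from the patch):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
        const char *path = "/dev/hugepages";   /* assumed hugetlbfs mount */
        size_t hpagesize = 2 * 1024 * 1024;    /* typical x86 huge page   */
        size_t size = 64 << 20;
        char name[128];

        size = (size + hpagesize - 1) & ~(hpagesize - 1);   /* round up */
        snprintf(name, sizeof(name), "%s/demo.XXXXXX", path);

        int fd = mkstemp(name);
        if (fd < 0) { perror("mkstemp"); return 1; }
        unlink(name);                          /* file lives on via fd    */

        if (ftruncate(fd, size))               /* may fail on old hosts   */
            perror("ftruncate");

        void *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE, fd, 0);
        if (area == MAP_FAILED) { perror("mmap"); return 1; }

        memset(area, 0, size);                 /* fault in the pages      */
        munmap(area, size);
        close(fd);
        return 0;
    }
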
diff --git a/qemu-options.hx b/qemu-options.hx
index 7daa246c1f..fd50addb65 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -314,6 +314,22 @@ a suffix of ``M'' or ``G'' can be used to signify a value in megabytes or
 gigabytes respectively.
 ETEXI

+DEF("mem-path", HAS_ARG, QEMU_OPTION_mempath,
+    "-mem-path FILE  provide backing storage for guest RAM\n")
+STEXI
+@item -mem-path @var{path}
+Allocate guest RAM from a temporarily created file in @var{path}.
+ETEXI
+
+#ifdef MAP_POPULATE
+DEF("mem-prealloc", 0, QEMU_OPTION_mem_prealloc,
+    "-mem-prealloc   preallocate guest memory (use with -mem-path)\n")
+STEXI
+@item -mem-prealloc
+Preallocate memory when using -mem-path.
+ETEXI
+#endif
+
 DEF("k", HAS_ARG, QEMU_OPTION_k,
     "-k language     use keyboard layout (for example 'fr' for French)\n")
 STEXI
diff --git a/vl.c b/vl.c
index db7a178f88..274d847294 100644
--- a/vl.c
+++ b/vl.c
@@ -185,6 +185,10 @@ enum vga_retrace_method vga_retrace_method = VGA_RETRACE_DUMB;
 DisplayType display_type = DT_DEFAULT;
 const char* keyboard_layout = NULL;
 ram_addr_t ram_size;
+const char *mem_path = NULL;
+#ifdef MAP_POPULATE
+int mem_prealloc = 0; /* force preallocation of physical target memory */
+#endif
 int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int vm_running;
@@ -5216,6 +5220,14 @@ int main(int argc, char **argv, char **envp)
                 ram_size = value;
                 break;
             }
+            case QEMU_OPTION_mempath:
+                mem_path = optarg;
+                break;
+#ifdef MAP_POPULATE
+            case QEMU_OPTION_mem_prealloc:
+                mem_prealloc = 1;
+                break;
+#endif
             case QEMU_OPTION_d:
                 {
                     int mask;
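For reference, typical use of the new switches looks like this (the
hugetlbfs mount point, page count, and disk image are illustrative, not
taken from the patch):

    # reserve huge pages and mount hugetlbfs
    echo 512 > /proc/sys/vm/nr_hugepages
    mount -t hugetlbfs none /dev/hugepages

    # back 1G of guest RAM with huge pages, faulting it in up front
    qemu -m 1024 -mem-path /dev/hugepages -mem-prealloc disk.img
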
From b0b1d69079fcb9453f45aade9e9f6b71422147b0 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Mon, 1 Mar 2010 19:10:29 +0100
Subject: [PATCH 3/6] KVM: Rework of guest debug state writing

So far we synchronized any dirty VCPU state back into the kernel before
updating the guest debug state. This was a tribute to a deficit in x86
kernels before 2.6.33. But as this is an arch-dependent issue, it is
better handled in the x86 part of KVM, removing the writeback point
from the generic code. This also avoids overwriting the flushed state
later on if user space decides to change some more registers before
resuming the guest.

We furthermore need to reinject guest exceptions via the appropriate
mechanism. That is KVM_SET_GUEST_DEBUG for older kernels and
KVM_SET_VCPU_EVENTS for recent ones. Using both mechanisms at the same
time will cause state corruptions.

Signed-off-by: Jan Kiszka
Signed-off-by: Marcelo Tosatti
---
 kvm-all.c         | 24 ++++++++++++++++--------
 kvm.h             |  1 +
 target-i386/kvm.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index 1a02076bfd..2f7e33a6cd 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -65,6 +65,7 @@ struct KVMState
     int broken_set_mem_region;
     int migration_log;
     int vcpu_events;
+    int robust_singlestep;
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
 #endif
@@ -659,6 +660,12 @@ int kvm_init(int smp_cpus)
     s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
 #endif

+    s->robust_singlestep = 0;
+#ifdef KVM_CAP_X86_ROBUST_SINGLESTEP
+    s->robust_singlestep =
+        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
+#endif
+
     ret = kvm_arch_init(s, smp_cpus);
     if (ret < 0)
         goto err;
@@ -917,6 +924,11 @@ int kvm_has_vcpu_events(void)
     return kvm_state->vcpu_events;
 }

+int kvm_has_robust_singlestep(void)
+{
+    return kvm_state->robust_singlestep;
+}
+
 void kvm_setup_guest_memory(void *start, size_t size)
 {
     if (!kvm_has_sync_mmu()) {
@@ -974,10 +986,6 @@ static void kvm_invoke_set_guest_debug(void *data)
     struct kvm_set_guest_debug_data *dbg_data = data;
     CPUState *env = dbg_data->env;

-    if (env->kvm_vcpu_dirty) {
-        kvm_arch_put_registers(env);
-        env->kvm_vcpu_dirty = 0;
-    }
     dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
 }

@@ -985,12 +993,12 @@ int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
 {
     struct kvm_set_guest_debug_data data;

-    data.dbg.control = 0;
-    if (env->singlestep_enabled)
-        data.dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
+    data.dbg.control = reinject_trap;
+    if (env->singlestep_enabled) {
+        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
+    }
     kvm_arch_update_guest_debug(env, &data.dbg);
-    data.dbg.control |= reinject_trap;

     data.env = env;
     on_vcpu(env, kvm_invoke_set_guest_debug, &data);
diff --git a/kvm.h b/kvm.h
index a74dfcb083..a602e4552a 100644
--- a/kvm.h
+++ b/kvm.h
@@ -40,6 +40,7 @@ int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size);

 int kvm_has_sync_mmu(void);
 int kvm_has_vcpu_events(void);
+int kvm_has_robust_singlestep(void);

 void kvm_setup_guest_memory(void *start, size_t size);

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index d2116a7b3f..e0247ea631 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -852,6 +852,37 @@ static int kvm_get_vcpu_events(CPUState *env)
     return 0;
 }

+static int kvm_guest_debug_workarounds(CPUState *env)
+{
+    int ret = 0;
+#ifdef KVM_CAP_SET_GUEST_DEBUG
+    unsigned long reinject_trap = 0;
+
+    if (!kvm_has_vcpu_events()) {
+        if (env->exception_injected == 1) {
+            reinject_trap = KVM_GUESTDBG_INJECT_DB;
+        } else if (env->exception_injected == 3) {
+            reinject_trap = KVM_GUESTDBG_INJECT_BP;
+        }
+        env->exception_injected = -1;
+    }
+
+    /*
+     * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
+     * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
+     * by updating the debug state once again if single-stepping is on.
+     * Another reason to call kvm_update_guest_debug here is a pending debug
+     * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have
+     * to reinject it via SET_GUEST_DEBUG.
+     */
+    if (reinject_trap ||
+        (!kvm_has_robust_singlestep() && env->singlestep_enabled)) {
+        ret = kvm_update_guest_debug(env, reinject_trap);
+    }
+#endif /* KVM_CAP_SET_GUEST_DEBUG */
+    return ret;
+}
+
 int kvm_arch_put_registers(CPUState *env)
 {
     int ret;
@@ -880,6 +911,11 @@ int kvm_arch_put_registers(CPUState *env)
     if (ret < 0)
         return ret;

+    /* must be last */
+    ret = kvm_guest_debug_workarounds(env);
+    if (ret < 0)
+        return ret;
+
     return 0;
 }

@@ -1123,10 +1159,13 @@ int kvm_arch_debug(struct kvm_debug_exit_arch *arch_info)
     } else if (kvm_find_sw_breakpoint(cpu_single_env, arch_info->pc))
         handle = 1;

-    if (!handle)
-        kvm_update_guest_debug(cpu_single_env,
-                               (arch_info->exception == 1) ?
-                               KVM_GUESTDBG_INJECT_DB : KVM_GUESTDBG_INJECT_BP);
+    if (!handle) {
+        cpu_synchronize_state(cpu_single_env);
+        assert(cpu_single_env->exception_injected == -1);
+
+        cpu_single_env->exception_injected = arch_info->exception;
+        cpu_single_env->has_error_code = 0;
+    }

     return handle;
 }
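Taken together, the new flow is: kvm_arch_debug() records an unhandled
debug exit in exception_injected, and the next register writeback
delivers it through exactly one channel. A sketch of that decision
(simplified pseudo-C, not verbatim QEMU code):

    /* on the next kvm_arch_put_registers(): */
    if (kvm_has_vcpu_events()) {
        /* recent kernels: exception_injected is written back via
         * KVM_SET_VCPU_EVENTS in kvm_put_vcpu_events() */
    } else {
        /* older kernels: map exception 1/3 (#DB/#BP) to
         * KVM_GUESTDBG_INJECT_DB/BP and deliver via KVM_SET_GUEST_DEBUG */
        reinject_trap = (env->exception_injected == 1) ?
                        KVM_GUESTDBG_INJECT_DB : KVM_GUESTDBG_INJECT_BP;
        kvm_update_guest_debug(env, reinject_trap);
    }
    /* never both channels at once -- mixing them corrupts state */
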
From ea375f9ab8c76686dca0af8cb4f87a4eb569cad3 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Mon, 1 Mar 2010 19:10:30 +0100
Subject: [PATCH 4/6] KVM: Rework VCPU state writeback API

This grand cleanup drops all reset and vmsave/load related
synchronization points in favor of four(!) generic hooks:

- cpu_synchronize_all_states in qemu_savevm_state_complete
  (initial sync from kernel before vmsave)
- cpu_synchronize_all_post_init in qemu_loadvm_state
  (writeback after vmload)
- cpu_synchronize_all_post_init in main after machine init
- cpu_synchronize_all_post_reset in qemu_system_reset
  (writeback after system reset)

These writeback points + the existing one on VCPU exec after
cpu_synchronize_state map onto three levels of writeback:

- KVM_PUT_RUNTIME_STATE (during runtime, other VCPUs continue to run)
- KVM_PUT_RESET_STATE (on synchronous system reset, all VCPUs stopped)
- KVM_PUT_FULL_STATE (on init or vmload, all VCPUs stopped as well)

This level is passed to the arch-specific VCPU state writing function
that will decide which concrete substates need to be written.

That way, no writer of load, save or reset functions that interact with
in-kernel KVM states will ever have to worry about synchronization
again. That also means that a lot of reasons for races, segfaults and
deadlocks are eliminated.

cpu_synchronize_state remains untouched, just as Anthony suggested. We
continue to need it before reading or writing of VCPU states that are
also tracked by in-kernel KVM subsystems.

Consequently, this patch removes many cpu_synchronize_state calls that
are now redundant, as well as the remaining explicit register syncs.

Signed-off-by: Jan Kiszka
Signed-off-by: Marcelo Tosatti
---
 exec.c                | 17 -----------------
 hw/apic.c             |  2 --
 hw/ppc_newworld.c     |  3 ---
 hw/ppc_oldworld.c     |  3 ---
 hw/s390-virtio.c      |  1 -
 kvm-all.c             | 19 +++++++++++++------
 kvm.h                 | 25 ++++++++++++++++++++++++-
 savevm.c              |  4 ++++
 sysemu.h              |  4 ++++
 target-i386/kvm.c     |  2 +-
 target-i386/machine.c | 11 -----------
 target-ppc/kvm.c      |  2 +-
 target-ppc/machine.c  |  4 ----
 target-s390x/kvm.c    |  3 +--
 vl.c                  | 29 +++++++++++++++++++++++++++++
 15 files changed, 77 insertions(+), 52 deletions(-)
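The level argument effectively turns every arch writeback function into
a filter. A sketch of the intended pattern (the put_* helpers here are
hypothetical placeholders, not QEMU functions; patch 5 implements this
concretely for x86):

    int kvm_arch_put_registers(CPUState *env, int level)
    {
        put_runtime_state(env);          /* GP regs & friends: all levels */
        if (level >= KVM_PUT_RESET_STATE) {
            put_reset_state(env);        /* mp_state, pending NMI/SIPI    */
        }
        if (level >= KVM_PUT_FULL_STATE) {
            put_full_state(env);         /* TSC, KVM clock MSRs           */
        }
        return 0;
    }
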
diff --git a/exec.c b/exec.c
index f41518e8a3..891e0eee03 100644
--- a/exec.c
+++ b/exec.c
@@ -512,21 +512,6 @@ void cpu_exec_init_all(unsigned long tb_size)

 #if defined(CPU_SAVE_VERSION) && !defined(CONFIG_USER_ONLY)

-static void cpu_common_pre_save(void *opaque)
-{
-    CPUState *env = opaque;
-
-    cpu_synchronize_state(env);
-}
-
-static int cpu_common_pre_load(void *opaque)
-{
-    CPUState *env = opaque;
-
-    cpu_synchronize_state(env);
-    return 0;
-}
-
 static int cpu_common_post_load(void *opaque, int version_id)
 {
     CPUState *env = opaque;
@@ -544,8 +529,6 @@ static const VMStateDescription vmstate_cpu_common = {
     .version_id = 1,
     .minimum_version_id = 1,
     .minimum_version_id_old = 1,
-    .pre_save = cpu_common_pre_save,
-    .pre_load = cpu_common_pre_load,
     .post_load = cpu_common_post_load,
     .fields = (VMStateField []) {
         VMSTATE_UINT32(halted, CPUState),
diff --git a/hw/apic.c b/hw/apic.c
index 87e7dc0ba9..3c90f4c845 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -938,8 +938,6 @@ static void apic_reset(void *opaque)
     APICState *s = opaque;
     int bsp;

-    cpu_synchronize_state(s->cpu_env);
-
     bsp = cpu_is_bsp(s->cpu_env);
     s->apicbase = 0xfee00000 |
         (bsp ? MSR_IA32_APICBASE_BSP : 0) | MSR_IA32_APICBASE_ENABLE;
diff --git a/hw/ppc_newworld.c b/hw/ppc_newworld.c
index bc86c851e7..d4f9013fe5 100644
--- a/hw/ppc_newworld.c
+++ b/hw/ppc_newworld.c
@@ -167,9 +167,6 @@ static void ppc_core99_init (ram_addr_t ram_size,
         envs[i] = env;
     }

-    /* Make sure all register sets take effect */
-    cpu_synchronize_state(env);
-
     /* allocate RAM */
     ram_offset = qemu_ram_alloc(ram_size);
     cpu_register_physical_memory(0, ram_size, ram_offset);
diff --git a/hw/ppc_oldworld.c b/hw/ppc_oldworld.c
index 04a78358dd..93c95ba659 100644
--- a/hw/ppc_oldworld.c
+++ b/hw/ppc_oldworld.c
@@ -165,9 +165,6 @@ static void ppc_heathrow_init (ram_addr_t ram_size,
         envs[i] = env;
     }

-    /* Make sure all register sets take effect */
-    cpu_synchronize_state(env);
-
     /* allocate RAM */
     if (ram_size > (2047 << 20)) {
         fprintf(stderr,
diff --git a/hw/s390-virtio.c b/hw/s390-virtio.c
index 3582728378..ad3386f607 100644
--- a/hw/s390-virtio.c
+++ b/hw/s390-virtio.c
@@ -185,7 +185,6 @@ static void s390_init(ram_addr_t ram_size,
         exit(1);
     }

-    cpu_synchronize_state(env);
     env->psw.addr = KERN_IMAGE_START;
     env->psw.mask = 0x0000000180000000ULL;
 }
diff --git a/kvm-all.c b/kvm-all.c
index 2f7e33a6cd..534ead060d 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -156,10 +156,6 @@ static void kvm_reset_vcpu(void *opaque)
     CPUState *env = opaque;

     kvm_arch_reset_vcpu(env);
-    if (kvm_arch_put_registers(env)) {
-        fprintf(stderr, "Fatal: kvm vcpu reset failed\n");
-        abort();
-    }
 }

 int kvm_irqchip_in_kernel(void)
@@ -214,7 +210,6 @@ int kvm_init_vcpu(CPUState *env)
     if (ret == 0) {
         qemu_register_reset(kvm_reset_vcpu, env);
         kvm_arch_reset_vcpu(env);
-        ret = kvm_arch_put_registers(env);
     }
 err:
     return ret;
@@ -753,6 +748,18 @@ void kvm_cpu_synchronize_state(CPUState *env)
     }
 }

+void kvm_cpu_synchronize_post_reset(CPUState *env)
+{
+    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
+    env->kvm_vcpu_dirty = 0;
+}
+
+void kvm_cpu_synchronize_post_init(CPUState *env)
+{
+    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
+    env->kvm_vcpu_dirty = 0;
+}
+
 int kvm_cpu_exec(CPUState *env)
 {
     struct kvm_run *run = env->kvm_run;
@@ -770,7 +777,7 @@ int kvm_cpu_exec(CPUState *env)
 #endif

         if (env->kvm_vcpu_dirty) {
-            kvm_arch_put_registers(env);
+            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
             env->kvm_vcpu_dirty = 0;
         }

diff --git a/kvm.h b/kvm.h
index a602e4552a..b2937b9956 100644
--- a/kvm.h
+++ b/kvm.h
@@ -82,7 +82,14 @@ int kvm_arch_pre_run(CPUState *env, struct kvm_run *run);

 int kvm_arch_get_registers(CPUState *env);

-int kvm_arch_put_registers(CPUState *env);
+/* state subset only touched by the VCPU itself during runtime */
+#define KVM_PUT_RUNTIME_STATE   1
+/* state subset modified during VCPU reset */
+#define KVM_PUT_RESET_STATE     2
+/* full state set, modified during initialization or on vmload */
+#define KVM_PUT_FULL_STATE      3
+
+int kvm_arch_put_registers(CPUState *env, int level);

 int kvm_arch_init(KVMState *s, int smp_cpus);

@@ -126,6 +133,8 @@ int kvm_check_extension(KVMState *s, unsigned int extension);
 uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
                                       int reg);
 void kvm_cpu_synchronize_state(CPUState *env);
+void kvm_cpu_synchronize_post_reset(CPUState *env);
+void kvm_cpu_synchronize_post_init(CPUState *env);

 /* generic hooks - to be moved/refactored once there are more users */

@@ -136,4 +145,18 @@ static inline void cpu_synchronize_state(CPUState *env)
     }
 }

+static inline void cpu_synchronize_post_reset(CPUState *env)
+{
+    if (kvm_enabled()) {
+        kvm_cpu_synchronize_post_reset(env);
+    }
+}
+
+static inline void cpu_synchronize_post_init(CPUState *env)
+{
+    if (kvm_enabled()) {
+        kvm_cpu_synchronize_post_init(env);
+    }
+}
+
 #endif
diff --git a/savevm.c b/savevm.c
index 4b58663960..a6e774b4d0 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1345,6 +1345,8 @@ int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f)
 {
     SaveStateEntry *se;

+    cpu_synchronize_all_states();
+
     QTAILQ_FOREACH(se, &savevm_handlers, entry) {
         if (se->save_live_state == NULL)
             continue;
@@ -1545,6 +1547,8 @@ int qemu_loadvm_state(QEMUFile *f)
         }
     }

+    cpu_synchronize_all_post_init();
+
     ret = 0;

 out:
diff --git a/sysemu.h b/sysemu.h
index 8ba618e54f..d77344c1b9 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -58,6 +58,10 @@ int load_vmstate(Monitor *mon, const char *name);
 void do_delvm(Monitor *mon, const QDict *qdict);
 void do_info_snapshots(Monitor *mon);

+void cpu_synchronize_all_states(void);
+void cpu_synchronize_all_post_reset(void);
+void cpu_synchronize_all_post_init(void);
+
 void qemu_announce_self(void);

 void main_loop_wait(int timeout);
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index e0247ea631..2c834dfdcc 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -883,7 +883,7 @@ static int kvm_guest_debug_workarounds(CPUState *env)
     return ret;
 }

-int kvm_arch_put_registers(CPUState *env)
+int kvm_arch_put_registers(CPUState *env, int level)
 {
     int ret;

diff --git a/target-i386/machine.c b/target-i386/machine.c
index 87704918be..b547e2ac7a 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -321,8 +321,6 @@ static void cpu_pre_save(void *opaque)
     CPUState *env = opaque;
     int i;

-    cpu_synchronize_state(env);
-
     /* FPU */
     env->fpus_vmstate = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
     env->fptag_vmstate = 0;
@@ -337,14 +335,6 @@ static void cpu_pre_save(void *opaque)
 #endif
 }

-static int cpu_pre_load(void *opaque)
-{
-    CPUState *env = opaque;
-
-    cpu_synchronize_state(env);
-    return 0;
-}
-
 static int cpu_post_load(void *opaque, int version_id)
 {
     CPUState *env = opaque;
@@ -373,7 +363,6 @@ static const VMStateDescription vmstate_cpu = {
     .minimum_version_id = 3,
     .minimum_version_id_old = 3,
     .pre_save = cpu_pre_save,
-    .pre_load = cpu_pre_load,
     .post_load = cpu_post_load,
     .fields = (VMStateField []) {
         VMSTATE_UINTTL_ARRAY(regs, CPUState, CPU_NB_REGS),
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 8ad003799c..aa3d43247b 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -73,7 +73,7 @@ void kvm_arch_reset_vcpu(CPUState *env)
 {
 }

-int kvm_arch_put_registers(CPUState *env)
+int kvm_arch_put_registers(CPUState *env, int level)
 {
     struct kvm_regs regs;
     int ret;
diff --git a/target-ppc/machine.c b/target-ppc/machine.c
index 4897c8a4d3..67de951959 100644
--- a/target-ppc/machine.c
+++ b/target-ppc/machine.c
@@ -7,8 +7,6 @@ void cpu_save(QEMUFile *f, void *opaque)
     CPUState *env = (CPUState *)opaque;
     unsigned int i, j;

-    cpu_synchronize_state(env);
-
     for (i = 0; i < 32; i++)
         qemu_put_betls(f, &env->gpr[i]);
 #if !defined(TARGET_PPC64)
@@ -96,8 +94,6 @@ int cpu_load(QEMUFile *f, void *opaque, int version_id)
     CPUState *env = (CPUState *)opaque;
     unsigned int i, j;

-    cpu_synchronize_state(env);
-
     for (i = 0; i < 32; i++)
         qemu_get_betls(f, &env->gpr[i]);
 #if !defined(TARGET_PPC64)
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index 0199a650f6..72e77b0cda 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -91,7 +91,7 @@ void kvm_arch_reset_vcpu(CPUState *env)
     /* FIXME: add code to reset vcpu. */
 }

-int kvm_arch_put_registers(CPUState *env)
+int kvm_arch_put_registers(CPUState *env, int level)
 {
     struct kvm_regs regs;
     int ret;
@@ -296,7 +296,6 @@ static int handle_hypercall(CPUState *env, struct kvm_run *run)
     cpu_synchronize_state(env);
     r = s390_virtio_hypercall(env);
-    kvm_arch_put_registers(env);

     return r;
 }
diff --git a/vl.c b/vl.c
index 274d847294..10d8e34e50 100644
--- a/vl.c
+++ b/vl.c
@@ -3002,6 +3002,33 @@ static void nographic_update(void *opaque)
     qemu_mod_timer(nographic_timer, interval + qemu_get_clock(rt_clock));
 }

+void cpu_synchronize_all_states(void)
+{
+    CPUState *cpu;
+
+    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
+        cpu_synchronize_state(cpu);
+    }
+}
+
+void cpu_synchronize_all_post_reset(void)
+{
+    CPUState *cpu;
+
+    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
+        cpu_synchronize_post_reset(cpu);
+    }
+}
+
+void cpu_synchronize_all_post_init(void)
+{
+    CPUState *cpu;
+
+    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
+        cpu_synchronize_post_init(cpu);
+    }
+}
+
 struct vm_change_state_entry {
     VMChangeStateHandler *cb;
     void *opaque;
@@ -3143,6 +3170,7 @@ void qemu_system_reset(void)
     QTAILQ_FOREACH_SAFE(re, &reset_handlers, entry, nre) {
         re->func(re->opaque);
     }
+    cpu_synchronize_all_post_reset();
 }

 void qemu_system_reset_request(void)
@@ -5928,6 +5956,7 @@ int main(int argc, char **argv, char **envp)

     machine->init(ram_size, boot_devices, kernel_filename,
                   kernel_cmdline, initrd_filename, cpu_model);
+    cpu_synchronize_all_post_init();

 #ifndef _WIN32
     /* must be after terminal init, SDL library changes signal handlers */
From ea64305139357e89f58fc05ff5d48dc233d44d87 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Mon, 1 Mar 2010 19:10:31 +0100
Subject: [PATCH 5/6] KVM: x86: Restrict writeback of VCPU state

Do not write nmi_pending, sipi_vector, and mpstate unless we at least
go through a reset. The TSC and the KVM wallclock MSRs should only be
written on full sync, otherwise we risk dropping some time on state
read-modify-write.

Signed-off-by: Jan Kiszka
Signed-off-by: Marcelo Tosatti
---
 target-i386/kvm.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 2c834dfdcc..40f83037d6 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -546,7 +546,7 @@ static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
     entry->data = value;
 }

-static int kvm_put_msrs(CPUState *env)
+static int kvm_put_msrs(CPUState *env, int level)
 {
     struct {
         struct kvm_msrs info;
@@ -560,7 +560,6 @@ static int kvm_put_msrs(CPUState *env)
     kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
     if (kvm_has_msr_star(env))
         kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
-    kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
 #ifdef TARGET_X86_64
     /* FIXME if lm capable */
     kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
@@ -568,8 +567,12 @@ static int kvm_put_msrs(CPUState *env)
     kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
     kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
 #endif
-    kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME, env->system_time_msr);
-    kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
+    if (level == KVM_PUT_FULL_STATE) {
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
+        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
+                          env->system_time_msr);
+        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
+    }

     msr_data.info.nmsrs = n;

@@ -782,7 +785,7 @@ static int kvm_get_mp_state(CPUState *env)
     return 0;
 }

-static int kvm_put_vcpu_events(CPUState *env)
+static int kvm_put_vcpu_events(CPUState *env, int level)
 {
 #ifdef KVM_CAP_VCPU_EVENTS
     struct kvm_vcpu_events events;
@@ -806,8 +809,11 @@ static int kvm_put_vcpu_events(CPUState *env)

     events.sipi_vector = env->sipi_vector;

-    events.flags =
-        KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
+    events.flags = 0;
+    if (level >= KVM_PUT_RESET_STATE) {
+        events.flags |=
+            KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
+    }

     return kvm_vcpu_ioctl(env, KVM_SET_VCPU_EVENTS, &events);
 #else
@@ -899,15 +905,17 @@ int kvm_arch_put_registers(CPUState *env, int level)
     if (ret < 0)
         return ret;

-    ret = kvm_put_msrs(env);
+    ret = kvm_put_msrs(env, level);
     if (ret < 0)
         return ret;

-    ret = kvm_put_mp_state(env);
-    if (ret < 0)
-        return ret;
+    if (level >= KVM_PUT_RESET_STATE) {
+        ret = kvm_put_mp_state(env);
+        if (ret < 0)
+            return ret;
+    }

-    ret = kvm_put_vcpu_events(env);
+    ret = kvm_put_vcpu_events(env, level);
     if (ret < 0)
         return ret;
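The time-drop hazard the message refers to: every sync reads the TSC
into env->tsc, and writing that snapshot back after wall-clock time has
passed rewinds the guest clock. A schematic timeline (numbers purely
illustrative, not QEMU code):

    /* t0: state sync reads the counter        -> env->tsc = 1000
     * t0..t1: guest TSC keeps advancing in the kernel to 1500
     * t1: unconditional writeback stores 1000 -> guest loses 500 ticks
     *
     * Writing the TSC and KVM clock MSRs only at KVM_PUT_FULL_STATE
     * (init/vmload, all VCPUs stopped) means the value written back
     * is never stale. */
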
From 6cb2996cef5e273ef370e690e84b5e1403f5c391 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Mon, 1 Mar 2010 19:10:32 +0100
Subject: [PATCH 6/6] x86: Extend validity of bsp_to_cpu

As we hard-wire the BSP to CPU 0 anyway and cpuid_apic_id equals
cpu_index, bsp_to_cpu can also be based on the latter directly. This
will help an early user of it: KVM while initializing mp_state.

Signed-off-by: Jan Kiszka
Signed-off-by: Marcelo Tosatti
---
 hw/pc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/pc.c b/hw/pc.c
index bdc297f717..e50a48848d 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -760,7 +760,8 @@ static void pc_init_ne2k_isa(NICInfo *nd)

 int cpu_is_bsp(CPUState *env)
 {
-    return env->cpuid_apic_id == 0;
+    /* We hard-wire the BSP to the first CPU. */
+    return env->cpu_index == 0;
 }

 static CPUState *pc_new_cpu(const char *cpu_model)
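The "early user" mentioned above is mp_state initialization during VCPU
reset, which runs before the APIC has configured cpuid_apic_id. A sketch
of such a caller (assumed shape of the x86 KVM reset path; not part of
this series):

    void kvm_arch_reset_vcpu(CPUState *env)
    {
        /* cpu_index is valid as soon as the CPU exists, so cpu_is_bsp()
         * is now safe to call here, before APIC initialization. */
        env->mp_state = cpu_is_bsp(env) ? KVM_MP_STATE_RUNNABLE
                                        : KVM_MP_STATE_UNINITIALIZED;
    }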