Merge remote-tracking branch 'bonzini/split-main-loop-for-anthony' into staging

2024-11-24 11:23:43 +08:00 · 2011-10-24 10:51:12 -05:00 · 2011-10-24 10:51:12 -05:00 · 952e849c15
commit 952e849c15
parent db418a0a7e 99435906cc
23 changed files with 1318 additions and 1069 deletions
--- a/Makefile.objs
+++ b/Makefile.objs
@ -81,7 +81,7 @@ common-obj-y += $(oslib-obj-y)
 common-obj-$(CONFIG_WIN32) += os-win32.o
 common-obj-$(CONFIG_POSIX) += os-posix.o

-common-obj-y += tcg-runtime.o host-utils.o
+common-obj-y += tcg-runtime.o host-utils.o main-loop.o
 common-obj-y += irq.o input.o
 common-obj-$(CONFIG_PTIMER) += ptimer.o
 common-obj-$(CONFIG_MAX7310) += max7310.o
--- a/async.c
+++ b/async.c
@ -24,6 +24,7 @@

 #include "qemu-common.h"
 #include "qemu-aio.h"
+#include "main-loop.h"

 /* Anchor of the list of Bottom Halves belonging to the context */
 static struct QEMUBH *first_bh;
--- a/cpus.c
+++ b/cpus.c
@ -33,17 +33,12 @@

 #include "qemu-thread.h"
 #include "cpus.h"
+#include "main-loop.h"

 #ifndef _WIN32
 #include "compatfd.h"
 #endif

-#ifdef SIGRTMIN
-#define SIG_IPI (SIGRTMIN+4)
-#else
-#define SIG_IPI SIGUSR1
-#endif
-
 #ifdef CONFIG_LINUX

 #include <sys/prctl.h>
@ -64,6 +59,281 @@

 static CPUState *next_cpu;

+/***********************************************************/
+/* guest cycle counter */
+
+/* Conversion factor from emulated instructions to virtual clock ticks.  */
+static int icount_time_shift;
+/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
+#define MAX_ICOUNT_SHIFT 10
+/* Compensate for varying guest execution speed.  */
+static int64_t qemu_icount_bias;
+static QEMUTimer *icount_rt_timer;
+static QEMUTimer *icount_vm_timer;
+static QEMUTimer *icount_warp_timer;
+static int64_t vm_clock_warp_start;
+static int64_t qemu_icount;
+
+typedef struct TimersState {
+    int64_t cpu_ticks_prev;
+    int64_t cpu_ticks_offset;
+    int64_t cpu_clock_offset;
+    int32_t cpu_ticks_enabled;
+    int64_t dummy;
+} TimersState;
+
+TimersState timers_state;
+
+/* Return the virtual CPU time, based on the instruction counter.  */
+int64_t cpu_get_icount(void)
+{
+    int64_t icount;
+    CPUState *env = cpu_single_env;;
+
+    icount = qemu_icount;
+    if (env) {
+        if (!can_do_io(env)) {
+            fprintf(stderr, "Bad clock read\n");
+        }
+        icount -= (env->icount_decr.u16.low + env->icount_extra);
+    }
+    return qemu_icount_bias + (icount << icount_time_shift);
+}
+
+/* return the host CPU cycle counter and handle stop/restart */
+int64_t cpu_get_ticks(void)
+{
+    if (use_icount) {
+        return cpu_get_icount();
+    }
+    if (!timers_state.cpu_ticks_enabled) {
+        return timers_state.cpu_ticks_offset;
+    } else {
+        int64_t ticks;
+        ticks = cpu_get_real_ticks();
+        if (timers_state.cpu_ticks_prev > ticks) {
+            /* Note: non increasing ticks may happen if the host uses
+               software suspend */
+            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
+        }
+        timers_state.cpu_ticks_prev = ticks;
+        return ticks + timers_state.cpu_ticks_offset;
+    }
+}
+
+/* return the host CPU monotonic timer and handle stop/restart */
+int64_t cpu_get_clock(void)
+{
+    int64_t ti;
+    if (!timers_state.cpu_ticks_enabled) {
+        return timers_state.cpu_clock_offset;
+    } else {
+        ti = get_clock();
+        return ti + timers_state.cpu_clock_offset;
+    }
+}
+
+/* enable cpu_get_ticks() */
+void cpu_enable_ticks(void)
+{
+    if (!timers_state.cpu_ticks_enabled) {
+        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
+        timers_state.cpu_clock_offset -= get_clock();
+        timers_state.cpu_ticks_enabled = 1;
+    }
+}
+
+/* disable cpu_get_ticks() : the clock is stopped. You must not call
+   cpu_get_ticks() after that.  */
+void cpu_disable_ticks(void)
+{
+    if (timers_state.cpu_ticks_enabled) {
+        timers_state.cpu_ticks_offset = cpu_get_ticks();
+        timers_state.cpu_clock_offset = cpu_get_clock();
+        timers_state.cpu_ticks_enabled = 0;
+    }
+}
+
+/* Correlation between real and virtual time is always going to be
+   fairly approximate, so ignore small variation.
+   When the guest is idle real and virtual time will be aligned in
+   the IO wait loop.  */
+#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
+
+static void icount_adjust(void)
+{
+    int64_t cur_time;
+    int64_t cur_icount;
+    int64_t delta;
+    static int64_t last_delta;
+    /* If the VM is not running, then do nothing.  */
+    if (!runstate_is_running()) {
+        return;
+    }
+    cur_time = cpu_get_clock();
+    cur_icount = qemu_get_clock_ns(vm_clock);
+    delta = cur_icount - cur_time;
+    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
+    if (delta > 0
+        && last_delta + ICOUNT_WOBBLE < delta * 2
+        && icount_time_shift > 0) {
+        /* The guest is getting too far ahead.  Slow time down.  */
+        icount_time_shift--;
+    }
+    if (delta < 0
+        && last_delta - ICOUNT_WOBBLE > delta * 2
+        && icount_time_shift < MAX_ICOUNT_SHIFT) {
+        /* The guest is getting too far behind.  Speed time up.  */
+        icount_time_shift++;
+    }
+    last_delta = delta;
+    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
+}
+
+static void icount_adjust_rt(void *opaque)
+{
+    qemu_mod_timer(icount_rt_timer,
+                   qemu_get_clock_ms(rt_clock) + 1000);
+    icount_adjust();
+}
+
+static void icount_adjust_vm(void *opaque)
+{
+    qemu_mod_timer(icount_vm_timer,
+                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
+    icount_adjust();
+}
+
+static int64_t qemu_icount_round(int64_t count)
+{
+    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
+}
+
+static void icount_warp_rt(void *opaque)
+{
+    if (vm_clock_warp_start == -1) {
+        return;
+    }
+
+    if (runstate_is_running()) {
+        int64_t clock = qemu_get_clock_ns(rt_clock);
+        int64_t warp_delta = clock - vm_clock_warp_start;
+        if (use_icount == 1) {
+            qemu_icount_bias += warp_delta;
+        } else {
+            /*
+             * In adaptive mode, do not let the vm_clock run too
+             * far ahead of real time.
+             */
+            int64_t cur_time = cpu_get_clock();
+            int64_t cur_icount = qemu_get_clock_ns(vm_clock);
+            int64_t delta = cur_time - cur_icount;
+            qemu_icount_bias += MIN(warp_delta, delta);
+        }
+        if (qemu_clock_expired(vm_clock)) {
+            qemu_notify_event();
+        }
+    }
+    vm_clock_warp_start = -1;
+}
+
+void qemu_clock_warp(QEMUClock *clock)
+{
+    int64_t deadline;
+
+    /*
+     * There are too many global variables to make the "warp" behavior
+     * applicable to other clocks.  But a clock argument removes the
+     * need for if statements all over the place.
+     */
+    if (clock != vm_clock || !use_icount) {
+        return;
+    }
+
+    /*
+     * If the CPUs have been sleeping, advance the vm_clock timer now.  This
+     * ensures that the deadline for the timer is computed correctly below.
+     * This also makes sure that the insn counter is synchronized before the
+     * CPU starts running, in case the CPU is woken by an event other than
+     * the earliest vm_clock timer.
+     */
+    icount_warp_rt(NULL);
+    if (!all_cpu_threads_idle() || !qemu_clock_has_timers(vm_clock)) {
+        qemu_del_timer(icount_warp_timer);
+        return;
+    }
+
+    vm_clock_warp_start = qemu_get_clock_ns(rt_clock);
+    deadline = qemu_clock_deadline(vm_clock);
+    if (deadline > 0) {
+        /*
+         * Ensure the vm_clock proceeds even when the virtual CPU goes to
+         * sleep.  Otherwise, the CPU might be waiting for a future timer
+         * interrupt to wake it up, but the interrupt never comes because
+         * the vCPU isn't running any insns and thus doesn't advance the
+         * vm_clock.
+         *
+         * An extreme solution for this problem would be to never let VCPUs
+         * sleep in icount mode if there is a pending vm_clock timer; rather
+         * time could just advance to the next vm_clock event.  Instead, we
+         * do stop VCPUs and only advance vm_clock after some "real" time,
+         * (related to the time left until the next event) has passed.  This
+         * rt_clock timer will do this.  This avoids that the warps are too
+         * visible externally---for example, you will not be sending network
+         * packets continously instead of every 100ms.
+         */
+        qemu_mod_timer(icount_warp_timer, vm_clock_warp_start + deadline);
+    } else {
+        qemu_notify_event();
+    }
+}
+
+static const VMStateDescription vmstate_timers = {
+    .name = "timer",
+    .version_id = 2,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields      = (VMStateField[]) {
+        VMSTATE_INT64(cpu_ticks_offset, TimersState),
+        VMSTATE_INT64(dummy, TimersState),
+        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+void configure_icount(const char *option)
+{
+    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
+    if (!option) {
+        return;
+    }
+
+    icount_warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL);
+    if (strcmp(option, "auto") != 0) {
+        icount_time_shift = strtol(option, NULL, 0);
+        use_icount = 1;
+        return;
+    }
+
+    use_icount = 2;
+
+    /* 125MIPS seems a reasonable initial guess at the guest speed.
+       It will be corrected fairly quickly anyway.  */
+    icount_time_shift = 3;
+
+    /* Have both realtime and virtual time triggers for speed adjustment.
+       The realtime trigger catches emulated time passing too slowly,
+       the virtual time trigger catches emulated time passing too fast.
+       Realtime triggers occur even when idle, so use them less frequently
+       than VM triggers.  */
+    icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL);
+    qemu_mod_timer(icount_rt_timer,
+                   qemu_get_clock_ms(rt_clock) + 1000);
+    icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL);
+    qemu_mod_timer(icount_vm_timer,
+                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
+}
+
 /***********************************************************/
 void hw_error(const char *fmt, ...)
 {
@ -272,143 +542,10 @@ static void qemu_kvm_eat_signals(CPUState *env)
 #endif /* !CONFIG_LINUX */

 #ifndef _WIN32
-static int io_thread_fd = -1;
-
-static void qemu_event_increment(void)
-{
-    /* Write 8 bytes to be compatible with eventfd.  */
-    static const uint64_t val = 1;
-    ssize_t ret;
-
-    if (io_thread_fd == -1) {
-        return;
-    }
-    do {
-        ret = write(io_thread_fd, &val, sizeof(val));
-    } while (ret < 0 && errno == EINTR);
-
-    /* EAGAIN is fine, a read must be pending.  */
-    if (ret < 0 && errno != EAGAIN) {
-        fprintf(stderr, "qemu_event_increment: write() failed: %s\n",
-                strerror(errno));
-        exit (1);
-    }
-}
-
-static void qemu_event_read(void *opaque)
-{
-    int fd = (intptr_t)opaque;
-    ssize_t len;
-    char buffer[512];
-
-    /* Drain the notify pipe.  For eventfd, only 8 bytes will be read.  */
-    do {
-        len = read(fd, buffer, sizeof(buffer));
-    } while ((len == -1 && errno == EINTR) || len == sizeof(buffer));
-}
-
-static int qemu_event_init(void)
-{
-    int err;
-    int fds[2];
-
-    err = qemu_eventfd(fds);
-    if (err == -1) {
-        return -errno;
-    }
-    err = fcntl_setfl(fds[0], O_NONBLOCK);
-    if (err < 0) {
-        goto fail;
-    }
-    err = fcntl_setfl(fds[1], O_NONBLOCK);
-    if (err < 0) {
-        goto fail;
-    }
-    qemu_set_fd_handler2(fds[0], NULL, qemu_event_read, NULL,
-                         (void *)(intptr_t)fds[0]);
-
-    io_thread_fd = fds[1];
-    return 0;
-
-fail:
-    close(fds[0]);
-    close(fds[1]);
-    return err;
-}
-
 static void dummy_signal(int sig)
 {
 }

-/* If we have signalfd, we mask out the signals we want to handle and then
- * use signalfd to listen for them.  We rely on whatever the current signal
- * handler is to dispatch the signals when we receive them.
- */
-static void sigfd_handler(void *opaque)
-{
-    int fd = (intptr_t)opaque;
-    struct qemu_signalfd_siginfo info;
-    struct sigaction action;
-    ssize_t len;
-
-    while (1) {
-        do {
-            len = read(fd, &info, sizeof(info));
-        } while (len == -1 && errno == EINTR);
-
-        if (len == -1 && errno == EAGAIN) {
-            break;
-        }
-
-        if (len != sizeof(info)) {
-            printf("read from sigfd returned %zd: %m\n", len);
-            return;
-        }
-
-        sigaction(info.ssi_signo, NULL, &action);
-        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
-            action.sa_sigaction(info.ssi_signo,
-                                (siginfo_t *)&info, NULL);
-        } else if (action.sa_handler) {
-            action.sa_handler(info.ssi_signo);
-        }
-    }
-}
-
-static int qemu_signal_init(void)
-{
-    int sigfd;
-    sigset_t set;
-
-    /*
-     * SIG_IPI must be blocked in the main thread and must not be caught
-     * by sigwait() in the signal thread. Otherwise, the cpu thread will
-     * not catch it reliably.
-     */
-    sigemptyset(&set);
-    sigaddset(&set, SIG_IPI);
-    pthread_sigmask(SIG_BLOCK, &set, NULL);
-
-    sigemptyset(&set);
-    sigaddset(&set, SIGIO);
-    sigaddset(&set, SIGALRM);
-    sigaddset(&set, SIGBUS);
-    pthread_sigmask(SIG_BLOCK, &set, NULL);
-
-    sigfd = qemu_signalfd(&set);
-    if (sigfd == -1) {
-        fprintf(stderr, "failed to create signalfd\n");
-        return -errno;
-    }
-
-    fcntl_setfl(sigfd, O_NONBLOCK);
-
-    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
-                         (void *)(intptr_t)sigfd);
-
-    return 0;
-}
-
 static void qemu_kvm_init_cpu_signals(CPUState *env)
 {
    int r;
@ -452,38 +589,6 @@ static void qemu_tcg_init_cpu_signals(void)
 }

 #else /* _WIN32 */
-
-HANDLE qemu_event_handle;
-
-static void dummy_event_handler(void *opaque)
-{
-}
-
-static int qemu_event_init(void)
-{
-    qemu_event_handle = CreateEvent(NULL, FALSE, FALSE, NULL);
-    if (!qemu_event_handle) {
-        fprintf(stderr, "Failed CreateEvent: %ld\n", GetLastError());
-        return -1;
-    }
-    qemu_add_wait_object(qemu_event_handle, dummy_event_handler, NULL);
-    return 0;
-}
-
-static void qemu_event_increment(void)
-{
-    if (!SetEvent(qemu_event_handle)) {
-        fprintf(stderr, "qemu_event_increment: SetEvent failed: %ld\n",
-                GetLastError());
-        exit (1);
-    }
-}
-
-static int qemu_signal_init(void)
-{
-    return 0;
-}
-
 static void qemu_kvm_init_cpu_signals(CPUState *env)
 {
    abort();
@ -509,38 +614,16 @@ static QemuCond qemu_cpu_cond;
 static QemuCond qemu_pause_cond;
 static QemuCond qemu_work_cond;

-int qemu_init_main_loop(void)
+void qemu_init_cpu_loop(void)
 {
-    int ret;
-
    qemu_init_sigbus();
-
-    ret = qemu_signal_init();
-    if (ret) {
-        return ret;
-    }
-
-    /* Note eventfd must be drained before signalfd handlers run */
-    ret = qemu_event_init();
-    if (ret) {
-        return ret;
-    }
-
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);
-    qemu_mutex_lock(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
-
-    return 0;
-}
-
-void qemu_main_loop_start(void)
-{
-    resume_all_vcpus();
 }

 void run_on_cpu(CPUState *env, void (*func)(void *data), void *data)
@ -686,7 +769,7 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)

    while (1) {
        cpu_exec_all();
-        if (use_icount && qemu_next_icount_deadline() <= 0) {
+        if (use_icount && qemu_clock_deadline(vm_clock) <= 0) {
            qemu_notify_event();
        }
        qemu_tcg_wait_io_event();
@ -784,6 +867,7 @@ void pause_all_vcpus(void)
 {
    CPUState *penv = first_cpu;

+    qemu_clock_enable(vm_clock, false);
    while (penv) {
        penv->stop = 1;
        qemu_cpu_kick(penv);
@ -858,11 +942,6 @@ void qemu_init_vcpu(void *_env)
    }
 }

-void qemu_notify_event(void)
-{
-    qemu_event_increment();
-}
-
 void cpu_stop_current(void)
 {
    if (cpu_single_env) {
@ -914,7 +993,7 @@ static int tcg_cpu_exec(CPUState *env)
        qemu_icount -= (env->icount_decr.u16.low + env->icount_extra);
        env->icount_decr.u16.low = 0;
        env->icount_extra = 0;
-        count = qemu_icount_round(qemu_next_icount_deadline());
+        count = qemu_icount_round(qemu_clock_deadline(vm_clock));
        qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
@ -1006,22 +1085,6 @@ void set_cpu_log_filename(const char *optarg)
    cpu_set_log_filename(optarg);
 }

-/* Return the virtual CPU time, based on the instruction counter.  */
-int64_t cpu_get_icount(void)
-{
-    int64_t icount;
-    CPUState *env = cpu_single_env;;
-
-    icount = qemu_icount;
-    if (env) {
-        if (!can_do_io(env)) {
-            fprintf(stderr, "Bad clock read\n");
-        }
-        icount -= (env->icount_decr.u16.low + env->icount_extra);
-    }
-    return qemu_icount_bias + (icount << icount_time_shift);
-}
-
 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
 {
    /* XXX: implement xxx_cpu_list for targets that still miss it */
--- a/cpus.h
+++ b/cpus.h
@ -2,8 +2,7 @@
 #define QEMU_CPUS_H

 /* cpus.c */
-int qemu_init_main_loop(void);
-void qemu_main_loop_start(void);
+void qemu_init_cpu_loop(void);
 void resume_all_vcpus(void);
 void pause_all_vcpus(void);
 void cpu_stop_current(void);
--- a/exec-all.h
+++ b/exec-all.h
@ -356,4 +356,18 @@ extern int singlestep;
 /* cpu-exec.c */
 extern volatile sig_atomic_t exit_request;

+/* Deterministic execution requires that IO only be performed on the last
+   instruction of a TB so that interrupts take effect immediately.  */
+static inline int can_do_io(CPUState *env)
+{
+    if (!use_icount) {
+        return 1;
+    }
+    /* If not executing code then assume we are ok.  */
+    if (!env->current_tb) {
+        return 1;
+    }
+    return env->can_do_io != 0;
+}
+
 #endif
--- a/exec.c
+++ b/exec.c
@ -125,9 +125,6 @@ CPUState *cpu_single_env;
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
 int use_icount = 0;
-/* Current instruction counter.  While executing translated code this may
-   include some instructions that have not yet been executed.  */
-int64_t qemu_icount;

 typedef struct PageDesc {
    /* list of TBs intersecting this ram page */
--- a/hw/mac_dbdma.c
+++ b/hw/mac_dbdma.c
@ -661,11 +661,6 @@ void DBDMA_register_channel(void *dbdma, int nchan, qemu_irq irq,
    ch->io.channel = ch;
 }

-void DBDMA_schedule(void)
-{
-    qemu_notify_event();
-}
-
 static void
 dbdma_control_write(DBDMA_channel *ch)
 {
--- a/hw/mac_dbdma.h
+++ b/hw/mac_dbdma.h
@ -41,5 +41,4 @@ struct DBDMA_io {
 void DBDMA_register_channel(void *dbdma, int nchan, qemu_irq irq,
                            DBDMA_rw rw, DBDMA_flush flush,
                            void *opaque);
-void DBDMA_schedule(void);
 void* DBDMA_init (MemoryRegion **dbdma_mem);
--- a/iohandler.c
+++ b/iohandler.c
@ -26,6 +26,7 @@
 #include "qemu-common.h"
 #include "qemu-char.h"
 #include "qemu-queue.h"
+#include "main-loop.h"

 #ifndef _WIN32
 #include <sys/wait.h>
@ -80,64 +81,12 @@ int qemu_set_fd_handler2(int fd,
    return 0;
 }

-typedef struct IOTrampoline
-{
-    GIOChannel *chan;
-    IOHandler *fd_read;
-    IOHandler *fd_write;
-    void *opaque;
-    guint tag;
-} IOTrampoline;
-
-static gboolean fd_trampoline(GIOChannel *chan, GIOCondition cond, gpointer opaque)
-{
-    IOTrampoline *tramp = opaque;
-
-    if ((cond & G_IO_IN) && tramp->fd_read) {
-        tramp->fd_read(tramp->opaque);
-    }
-
-    if ((cond & G_IO_OUT) && tramp->fd_write) {
-        tramp->fd_write(tramp->opaque);
-    }
-
-    return TRUE;
-}
-
 int qemu_set_fd_handler(int fd,
                        IOHandler *fd_read,
                        IOHandler *fd_write,
                        void *opaque)
 {
-    static IOTrampoline fd_trampolines[FD_SETSIZE];
-    IOTrampoline *tramp = &fd_trampolines[fd];
-
-    if (tramp->tag != 0) {
-        g_io_channel_unref(tramp->chan);
-        g_source_remove(tramp->tag);
-        tramp->tag = 0;
-    }
-
-    if (fd_read || fd_write || opaque) {
-        GIOCondition cond = 0;
-
-        tramp->fd_read = fd_read;
-        tramp->fd_write = fd_write;
-        tramp->opaque = opaque;
-
-        if (fd_read) {
-            cond |= G_IO_IN | G_IO_ERR;
-        }
-
-        if (fd_write) {
-            cond |= G_IO_OUT | G_IO_ERR;
-        }
-
-        tramp->chan = g_io_channel_unix_new(fd);
-        tramp->tag = g_io_add_watch(tramp->chan, cond, fd_trampoline, tramp);
-    }
-
-    return 0;
+    return qemu_set_fd_handler2(fd, NULL, fd_read, fd_write, opaque);
 }

 void qemu_iohandler_fill(int *pnfds, fd_set *readfds, fd_set *writefds, fd_set *xfds)
--- a/main-loop.c
+++ b/main-loop.c
@ -0,0 +1,495 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "config-host.h"
+#include <unistd.h>
+#include <signal.h>
+#include <time.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <stdbool.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#else
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <net/if.h>
+#include <arpa/inet.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include "compatfd.h"
+#endif
+
+#include <glib.h>
+
+#include "main-loop.h"
+#include "qemu-timer.h"
+#include "slirp/libslirp.h"
+
+#ifndef _WIN32
+
+static int io_thread_fd = -1;
+
+void qemu_notify_event(void)
+{
+    /* Write 8 bytes to be compatible with eventfd.  */
+    static const uint64_t val = 1;
+    ssize_t ret;
+
+    if (io_thread_fd == -1) {
+        return;
+    }
+    do {
+        ret = write(io_thread_fd, &val, sizeof(val));
+    } while (ret < 0 && errno == EINTR);
+
+    /* EAGAIN is fine, a read must be pending.  */
+    if (ret < 0 && errno != EAGAIN) {
+        fprintf(stderr, "qemu_notify_event: write() failed: %s\n",
+                strerror(errno));
+        exit(1);
+    }
+}
+
+static void qemu_event_read(void *opaque)
+{
+    int fd = (intptr_t)opaque;
+    ssize_t len;
+    char buffer[512];
+
+    /* Drain the notify pipe.  For eventfd, only 8 bytes will be read.  */
+    do {
+        len = read(fd, buffer, sizeof(buffer));
+    } while ((len == -1 && errno == EINTR) || len == sizeof(buffer));
+}
+
+static int qemu_event_init(void)
+{
+    int err;
+    int fds[2];
+
+    err = qemu_eventfd(fds);
+    if (err == -1) {
+        return -errno;
+    }
+    err = fcntl_setfl(fds[0], O_NONBLOCK);
+    if (err < 0) {
+        goto fail;
+    }
+    err = fcntl_setfl(fds[1], O_NONBLOCK);
+    if (err < 0) {
+        goto fail;
+    }
+    qemu_set_fd_handler2(fds[0], NULL, qemu_event_read, NULL,
+                         (void *)(intptr_t)fds[0]);
+
+    io_thread_fd = fds[1];
+    return 0;
+
+fail:
+    close(fds[0]);
+    close(fds[1]);
+    return err;
+}
+
+/* If we have signalfd, we mask out the signals we want to handle and then
+ * use signalfd to listen for them.  We rely on whatever the current signal
+ * handler is to dispatch the signals when we receive them.
+ */
+static void sigfd_handler(void *opaque)
+{
+    int fd = (intptr_t)opaque;
+    struct qemu_signalfd_siginfo info;
+    struct sigaction action;
+    ssize_t len;
+
+    while (1) {
+        do {
+            len = read(fd, &info, sizeof(info));
+        } while (len == -1 && errno == EINTR);
+
+        if (len == -1 && errno == EAGAIN) {
+            break;
+        }
+
+        if (len != sizeof(info)) {
+            printf("read from sigfd returned %zd: %m\n", len);
+            return;
+        }
+
+        sigaction(info.ssi_signo, NULL, &action);
+        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+            action.sa_sigaction(info.ssi_signo,
+                                (siginfo_t *)&info, NULL);
+        } else if (action.sa_handler) {
+            action.sa_handler(info.ssi_signo);
+        }
+    }
+}
+
+static int qemu_signal_init(void)
+{
+    int sigfd;
+    sigset_t set;
+
+    /*
+     * SIG_IPI must be blocked in the main thread and must not be caught
+     * by sigwait() in the signal thread. Otherwise, the cpu thread will
+     * not catch it reliably.
+     */
+    sigemptyset(&set);
+    sigaddset(&set, SIG_IPI);
+    pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    sigemptyset(&set);
+    sigaddset(&set, SIGIO);
+    sigaddset(&set, SIGALRM);
+    sigaddset(&set, SIGBUS);
+    pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+    sigfd = qemu_signalfd(&set);
+    if (sigfd == -1) {
+        fprintf(stderr, "failed to create signalfd\n");
+        return -errno;
+    }
+
+    fcntl_setfl(sigfd, O_NONBLOCK);
+
+    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
+                         (void *)(intptr_t)sigfd);
+
+    return 0;
+}
+
+#else /* _WIN32 */
+
+HANDLE qemu_event_handle;
+
+static void dummy_event_handler(void *opaque)
+{
+}
+
+static int qemu_event_init(void)
+{
+    qemu_event_handle = CreateEvent(NULL, FALSE, FALSE, NULL);
+    if (!qemu_event_handle) {
+        fprintf(stderr, "Failed CreateEvent: %ld\n", GetLastError());
+        return -1;
+    }
+    qemu_add_wait_object(qemu_event_handle, dummy_event_handler, NULL);
+    return 0;
+}
+
+void qemu_notify_event(void)
+{
+    if (!SetEvent(qemu_event_handle)) {
+        fprintf(stderr, "qemu_notify_event: SetEvent failed: %ld\n",
+                GetLastError());
+        exit(1);
+    }
+}
+
+static int qemu_signal_init(void)
+{
+    return 0;
+}
+#endif
+
+int qemu_init_main_loop(void)
+{
+    int ret;
+
+    qemu_mutex_lock_iothread();
+    ret = qemu_signal_init();
+    if (ret) {
+        return ret;
+    }
+
+    /* Note eventfd must be drained before signalfd handlers run */
+    ret = qemu_event_init();
+    if (ret) {
+        return ret;
+    }
+
+    return 0;
+}
+
+
+static GPollFD poll_fds[1024 * 2]; /* this is probably overkill */
+static int n_poll_fds;
+static int max_priority;
+
+static void glib_select_fill(int *max_fd, fd_set *rfds, fd_set *wfds,
+                             fd_set *xfds, struct timeval *tv)
+{
+    GMainContext *context = g_main_context_default();
+    int i;
+    int timeout = 0, cur_timeout;
+
+    g_main_context_prepare(context, &max_priority);
+
+    n_poll_fds = g_main_context_query(context, max_priority, &timeout,
+                                      poll_fds, ARRAY_SIZE(poll_fds));
+    g_assert(n_poll_fds <= ARRAY_SIZE(poll_fds));
+
+    for (i = 0; i < n_poll_fds; i++) {
+        GPollFD *p = &poll_fds[i];
+
+        if ((p->events & G_IO_IN)) {
+            FD_SET(p->fd, rfds);
+            *max_fd = MAX(*max_fd, p->fd);
+        }
+        if ((p->events & G_IO_OUT)) {
+            FD_SET(p->fd, wfds);
+            *max_fd = MAX(*max_fd, p->fd);
+        }
+        if ((p->events & G_IO_ERR)) {
+            FD_SET(p->fd, xfds);
+            *max_fd = MAX(*max_fd, p->fd);
+        }
+    }
+
+    cur_timeout = (tv->tv_sec * 1000) + ((tv->tv_usec + 500) / 1000);
+    if (timeout >= 0 && timeout < cur_timeout) {
+        tv->tv_sec = timeout / 1000;
+        tv->tv_usec = (timeout % 1000) * 1000;
+    }
+}
+
+static void glib_select_poll(fd_set *rfds, fd_set *wfds, fd_set *xfds,
+                             bool err)
+{
+    GMainContext *context = g_main_context_default();
+
+    if (!err) {
+        int i;
+
+        for (i = 0; i < n_poll_fds; i++) {
+            GPollFD *p = &poll_fds[i];
+
+            if ((p->events & G_IO_IN) && FD_ISSET(p->fd, rfds)) {
+                p->revents |= G_IO_IN;
+            }
+            if ((p->events & G_IO_OUT) && FD_ISSET(p->fd, wfds)) {
+                p->revents |= G_IO_OUT;
+            }
+            if ((p->events & G_IO_ERR) && FD_ISSET(p->fd, xfds)) {
+                p->revents |= G_IO_ERR;
+            }
+        }
+    }
+
+    if (g_main_context_check(context, max_priority, poll_fds, n_poll_fds)) {
+        g_main_context_dispatch(context);
+    }
+}
+
+#ifdef _WIN32
+/***********************************************************/
+/* Polling handling */
+
+typedef struct PollingEntry {
+    PollingFunc *func;
+    void *opaque;
+    struct PollingEntry *next;
+} PollingEntry;
+
+static PollingEntry *first_polling_entry;
+
+int qemu_add_polling_cb(PollingFunc *func, void *opaque)
+{
+    PollingEntry **ppe, *pe;
+    pe = g_malloc0(sizeof(PollingEntry));
+    pe->func = func;
+    pe->opaque = opaque;
+    for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next);
+    *ppe = pe;
+    return 0;
+}
+
+void qemu_del_polling_cb(PollingFunc *func, void *opaque)
+{
+    PollingEntry **ppe, *pe;
+    for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next) {
+        pe = *ppe;
+        if (pe->func == func && pe->opaque == opaque) {
+            *ppe = pe->next;
+            g_free(pe);
+            break;
+        }
+    }
+}
+
+/***********************************************************/
+/* Wait objects support */
+typedef struct WaitObjects {
+    int num;
+    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+    WaitObjectFunc *func[MAXIMUM_WAIT_OBJECTS + 1];
+    void *opaque[MAXIMUM_WAIT_OBJECTS + 1];
+} WaitObjects;
+
+static WaitObjects wait_objects = {0};
+
+int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
+{
+    WaitObjects *w = &wait_objects;
+    if (w->num >= MAXIMUM_WAIT_OBJECTS) {
+        return -1;
+    }
+    w->events[w->num] = handle;
+    w->func[w->num] = func;
+    w->opaque[w->num] = opaque;
+    w->num++;
+    return 0;
+}
+
+void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
+{
+    int i, found;
+    WaitObjects *w = &wait_objects;
+
+    found = 0;
+    for (i = 0; i < w->num; i++) {
+        if (w->events[i] == handle) {
+            found = 1;
+        }
+        if (found) {
+            w->events[i] = w->events[i + 1];
+            w->func[i] = w->func[i + 1];
+            w->opaque[i] = w->opaque[i + 1];
+        }
+    }
+    if (found) {
+        w->num--;
+    }
+}
+
+static void os_host_main_loop_wait(int *timeout)
+{
+    int ret, ret2, i;
+    PollingEntry *pe;
+
+    /* XXX: need to suppress polling by better using win32 events */
+    ret = 0;
+    for (pe = first_polling_entry; pe != NULL; pe = pe->next) {
+        ret |= pe->func(pe->opaque);
+    }
+    if (ret == 0) {
+        int err;
+        WaitObjects *w = &wait_objects;
+
+        qemu_mutex_unlock_iothread();
+        ret = WaitForMultipleObjects(w->num, w->events, FALSE, *timeout);
+        qemu_mutex_lock_iothread();
+        if (WAIT_OBJECT_0 + 0 <= ret && ret <= WAIT_OBJECT_0 + w->num - 1) {
+            if (w->func[ret - WAIT_OBJECT_0]) {
+                w->func[ret - WAIT_OBJECT_0](w->opaque[ret - WAIT_OBJECT_0]);
+            }
+
+            /* Check for additional signaled events */
+            for (i = (ret - WAIT_OBJECT_0 + 1); i < w->num; i++) {
+                /* Check if event is signaled */
+                ret2 = WaitForSingleObject(w->events[i], 0);
+                if (ret2 == WAIT_OBJECT_0) {
+                    if (w->func[i]) {
+                        w->func[i](w->opaque[i]);
+                    }
+                } else if (ret2 != WAIT_TIMEOUT) {
+                    err = GetLastError();
+                    fprintf(stderr, "WaitForSingleObject error %d %d\n", i, err);
+                }
+            }
+        } else if (ret != WAIT_TIMEOUT) {
+            err = GetLastError();
+            fprintf(stderr, "WaitForMultipleObjects error %d %d\n", ret, err);
+        }
+    }
+
+    *timeout = 0;
+}
+#else
+static inline void os_host_main_loop_wait(int *timeout)
+{
+}
+#endif
+
+int main_loop_wait(int nonblocking)
+{
+    fd_set rfds, wfds, xfds;
+    int ret, nfds;
+    struct timeval tv;
+    int timeout;
+
+    if (nonblocking) {
+        timeout = 0;
+    } else {
+        timeout = qemu_calculate_timeout();
+        qemu_bh_update_timeout(&timeout);
+    }
+
+    os_host_main_loop_wait(&timeout);
+
+    tv.tv_sec = timeout / 1000;
+    tv.tv_usec = (timeout % 1000) * 1000;
+
+    /* poll any events */
+    /* XXX: separate device handlers from system ones */
+    nfds = -1;
+    FD_ZERO(&rfds);
+    FD_ZERO(&wfds);
+    FD_ZERO(&xfds);
+
+#ifdef CONFIG_SLIRP
+    slirp_select_fill(&nfds, &rfds, &wfds, &xfds);
+#endif
+    qemu_iohandler_fill(&nfds, &rfds, &wfds, &xfds);
+    glib_select_fill(&nfds, &rfds, &wfds, &xfds, &tv);
+
+    if (timeout > 0) {
+        qemu_mutex_unlock_iothread();
+    }
+
+    ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv);
+
+    if (timeout > 0) {
+        qemu_mutex_lock_iothread();
+    }
+
+    glib_select_poll(&rfds, &wfds, &xfds, (ret < 0));
+    qemu_iohandler_poll(&rfds, &wfds, &xfds, ret);
+#ifdef CONFIG_SLIRP
+    slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));
+#endif
+
+    qemu_run_all_timers();
+
+    /* Check bottom-halves last in case any of the earlier events triggered
+       them.  */
+    qemu_bh_poll();
+
+    return ret;
+}
--- a/main-loop.h
+++ b/main-loop.h
@ -0,0 +1,351 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_MAIN_LOOP_H
+#define QEMU_MAIN_LOOP_H 1
+
+#ifdef SIGRTMIN
+#define SIG_IPI (SIGRTMIN+4)
+#else
+#define SIG_IPI SIGUSR1
+#endif
+
+/**
+ * qemu_init_main_loop: Set up the process so that it can run the main loop.
+ *
+ * This includes setting up signal handlers.  It should be called before
+ * any other threads are created.  In addition, threads other than the
+ * main one should block signals that are trapped by the main loop.
+ * For simplicity, you can consider these signals to be safe: SIGUSR1,
+ * SIGUSR2, thread signals (SIGFPE, SIGILL, SIGSEGV, SIGBUS) and real-time
+ * signals if available.  Remember that Windows in practice does not have
+ * signals, though.
+ */
+int qemu_init_main_loop(void);
+
+/**
+ * main_loop_wait: Run one iteration of the main loop.
+ *
+ * If @nonblocking is true, poll for events, otherwise suspend until
+ * one actually occurs.  The main loop usually consists of a loop that
+ * repeatedly calls main_loop_wait(false).
+ *
+ * Main loop services include file descriptor callbacks, bottom halves
+ * and timers (defined in qemu-timer.h).  Bottom halves are similar to timers
+ * that execute immediately, but have a lower overhead and scheduling them
+ * is wait-free, thread-safe and signal-safe.
+ *
+ * It is sometimes useful to put a whole program in a coroutine.  In this
+ * case, the coroutine actually should be started from within the main loop,
+ * so that the main loop can run whenever the coroutine yields.  To do this,
+ * you can use a bottom half to enter the coroutine as soon as the main loop
+ * starts:
+ *
+ *     void enter_co_bh(void *opaque) {
+ *         QEMUCoroutine *co = opaque;
+ *         qemu_coroutine_enter(co, NULL);
+ *     }
+ *
+ *     ...
+ *     QEMUCoroutine *co = qemu_coroutine_create(coroutine_entry);
+ *     QEMUBH *start_bh = qemu_bh_new(enter_co_bh, co);
+ *     qemu_bh_schedule(start_bh);
+ *     while (...) {
+ *         main_loop_wait(false);
+ *     }
+ *
+ * (In the future we may provide a wrapper for this).
+ *
+ * @nonblocking: Whether the caller should block until an event occurs.
+ */
+int main_loop_wait(int nonblocking);
+
+/**
+ * qemu_notify_event: Force processing of pending events.
+ *
+ * Similar to signaling a condition variable, qemu_notify_event forces
+ * main_loop_wait to look at pending events and exit.  The caller of
+ * main_loop_wait will usually call it again very soon, so qemu_notify_event
+ * also has the side effect of recalculating the sets of file descriptors
+ * that the main loop waits for.
+ *
+ * Calling qemu_notify_event is rarely necessary, because main loop
+ * services (bottom halves and timers) call it themselves.  One notable
+ * exception occurs when using qemu_set_fd_handler2 (see below).
+ */
+void qemu_notify_event(void);
+
+#ifdef _WIN32
+/* return TRUE if no sleep should be done afterwards */
+typedef int PollingFunc(void *opaque);
+
+/**
+ * qemu_add_polling_cb: Register a Windows-specific polling callback
+ *
+ * Currently, under Windows some events are polled rather than waited for.
+ * Polling callbacks do not ensure that @func is called timely, because
+ * the main loop might wait for an arbitrarily long time.  If possible,
+ * you should instead create a separate thread that does a blocking poll
+ * and set a Win32 event object.  The event can then be passed to
+ * qemu_add_wait_object.
+ *
+ * Polling callbacks really have nothing Windows specific in them, but
+ * as they are a hack and are currenly not necessary under POSIX systems,
+ * they are only available when QEMU is running under Windows.
+ *
+ * @func: The function that does the polling, and returns 1 to force
+ * immediate completion of main_loop_wait.
+ * @opaque: A pointer-size value that is passed to @func.
+ */
+int qemu_add_polling_cb(PollingFunc *func, void *opaque);
+
+/**
+ * qemu_del_polling_cb: Unregister a Windows-specific polling callback
+ *
+ * This function removes a callback that was registered with
+ * qemu_add_polling_cb.
+ *
+ * @func: The function that was passed to qemu_add_polling_cb.
+ * @opaque: A pointer-size value that was passed to qemu_add_polling_cb.
+ */
+void qemu_del_polling_cb(PollingFunc *func, void *opaque);
+
+/* Wait objects handling */
+typedef void WaitObjectFunc(void *opaque);
+
+/**
+ * qemu_add_wait_object: Register a callback for a Windows handle
+ *
+ * Under Windows, the iohandler mechanism can only be used with sockets.
+ * QEMU must use the WaitForMultipleObjects API to wait on other handles.
+ * This function registers a #HANDLE with QEMU, so that it will be included
+ * in the main loop's calls to WaitForMultipleObjects.  When the handle
+ * is in a signaled state, QEMU will call @func.
+ *
+ * @handle: The Windows handle to be observed.
+ * @func: A function to be called when @handle is in a signaled state.
+ * @opaque: A pointer-size value that is passed to @func.
+ */
+int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque);
+
+/**
+ * qemu_del_wait_object: Unregister a callback for a Windows handle
+ *
+ * This function removes a callback that was registered with
+ * qemu_add_wait_object.
+ *
+ * @func: The function that was passed to qemu_add_wait_object.
+ * @opaque: A pointer-size value that was passed to qemu_add_wait_object.
+ */
+void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque);
+#endif
+
+/* async I/O support */
+
+typedef void IOReadHandler(void *opaque, const uint8_t *buf, int size);
+typedef int IOCanReadHandler(void *opaque);
+typedef void IOHandler(void *opaque);
+
+/**
+ * qemu_set_fd_handler2: Register a file descriptor with the main loop
+ *
+ * This function tells the main loop to wake up whenever one of the
+ * following conditions is true:
+ *
+ * 1) if @fd_write is not %NULL, when the file descriptor is writable;
+ *
+ * 2) if @fd_read is not %NULL, when the file descriptor is readable.
+ *
+ * @fd_read_poll can be used to disable the @fd_read callback temporarily.
+ * This is useful to avoid calling qemu_set_fd_handler2 every time the
+ * client becomes interested in reading (or dually, stops being interested).
+ * A typical example is when @fd is a listening socket and you want to bound
+ * the number of active clients.  Remember to call qemu_notify_event whenever
+ * the condition may change from %false to %true.
+ *
+ * The callbacks that are set up by qemu_set_fd_handler2 are level-triggered.
+ * If @fd_read does not read from @fd, or @fd_write does not write to @fd
+ * until its buffers are full, they will be called again on the next
+ * iteration.
+ *
+ * @fd: The file descriptor to be observed.  Under Windows it must be
+ * a #SOCKET.
+ *
+ * @fd_read_poll: A function that returns 1 if the @fd_read callback
+ * should be fired.  If the function returns 0, the main loop will not
+ * end its iteration even if @fd becomes readable.
+ *
+ * @fd_read: A level-triggered callback that is fired if @fd is readable
+ * at the beginning of a main loop iteration, or if it becomes readable
+ * during one.
+ *
+ * @fd_write: A level-triggered callback that is fired when @fd is writable
+ * at the beginning of a main loop iteration, or if it becomes writable
+ * during one.
+ *
+ * @opaque: A pointer-sized value that is passed to @fd_read_poll,
+ * @fd_read and @fd_write.
+ */
+int qemu_set_fd_handler2(int fd,
+                         IOCanReadHandler *fd_read_poll,
+                         IOHandler *fd_read,
+                         IOHandler *fd_write,
+                         void *opaque);
+
+/**
+ * qemu_set_fd_handler: Register a file descriptor with the main loop
+ *
+ * This function tells the main loop to wake up whenever one of the
+ * following conditions is true:
+ *
+ * 1) if @fd_write is not %NULL, when the file descriptor is writable;
+ *
+ * 2) if @fd_read is not %NULL, when the file descriptor is readable.
+ *
+ * The callbacks that are set up by qemu_set_fd_handler are level-triggered.
+ * If @fd_read does not read from @fd, or @fd_write does not write to @fd
+ * until its buffers are full, they will be called again on the next
+ * iteration.
+ *
+ * @fd: The file descriptor to be observed.  Under Windows it must be
+ * a #SOCKET.
+ *
+ * @fd_read: A level-triggered callback that is fired if @fd is readable
+ * at the beginning of a main loop iteration, or if it becomes readable
+ * during one.
+ *
+ * @fd_write: A level-triggered callback that is fired when @fd is writable
+ * at the beginning of a main loop iteration, or if it becomes writable
+ * during one.
+ *
+ * @opaque: A pointer-sized value that is passed to @fd_read and @fd_write.
+ */
+int qemu_set_fd_handler(int fd,
+                        IOHandler *fd_read,
+                        IOHandler *fd_write,
+                        void *opaque);
+
+typedef struct QEMUBH QEMUBH;
+typedef void QEMUBHFunc(void *opaque);
+
+/**
+ * qemu_bh_new: Allocate a new bottom half structure.
+ *
+ * Bottom halves are lightweight callbacks whose invocation is guaranteed
+ * to be wait-free, thread-safe and signal-safe.  The #QEMUBH structure
+ * is opaque and must be allocated prior to its use.
+ */
+QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
+
+/**
+ * qemu_bh_schedule: Schedule a bottom half.
+ *
+ * Scheduling a bottom half interrupts the main loop and causes the
+ * execution of the callback that was passed to qemu_bh_new.
+ *
+ * Bottom halves that are scheduled from a bottom half handler are instantly
+ * invoked.  This can create an infinite loop if a bottom half handler
+ * schedules itself.
+ *
+ * @bh: The bottom half to be scheduled.
+ */
+void qemu_bh_schedule(QEMUBH *bh);
+
+/**
+ * qemu_bh_cancel: Cancel execution of a bottom half.
+ *
+ * Canceling execution of a bottom half undoes the effect of calls to
+ * qemu_bh_schedule without freeing its resources yet.  While cancellation
+ * itself is also wait-free and thread-safe, it can of course race with the
+ * loop that executes bottom halves unless you are holding the iothread
+ * mutex.  This makes it mostly useless if you are not holding the mutex.
+ *
+ * @bh: The bottom half to be canceled.
+ */
+void qemu_bh_cancel(QEMUBH *bh);
+
+/**
+ *qemu_bh_delete: Cancel execution of a bottom half and free its resources.
+ *
+ * Deleting a bottom half frees the memory that was allocated for it by
+ * qemu_bh_new.  It also implies canceling the bottom half if it was
+ * scheduled.
+ *
+ * @bh: The bottom half to be deleted.
+ */
+void qemu_bh_delete(QEMUBH *bh);
+
+#ifdef CONFIG_POSIX
+/**
+ * qemu_add_child_watch: Register a child process for reaping.
+ *
+ * Under POSIX systems, a parent process must read the exit status of
+ * its child processes using waitpid, or the operating system will not
+ * free some of the resources attached to that process.
+ *
+ * This function directs the QEMU main loop to observe a child process
+ * and call waitpid as soon as it exits; the watch is then removed
+ * automatically.  It is useful whenever QEMU forks a child process
+ * but will find out about its termination by other means such as a
+ * "broken pipe".
+ *
+ * @pid: The pid that QEMU should observe.
+ */
+int qemu_add_child_watch(pid_t pid);
+#endif
+
+/**
+ * qemu_mutex_lock_iothread: Lock the main loop mutex.
+ *
+ * This function locks the main loop mutex.  The mutex is taken by
+ * qemu_init_main_loop and always taken except while waiting on
+ * external events (such as with select).  The mutex should be taken
+ * by threads other than the main loop thread when calling
+ * qemu_bh_new(), qemu_set_fd_handler() and basically all other
+ * functions documented in this file.
+ */
+void qemu_mutex_lock_iothread(void);
+
+/**
+ * qemu_mutex_unlock_iothread: Unlock the main loop mutex.
+ *
+ * This function unlocks the main loop mutex.  The mutex is taken by
+ * qemu_init_main_loop and always taken except while waiting on
+ * external events (such as with select).  The mutex should be unlocked
+ * as soon as possible by threads other than the main loop thread,
+ * because it prevents the main loop from processing callbacks,
+ * including timers and bottom halves.
+ */
+void qemu_mutex_unlock_iothread(void);
+
+/* internal interfaces */
+
+void qemu_iohandler_fill(int *pnfds, fd_set *readfds, fd_set *writefds, fd_set *xfds);
+void qemu_iohandler_poll(fd_set *readfds, fd_set *writefds, fd_set *xfds, int rc);
+
+void qemu_bh_schedule_idle(QEMUBH *bh);
+int qemu_bh_poll(void);
+void qemu_bh_update_timeout(int *timeout);
+
+#endif
--- a/os-win32.c
+++ b/os-win32.c
@ -48,129 +48,6 @@ int setenv(const char *name, const char *value, int overwrite)
    return result;
 }

-/***********************************************************/
-/* Polling handling */
-
-typedef struct PollingEntry {
-    PollingFunc *func;
-    void *opaque;
-    struct PollingEntry *next;
-} PollingEntry;
-
-static PollingEntry *first_polling_entry;
-
-int qemu_add_polling_cb(PollingFunc *func, void *opaque)
-{
-    PollingEntry **ppe, *pe;
-    pe = g_malloc0(sizeof(PollingEntry));
-    pe->func = func;
-    pe->opaque = opaque;
-    for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next);
-    *ppe = pe;
-    return 0;
-}
-
-void qemu_del_polling_cb(PollingFunc *func, void *opaque)
-{
-    PollingEntry **ppe, *pe;
-    for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next) {
-        pe = *ppe;
-        if (pe->func == func && pe->opaque == opaque) {
-            *ppe = pe->next;
-            g_free(pe);
-            break;
-        }
-    }
-}
-
-/***********************************************************/
-/* Wait objects support */
-typedef struct WaitObjects {
-    int num;
-    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
-    WaitObjectFunc *func[MAXIMUM_WAIT_OBJECTS + 1];
-    void *opaque[MAXIMUM_WAIT_OBJECTS + 1];
-} WaitObjects;
-
-static WaitObjects wait_objects = {0};
-
-int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
-{
-    WaitObjects *w = &wait_objects;
-
-    if (w->num >= MAXIMUM_WAIT_OBJECTS)
-        return -1;
-    w->events[w->num] = handle;
-    w->func[w->num] = func;
-    w->opaque[w->num] = opaque;
-    w->num++;
-    return 0;
-}
-
-void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
-{
-    int i, found;
-    WaitObjects *w = &wait_objects;
-
-    found = 0;
-    for (i = 0; i < w->num; i++) {
-        if (w->events[i] == handle)
-            found = 1;
-        if (found) {
-            w->events[i] = w->events[i + 1];
-            w->func[i] = w->func[i + 1];
-            w->opaque[i] = w->opaque[i + 1];
-        }
-    }
-    if (found)
-        w->num--;
-}
-
-void os_host_main_loop_wait(int *timeout)
-{
-    int ret, ret2, i;
-    PollingEntry *pe;
-
-    /* XXX: need to suppress polling by better using win32 events */
-    ret = 0;
-    for(pe = first_polling_entry; pe != NULL; pe = pe->next) {
-        ret |= pe->func(pe->opaque);
-    }
-    if (ret == 0) {
-        int err;
-        WaitObjects *w = &wait_objects;
-
-        qemu_mutex_unlock_iothread();
-        ret = WaitForMultipleObjects(w->num, w->events, FALSE, *timeout);
-        qemu_mutex_lock_iothread();
-        if (WAIT_OBJECT_0 + 0 <= ret && ret <= WAIT_OBJECT_0 + w->num - 1) {
-            if (w->func[ret - WAIT_OBJECT_0])
-                w->func[ret - WAIT_OBJECT_0](w->opaque[ret - WAIT_OBJECT_0]);
-
-            /* Check for additional signaled events */
-            for(i = (ret - WAIT_OBJECT_0 + 1); i < w->num; i++) {
-
-                /* Check if event is signaled */
-                ret2 = WaitForSingleObject(w->events[i], 0);
-                if(ret2 == WAIT_OBJECT_0) {
-                    if (w->func[i])
-                        w->func[i](w->opaque[i]);
-                } else if (ret2 == WAIT_TIMEOUT) {
-                } else {
-                    err = GetLastError();
-                    fprintf(stderr, "WaitForSingleObject error %d %d\n", i, err);
-                }
-            }
-        } else if (ret == WAIT_TIMEOUT) {
-        } else {
-            err = GetLastError();
-            fprintf(stderr, "WaitForMultipleObjects error %d %d\n", ret, err);
-        }
-    }
-
-    *timeout = 0;
-}
-
 static BOOL WINAPI qemu_ctrl_handler(DWORD type)
 {
    exit(STATUS_CONTROL_C_EXIT);
--- a/qemu-char.h
+++ b/qemu-char.h
@ -7,6 +7,7 @@
 #include "qemu-config.h"
 #include "qobject.h"
 #include "qstring.h"
+#include "main-loop.h"

 /* character device */

@ -237,15 +238,4 @@ void qemu_chr_close_mem(CharDriverState *chr);
 QString *qemu_chr_mem_to_qs(CharDriverState *chr);
 size_t qemu_chr_mem_osize(const CharDriverState *chr);

-/* async I/O support */
-
-int qemu_set_fd_handler2(int fd,
-                         IOCanReadHandler *fd_read_poll,
-                         IOHandler *fd_read,
-                         IOHandler *fd_write,
-                         void *opaque);
-int qemu_set_fd_handler(int fd,
-                        IOHandler *fd_read,
-                        IOHandler *fd_write,
-                        void *opaque);
 #endif
--- a/qemu-common.h
+++ b/qemu-common.h
@ -13,7 +13,6 @@

 typedef struct QEMUTimer QEMUTimer;
 typedef struct QEMUFile QEMUFile;
-typedef struct QEMUBH QEMUBH;
 typedef struct DeviceState DeviceState;

 struct Monitor;
@ -96,6 +95,10 @@ static inline char *realpath(const char *path, char *resolved_path)
 }
 #endif

+/* icount */
+void configure_icount(const char *option);
+extern int use_icount;
+
 /* FIXME: Remove NEED_CPU_H.  */
 #ifndef NEED_CPU_H

@ -113,23 +116,6 @@ static inline char *realpath(const char *path, char *resolved_path)
 int qemu_main(int argc, char **argv, char **envp);
 #endif

-/* bottom halves */
-typedef void QEMUBHFunc(void *opaque);
-
-QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
-void qemu_bh_schedule(QEMUBH *bh);
-/* Bottom halfs that are scheduled from a bottom half handler are instantly
- * invoked.  This can create an infinite loop if a bottom half handler
- * schedules itself.  qemu_bh_schedule_idle() avoids this infinite loop by
- * ensuring that the bottom half isn't executed until the next main loop
- * iteration.
- */
-void qemu_bh_schedule_idle(QEMUBH *bh);
-void qemu_bh_cancel(QEMUBH *bh);
-void qemu_bh_delete(QEMUBH *bh);
-int qemu_bh_poll(void);
-void qemu_bh_update_timeout(int *timeout);
-
 void qemu_get_timedate(struct tm *tm, int offset);
 int qemu_timedate_diff(struct tm *tm);

@ -183,16 +169,12 @@ const char *path(const char *pathname);

 void *qemu_oom_check(void *ptr);

-void qemu_mutex_lock_iothread(void);
-void qemu_mutex_unlock_iothread(void);
-
 int qemu_open(const char *name, int flags, ...);
 ssize_t qemu_write_full(int fd, const void *buf, size_t count)
    QEMU_WARN_UNUSED_RESULT;
 void qemu_set_cloexec(int fd);

 #ifndef _WIN32
-int qemu_add_child_watch(pid_t pid);
 int qemu_eventfd(int pipefd[2]);
 int qemu_pipe(int pipefd[2]);
 #endif
@ -207,14 +189,6 @@ int qemu_pipe(int pipefd[2]);

 void QEMU_NORETURN hw_error(const char *fmt, ...) GCC_FMT_ATTR(1, 2);

-/* IO callbacks.  */
-typedef void IOReadHandler(void *opaque, const uint8_t *buf, int size);
-typedef int IOCanReadHandler(void *opaque);
-typedef void IOHandler(void *opaque);
-
-void qemu_iohandler_fill(int *pnfds, fd_set *readfds, fd_set *writefds, fd_set *xfds);
-void qemu_iohandler_poll(fd_set *readfds, fd_set *writefds, fd_set *xfds, int rc);
-
 struct ParallelIOArg {
    void *buffer;
    int count;
@ -276,9 +250,6 @@ void cpu_exec_init_all(void);
 void cpu_save(QEMUFile *f, void *opaque);
 int cpu_load(QEMUFile *f, void *opaque, int version_id);

-/* Force QEMU to process pending events */
-void qemu_notify_event(void);
-
 /* Unblock cpu */
 void qemu_cpu_kick(void *env);
 void qemu_cpu_kick_self(void);
--- a/qemu-coroutine-lock.c
+++ b/qemu-coroutine-lock.c
@ -26,6 +26,7 @@
 #include "qemu-coroutine.h"
 #include "qemu-coroutine-int.h"
 #include "qemu-queue.h"
+#include "main-loop.h"
 #include "trace.h"

 static QTAILQ_HEAD(, Coroutine) unlock_bh_queue =
--- a/qemu-os-posix.h
+++ b/qemu-os-posix.h
@ -26,10 +26,6 @@
 #ifndef QEMU_OS_POSIX_H
 #define QEMU_OS_POSIX_H

-static inline void os_host_main_loop_wait(int *timeout)
-{
-}
-
 void os_set_line_buffering(void);
 void os_set_proc_name(const char *s);
 void os_setup_signal_handling(void);
--- a/qemu-os-win32.h
+++ b/qemu-os-win32.h
@ -28,26 +28,11 @@

 #include <windows.h>
 #include <winsock2.h>
+#include "main-loop.h"

 /* Declaration of ffs() is missing in MinGW's strings.h. */
 int ffs(int i);

-/* Polling handling */
-
-/* return TRUE if no sleep should be done afterwards */
-typedef int PollingFunc(void *opaque);
-
-int qemu_add_polling_cb(PollingFunc *func, void *opaque);
-void qemu_del_polling_cb(PollingFunc *func, void *opaque);
-
-/* Wait objects handling */
-typedef void WaitObjectFunc(void *opaque);
-
-int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque);
-void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque);
-
-void os_host_main_loop_wait(int *timeout);
-
 static inline void os_setup_signal_handling(void) {}
 static inline void os_daemonize(void) {}
 static inline void os_setup_post(void) {}
--- a/qemu-timer.c
+++ b/qemu-timer.c
@ -46,82 +46,6 @@

 #include "qemu-timer.h"

-/* Conversion factor from emulated instructions to virtual clock ticks.  */
-int icount_time_shift;
-/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
-#define MAX_ICOUNT_SHIFT 10
-/* Compensate for varying guest execution speed.  */
-int64_t qemu_icount_bias;
-static QEMUTimer *icount_rt_timer;
-static QEMUTimer *icount_vm_timer;
-
-/***********************************************************/
-/* guest cycle counter */
-
-typedef struct TimersState {
-    int64_t cpu_ticks_prev;
-    int64_t cpu_ticks_offset;
-    int64_t cpu_clock_offset;
-    int32_t cpu_ticks_enabled;
-    int64_t dummy;
-} TimersState;
-
-TimersState timers_state;
-
-/* return the host CPU cycle counter and handle stop/restart */
-int64_t cpu_get_ticks(void)
-{
-    if (use_icount) {
-        return cpu_get_icount();
-    }
-    if (!timers_state.cpu_ticks_enabled) {
-        return timers_state.cpu_ticks_offset;
-    } else {
-        int64_t ticks;
-        ticks = cpu_get_real_ticks();
-        if (timers_state.cpu_ticks_prev > ticks) {
-            /* Note: non increasing ticks may happen if the host uses
-               software suspend */
-            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
-        }
-        timers_state.cpu_ticks_prev = ticks;
-        return ticks + timers_state.cpu_ticks_offset;
-    }
-}
-
-/* return the host CPU monotonic timer and handle stop/restart */
-static int64_t cpu_get_clock(void)
-{
-    int64_t ti;
-    if (!timers_state.cpu_ticks_enabled) {
-        return timers_state.cpu_clock_offset;
-    } else {
-        ti = get_clock();
-        return ti + timers_state.cpu_clock_offset;
-    }
-}
-
-/* enable cpu_get_ticks() */
-void cpu_enable_ticks(void)
-{
-    if (!timers_state.cpu_ticks_enabled) {
-        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
-        timers_state.cpu_clock_offset -= get_clock();
-        timers_state.cpu_ticks_enabled = 1;
-    }
-}
-
-/* disable cpu_get_ticks() : the clock is stopped. You must not call
-   cpu_get_ticks() after that.  */
-void cpu_disable_ticks(void)
-{
-    if (timers_state.cpu_ticks_enabled) {
-        timers_state.cpu_ticks_offset = cpu_get_ticks();
-        timers_state.cpu_clock_offset = cpu_get_clock();
-        timers_state.cpu_ticks_enabled = 0;
-    }
-}
-
 /***********************************************************/
 /* timers */

@ -133,7 +57,7 @@ struct QEMUClock {
    int type;
    int enabled;

-    QEMUTimer *warp_timer;
+    QEMUTimer *active_timers;

    NotifierList reset_notifiers;
    int64_t last;
@ -152,7 +76,7 @@ struct qemu_alarm_timer {
    char const *name;
    int (*start)(struct qemu_alarm_timer *t);
    void (*stop)(struct qemu_alarm_timer *t);
-    void (*rearm)(struct qemu_alarm_timer *t);
+    void (*rearm)(struct qemu_alarm_timer *t, int64_t nearest_delta_ns);
 #if defined(__linux__)
    int fd;
    timer_t timer;
@ -180,12 +104,46 @@ static inline int alarm_has_dynticks(struct qemu_alarm_timer *t)
    return !!t->rearm;
 }

+static int64_t qemu_next_alarm_deadline(void)
+{
+    int64_t delta;
+    int64_t rtdelta;
+
+    if (!use_icount && vm_clock->active_timers) {
+        delta = vm_clock->active_timers->expire_time -
+                     qemu_get_clock_ns(vm_clock);
+    } else {
+        delta = INT32_MAX;
+    }
+    if (host_clock->active_timers) {
+        int64_t hdelta = host_clock->active_timers->expire_time -
+                 qemu_get_clock_ns(host_clock);
+        if (hdelta < delta) {
+            delta = hdelta;
+        }
+    }
+    if (rt_clock->active_timers) {
+        rtdelta = (rt_clock->active_timers->expire_time -
+                 qemu_get_clock_ns(rt_clock));
+        if (rtdelta < delta) {
+            delta = rtdelta;
+        }
+    }
+
+    return delta;
+}
+
 static void qemu_rearm_alarm_timer(struct qemu_alarm_timer *t)
 {
-    if (!alarm_has_dynticks(t))
+    int64_t nearest_delta_ns;
+    assert(alarm_has_dynticks(t));
+    if (!rt_clock->active_timers &&
+        !vm_clock->active_timers &&
+        !host_clock->active_timers) {
        return;
-
-    t->rearm(t);
+    }
+    nearest_delta_ns = qemu_next_alarm_deadline();
+    t->rearm(t, nearest_delta_ns);
 }

 /* TODO: MIN_TIMER_REARM_NS should be optimized */
@ -195,83 +153,28 @@ static void qemu_rearm_alarm_timer(struct qemu_alarm_timer *t)

 static int mm_start_timer(struct qemu_alarm_timer *t);
 static void mm_stop_timer(struct qemu_alarm_timer *t);
-static void mm_rearm_timer(struct qemu_alarm_timer *t);
+static void mm_rearm_timer(struct qemu_alarm_timer *t, int64_t delta);

 static int win32_start_timer(struct qemu_alarm_timer *t);
 static void win32_stop_timer(struct qemu_alarm_timer *t);
-static void win32_rearm_timer(struct qemu_alarm_timer *t);
+static void win32_rearm_timer(struct qemu_alarm_timer *t, int64_t delta);

 #else

 static int unix_start_timer(struct qemu_alarm_timer *t);
 static void unix_stop_timer(struct qemu_alarm_timer *t);
-static void unix_rearm_timer(struct qemu_alarm_timer *t);
+static void unix_rearm_timer(struct qemu_alarm_timer *t, int64_t delta);

 #ifdef __linux__

 static int dynticks_start_timer(struct qemu_alarm_timer *t);
 static void dynticks_stop_timer(struct qemu_alarm_timer *t);
-static void dynticks_rearm_timer(struct qemu_alarm_timer *t);
+static void dynticks_rearm_timer(struct qemu_alarm_timer *t, int64_t delta);

 #endif /* __linux__ */

 #endif /* _WIN32 */

-/* Correlation between real and virtual time is always going to be
-   fairly approximate, so ignore small variation.
-   When the guest is idle real and virtual time will be aligned in
-   the IO wait loop.  */
-#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
-
-static void icount_adjust(void)
-{
-    int64_t cur_time;
-    int64_t cur_icount;
-    int64_t delta;
-    static int64_t last_delta;
-    /* If the VM is not running, then do nothing.  */
-    if (!runstate_is_running())
-        return;
-
-    cur_time = cpu_get_clock();
-    cur_icount = qemu_get_clock_ns(vm_clock);
-    delta = cur_icount - cur_time;
-    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
-    if (delta > 0
-        && last_delta + ICOUNT_WOBBLE < delta * 2
-        && icount_time_shift > 0) {
-        /* The guest is getting too far ahead.  Slow time down.  */
-        icount_time_shift--;
-    }
-    if (delta < 0
-        && last_delta - ICOUNT_WOBBLE > delta * 2
-        && icount_time_shift < MAX_ICOUNT_SHIFT) {
-        /* The guest is getting too far behind.  Speed time up.  */
-        icount_time_shift++;
-    }
-    last_delta = delta;
-    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
-}
-
-static void icount_adjust_rt(void * opaque)
-{
-    qemu_mod_timer(icount_rt_timer,
-                   qemu_get_clock_ms(rt_clock) + 1000);
-    icount_adjust();
-}
-
-static void icount_adjust_vm(void * opaque)
-{
-    qemu_mod_timer(icount_vm_timer,
-                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
-    icount_adjust();
-}
-
-int64_t qemu_icount_round(int64_t count)
-{
-    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
-}
-
 static struct qemu_alarm_timer alarm_timers[] = {
 #ifndef _WIN32
 #ifdef __linux__
@ -352,14 +255,10 @@ next:
    }
 }

-#define QEMU_NUM_CLOCKS 3
-
 QEMUClock *rt_clock;
 QEMUClock *vm_clock;
 QEMUClock *host_clock;

-static QEMUTimer *active_timers[QEMU_NUM_CLOCKS];
-
 static QEMUClock *qemu_new_clock(int type)
 {
    QEMUClock *clock;
@ -367,101 +266,43 @@ static QEMUClock *qemu_new_clock(int type)
    clock = g_malloc0(sizeof(QEMUClock));
    clock->type = type;
    clock->enabled = 1;
+    clock->last = INT64_MIN;
    notifier_list_init(&clock->reset_notifiers);
-    /* required to detect & report backward jumps */
-    if (type == QEMU_CLOCK_HOST) {
-        clock->last = get_clock_realtime();
-    }
    return clock;
 }

 void qemu_clock_enable(QEMUClock *clock, int enabled)
 {
+    bool old = clock->enabled;
    clock->enabled = enabled;
+    if (enabled && !old) {
+        qemu_rearm_alarm_timer(alarm_timer);
+    }
 }

-static int64_t vm_clock_warp_start;
-
-static void icount_warp_rt(void *opaque)
+int64_t qemu_clock_has_timers(QEMUClock *clock)
 {
-    if (vm_clock_warp_start == -1) {
-        return;
-    }
-
-    if (runstate_is_running()) {
-        int64_t clock = qemu_get_clock_ns(rt_clock);
-        int64_t warp_delta = clock - vm_clock_warp_start;
-        if (use_icount == 1) {
-            qemu_icount_bias += warp_delta;
-        } else {
-            /*
-             * In adaptive mode, do not let the vm_clock run too
-             * far ahead of real time.
-             */
-            int64_t cur_time = cpu_get_clock();
-            int64_t cur_icount = qemu_get_clock_ns(vm_clock);
-            int64_t delta = cur_time - cur_icount;
-            qemu_icount_bias += MIN(warp_delta, delta);
-        }
-        if (qemu_timer_expired(active_timers[QEMU_CLOCK_VIRTUAL],
-                               qemu_get_clock_ns(vm_clock))) {
-            qemu_notify_event();
-        }
-    }
-    vm_clock_warp_start = -1;
+    return !!clock->active_timers;
 }

-void qemu_clock_warp(QEMUClock *clock)
+int64_t qemu_clock_expired(QEMUClock *clock)
 {
-    int64_t deadline;
+    return (clock->active_timers &&
+            clock->active_timers->expire_time < qemu_get_clock_ns(clock));
+}

-    if (!clock->warp_timer) {
-        return;
+int64_t qemu_clock_deadline(QEMUClock *clock)
+{
+    /* To avoid problems with overflow limit this to 2^32.  */
+    int64_t delta = INT32_MAX;
+
+    if (clock->active_timers) {
+        delta = clock->active_timers->expire_time - qemu_get_clock_ns(clock);
    }
-
-    /*
-     * There are too many global variables to make the "warp" behavior
-     * applicable to other clocks.  But a clock argument removes the
-     * need for if statements all over the place.
-     */
-    assert(clock == vm_clock);
-
-    /*
-     * If the CPUs have been sleeping, advance the vm_clock timer now.  This
-     * ensures that the deadline for the timer is computed correctly below.
-     * This also makes sure that the insn counter is synchronized before the
-     * CPU starts running, in case the CPU is woken by an event other than
-     * the earliest vm_clock timer.
-     */
-    icount_warp_rt(NULL);
-    if (!all_cpu_threads_idle() || !active_timers[clock->type]) {
-        qemu_del_timer(clock->warp_timer);
-        return;
-    }
-
-    vm_clock_warp_start = qemu_get_clock_ns(rt_clock);
-    deadline = qemu_next_icount_deadline();
-    if (deadline > 0) {
-        /*
-         * Ensure the vm_clock proceeds even when the virtual CPU goes to
-         * sleep.  Otherwise, the CPU might be waiting for a future timer
-         * interrupt to wake it up, but the interrupt never comes because
-         * the vCPU isn't running any insns and thus doesn't advance the
-         * vm_clock.
-         *
-         * An extreme solution for this problem would be to never let VCPUs
-         * sleep in icount mode if there is a pending vm_clock timer; rather
-         * time could just advance to the next vm_clock event.  Instead, we
-         * do stop VCPUs and only advance vm_clock after some "real" time,
-         * (related to the time left until the next event) has passed.  This
-         * rt_clock timer will do this.  This avoids that the warps are too
-         * visible externally---for example, you will not be sending network
-         * packets continously instead of every 100ms.
-         */
-        qemu_mod_timer(clock->warp_timer, vm_clock_warp_start + deadline);
-    } else {
-        qemu_notify_event();
+    if (delta < 0) {
+        delta = 0;
    }
+    return delta;
 }

 QEMUTimer *qemu_new_timer(QEMUClock *clock, int scale,
@ -489,7 +330,7 @@ void qemu_del_timer(QEMUTimer *ts)

    /* NOTE: this code must be signal safe because
       qemu_timer_expired() can be called from a signal. */
-    pt = &active_timers[ts->clock->type];
+    pt = &ts->clock->active_timers;
    for(;;) {
        t = *pt;
        if (!t)
@ -504,7 +345,7 @@ void qemu_del_timer(QEMUTimer *ts)

 /* modify the current timer so that it will be fired when current_time
   >= expire_time. The corresponding callback will be called. */
-static void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
+void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
 {
    QEMUTimer **pt, *t;

@ -513,7 +354,7 @@ static void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
    /* add the timer in the sorted list */
    /* NOTE: this code must be signal safe because
       qemu_timer_expired() can be called from a signal. */
-    pt = &active_timers[ts->clock->type];
+    pt = &ts->clock->active_timers;
    for(;;) {
        t = *pt;
        if (!qemu_timer_expired_ns(t, expire_time)) {
@ -526,7 +367,7 @@ static void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
    *pt = ts;

    /* Rearm if necessary  */
-    if (pt == &active_timers[ts->clock->type]) {
+    if (pt == &ts->clock->active_timers) {
        if (!alarm_timer->pending) {
            qemu_rearm_alarm_timer(alarm_timer);
        }
@ -538,8 +379,6 @@ static void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time)
    }
 }

-/* modify the current timer so that it will be fired when current_time
-   >= expire_time. The corresponding callback will be called. */
 void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time)
 {
    qemu_mod_timer_ns(ts, expire_time * ts->scale);
@ -548,7 +387,7 @@ void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time)
 int qemu_timer_pending(QEMUTimer *ts)
 {
    QEMUTimer *t;
-    for(t = active_timers[ts->clock->type]; t != NULL; t = t->next) {
+    for (t = ts->clock->active_timers; t != NULL; t = t->next) {
        if (t == ts)
            return 1;
    }
@ -569,7 +408,7 @@ static void qemu_run_timers(QEMUClock *clock)
        return;

    current_time = qemu_get_clock_ns(clock);
-    ptimer_head = &active_timers[clock->type];
+    ptimer_head = &clock->active_timers;
    for(;;) {
        ts = *ptimer_head;
        if (!qemu_timer_expired_ns(ts, current_time)) {
@ -624,79 +463,11 @@ void init_clocks(void)
    rt_clock = qemu_new_clock(QEMU_CLOCK_REALTIME);
    vm_clock = qemu_new_clock(QEMU_CLOCK_VIRTUAL);
    host_clock = qemu_new_clock(QEMU_CLOCK_HOST);
-
-    rtc_clock = host_clock;
 }

-/* save a timer */
-void qemu_put_timer(QEMUFile *f, QEMUTimer *ts)
+uint64_t qemu_timer_expire_time_ns(QEMUTimer *ts)
 {
-    uint64_t expire_time;
-
-    if (qemu_timer_pending(ts)) {
-        expire_time = ts->expire_time;
-    } else {
-        expire_time = -1;
-    }
-    qemu_put_be64(f, expire_time);
-}
-
-void qemu_get_timer(QEMUFile *f, QEMUTimer *ts)
-{
-    uint64_t expire_time;
-
-    expire_time = qemu_get_be64(f);
-    if (expire_time != -1) {
-        qemu_mod_timer_ns(ts, expire_time);
-    } else {
-        qemu_del_timer(ts);
-    }
-}
-
-static const VMStateDescription vmstate_timers = {
-    .name = "timer",
-    .version_id = 2,
-    .minimum_version_id = 1,
-    .minimum_version_id_old = 1,
-    .fields      = (VMStateField []) {
-        VMSTATE_INT64(cpu_ticks_offset, TimersState),
-        VMSTATE_INT64(dummy, TimersState),
-        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
-        VMSTATE_END_OF_LIST()
-    }
-};
-
-void configure_icount(const char *option)
-{
-    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
-    if (!option)
-        return;
-
-    vm_clock->warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL);
-
-    if (strcmp(option, "auto") != 0) {
-        icount_time_shift = strtol(option, NULL, 0);
-        use_icount = 1;
-        return;
-    }
-
-    use_icount = 2;
-
-    /* 125MIPS seems a reasonable initial guess at the guest speed.
-       It will be corrected fairly quickly anyway.  */
-    icount_time_shift = 3;
-
-    /* Have both realtime and virtual time triggers for speed adjustment.
-       The realtime trigger catches emulated time passing too slowly,
-       the virtual time trigger catches emulated time passing too fast.
-       Realtime triggers occur even when idle, so use them less frequently
-       than VM triggers.  */
-    icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL);
-    qemu_mod_timer(icount_rt_timer,
-                   qemu_get_clock_ms(rt_clock) + 1000);
-    icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL);
-    qemu_mod_timer(icount_vm_timer,
-                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
+    return qemu_timer_pending(ts) ? ts->expire_time : -1;
 }

 void qemu_run_all_timers(void)
@ -710,16 +481,11 @@ void qemu_run_all_timers(void)
    }

    /* vm time timers */
-    if (runstate_is_running()) {
-        qemu_run_timers(vm_clock);
-    }
-
+    qemu_run_timers(vm_clock);
    qemu_run_timers(rt_clock);
    qemu_run_timers(host_clock);
 }

-static int64_t qemu_next_alarm_deadline(void);
-
 #ifdef _WIN32
 static void CALLBACK host_alarm_handler(PVOID lpParam, BOOLEAN unused)
 #else
@ -767,50 +533,6 @@ static void host_alarm_handler(int host_signum)
    }
 }

-int64_t qemu_next_icount_deadline(void)
-{
-    /* To avoid problems with overflow limit this to 2^32.  */
-    int64_t delta = INT32_MAX;
-
-    assert(use_icount);
-    if (active_timers[QEMU_CLOCK_VIRTUAL]) {
-        delta = active_timers[QEMU_CLOCK_VIRTUAL]->expire_time -
-                     qemu_get_clock_ns(vm_clock);
-    }
-
-    if (delta < 0)
-        delta = 0;
-
-    return delta;
-}
-
-static int64_t qemu_next_alarm_deadline(void)
-{
-    int64_t delta;
-    int64_t rtdelta;
-
-    if (!use_icount && active_timers[QEMU_CLOCK_VIRTUAL]) {
-        delta = active_timers[QEMU_CLOCK_VIRTUAL]->expire_time -
-                     qemu_get_clock_ns(vm_clock);
-    } else {
-        delta = INT32_MAX;
-    }
-    if (active_timers[QEMU_CLOCK_HOST]) {
-        int64_t hdelta = active_timers[QEMU_CLOCK_HOST]->expire_time -
-                 qemu_get_clock_ns(host_clock);
-        if (hdelta < delta)
-            delta = hdelta;
-    }
-    if (active_timers[QEMU_CLOCK_REALTIME]) {
-        rtdelta = (active_timers[QEMU_CLOCK_REALTIME]->expire_time -
-                 qemu_get_clock_ns(rt_clock));
-        if (rtdelta < delta)
-            delta = rtdelta;
-    }
-
-    return delta;
-}
-
 #if defined(__linux__)

 #include "compatfd.h"
@ -863,20 +585,13 @@ static void dynticks_stop_timer(struct qemu_alarm_timer *t)
    timer_delete(host_timer);
 }

-static void dynticks_rearm_timer(struct qemu_alarm_timer *t)
+static void dynticks_rearm_timer(struct qemu_alarm_timer *t,
+                                 int64_t nearest_delta_ns)
 {
    timer_t host_timer = t->timer;
    struct itimerspec timeout;
-    int64_t nearest_delta_ns = INT64_MAX;
    int64_t current_ns;

-    assert(alarm_has_dynticks(t));
-    if (!active_timers[QEMU_CLOCK_REALTIME] &&
-        !active_timers[QEMU_CLOCK_VIRTUAL] &&
-        !active_timers[QEMU_CLOCK_HOST])
-        return;
-
-    nearest_delta_ns = qemu_next_alarm_deadline();
    if (nearest_delta_ns < MIN_TIMER_REARM_NS)
        nearest_delta_ns = MIN_TIMER_REARM_NS;

@ -918,19 +633,12 @@ static int unix_start_timer(struct qemu_alarm_timer *t)
    return 0;
 }

-static void unix_rearm_timer(struct qemu_alarm_timer *t)
+static void unix_rearm_timer(struct qemu_alarm_timer *t,
+                             int64_t nearest_delta_ns)
 {
    struct itimerval itv;
-    int64_t nearest_delta_ns = INT64_MAX;
    int err;

-    assert(alarm_has_dynticks(t));
-    if (!active_timers[QEMU_CLOCK_REALTIME] &&
-        !active_timers[QEMU_CLOCK_VIRTUAL] &&
-        !active_timers[QEMU_CLOCK_HOST])
-        return;
-
-    nearest_delta_ns = qemu_next_alarm_deadline();
    if (nearest_delta_ns < MIN_TIMER_REARM_NS)
        nearest_delta_ns = MIN_TIMER_REARM_NS;

@ -1017,23 +725,14 @@ static void mm_stop_timer(struct qemu_alarm_timer *t)
    timeEndPeriod(mm_period);
 }

-static void mm_rearm_timer(struct qemu_alarm_timer *t)
+static void mm_rearm_timer(struct qemu_alarm_timer *t, int64_t delta)
 {
-    int nearest_delta_ms;
-
-    assert(alarm_has_dynticks(t));
-    if (!active_timers[QEMU_CLOCK_REALTIME] &&
-        !active_timers[QEMU_CLOCK_VIRTUAL] &&
-        !active_timers[QEMU_CLOCK_HOST]) {
-        return;
-    }
-
-    timeKillEvent(mm_timer);
-
-    nearest_delta_ms = (qemu_next_alarm_deadline() + 999999) / 1000000;
+    int nearest_delta_ms = (delta + 999999) / 1000000;
    if (nearest_delta_ms < 1) {
        nearest_delta_ms = 1;
    }
+
+    timeKillEvent(mm_timer);
    mm_timer = timeSetEvent(nearest_delta_ms,
                            mm_period,
                            mm_alarm_handler,
@ -1085,19 +784,14 @@ static void win32_stop_timer(struct qemu_alarm_timer *t)
    }
 }

-static void win32_rearm_timer(struct qemu_alarm_timer *t)
+static void win32_rearm_timer(struct qemu_alarm_timer *t,
+                              int64_t nearest_delta_ns)
 {
    HANDLE hTimer = t->timer;
    int nearest_delta_ms;
    BOOLEAN success;

-    assert(alarm_has_dynticks(t));
-    if (!active_timers[QEMU_CLOCK_REALTIME] &&
-        !active_timers[QEMU_CLOCK_VIRTUAL] &&
-        !active_timers[QEMU_CLOCK_HOST])
-        return;
-
-    nearest_delta_ms = (qemu_next_alarm_deadline() + 999999) / 1000000;
+    nearest_delta_ms = (nearest_delta_ns + 999999) / 1000000;
    if (nearest_delta_ms < 1) {
        nearest_delta_ms = 1;
    }
@ -1116,11 +810,11 @@ static void win32_rearm_timer(struct qemu_alarm_timer *t)

 #endif /* _WIN32 */

-static void alarm_timer_on_change_state_rearm(void *opaque, int running,
-                                              RunState state)
+static void quit_timers(void)
 {
-    if (running)
-        qemu_rearm_alarm_timer((struct qemu_alarm_timer *) opaque);
+    struct qemu_alarm_timer *t = alarm_timer;
+    alarm_timer = NULL;
+    t->stop(t);
 }

 int init_timer_alarm(void)
@ -1142,9 +836,9 @@ int init_timer_alarm(void)
    }

    /* first event is at time 0 */
+    atexit(quit_timers);
    t->pending = 1;
    alarm_timer = t;
-    qemu_add_vm_change_state_handler(alarm_timer_on_change_state_rearm, t);

    return 0;

@ -1152,13 +846,6 @@ fail:
    return err;
 }

-void quit_timers(void)
-{
-    struct qemu_alarm_timer *t = alarm_timer;
-    alarm_timer = NULL;
-    t->stop(t);
-}
-
 int qemu_calculate_timeout(void)
 {
    return 1000;
--- a/qemu-timer.h
+++ b/qemu-timer.h
@ -2,6 +2,7 @@
 #define QEMU_TIMER_H

 #include "qemu-common.h"
+#include "main-loop.h"
 #include "notify.h"
 #include <time.h>
 #include <sys/time.h>
@ -38,6 +39,9 @@ extern QEMUClock *vm_clock;
 extern QEMUClock *host_clock;

 int64_t qemu_get_clock_ns(QEMUClock *clock);
+int64_t qemu_clock_has_timers(QEMUClock *clock);
+int64_t qemu_clock_expired(QEMUClock *clock);
+int64_t qemu_clock_deadline(QEMUClock *clock);
 void qemu_clock_enable(QEMUClock *clock, int enabled);
 void qemu_clock_warp(QEMUClock *clock);

@ -49,19 +53,18 @@ QEMUTimer *qemu_new_timer(QEMUClock *clock, int scale,
                          QEMUTimerCB *cb, void *opaque);
 void qemu_free_timer(QEMUTimer *ts);
 void qemu_del_timer(QEMUTimer *ts);
+void qemu_mod_timer_ns(QEMUTimer *ts, int64_t expire_time);
 void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time);
 int qemu_timer_pending(QEMUTimer *ts);
 int qemu_timer_expired(QEMUTimer *timer_head, int64_t current_time);
+uint64_t qemu_timer_expire_time_ns(QEMUTimer *ts);

 void qemu_run_all_timers(void);
 int qemu_alarm_pending(void);
-int64_t qemu_next_icount_deadline(void);
 void configure_alarms(char const *opt);
-void configure_icount(const char *option);
 int qemu_calculate_timeout(void);
 void init_clocks(void);
 int init_timer_alarm(void);
-void quit_timers(void);

 int64_t cpu_get_ticks(void);
 void cpu_enable_ticks(void);
@ -150,12 +153,8 @@ void ptimer_run(ptimer_state *s, int oneshot);
 void ptimer_stop(ptimer_state *s);

 /* icount */
-int64_t qemu_icount_round(int64_t count);
-extern int64_t qemu_icount;
-extern int use_icount;
-extern int icount_time_shift;
-extern int64_t qemu_icount_bias;
 int64_t cpu_get_icount(void);
+int64_t cpu_get_clock(void);

 /*******************************************/
 /* host CPU ticks (if available) */
@ -311,22 +310,6 @@ static inline int64_t cpu_get_real_ticks (void)
 }
 #endif

-#ifdef NEED_CPU_H
-/* Deterministic execution requires that IO only be performed on the last
-   instruction of a TB so that interrupts take effect immediately.  */
-static inline int can_do_io(CPUState *env)
-{
-    if (!use_icount)
-        return 1;
-
-    /* If not executing code then assume we are ok.  */
-    if (!env->current_tb)
-        return 1;
-
-    return env->can_do_io != 0;
-}
-#endif
-
 #ifdef CONFIG_PROFILER
 static inline int64_t profile_getclock(void)
 {
--- a/savevm.c
+++ b/savevm.c
@ -81,6 +81,7 @@
 #include "migration.h"
 #include "qemu_socket.h"
 #include "qemu-queue.h"
+#include "qemu-timer.h"
 #include "cpus.h"

 #define SELF_ANNOUNCE_ROUNDS 5
@ -712,6 +713,30 @@ uint64_t qemu_get_be64(QEMUFile *f)
    return v;
 }

+
+/* timer */
+
+void qemu_put_timer(QEMUFile *f, QEMUTimer *ts)
+{
+    uint64_t expire_time;
+
+    expire_time = qemu_timer_expire_time_ns(ts);
+    qemu_put_be64(f, expire_time);
+}
+
+void qemu_get_timer(QEMUFile *f, QEMUTimer *ts)
+{
+    uint64_t expire_time;
+
+    expire_time = qemu_get_be64(f);
+    if (expire_time != -1) {
+        qemu_mod_timer_ns(ts, expire_time);
+    } else {
+        qemu_del_timer(ts);
+    }
+}
+
+
 /* bool */

 static int get_bool(QEMUFile *f, void *pv, size_t size)
--- a/slirp/libslirp.h
+++ b/slirp/libslirp.h
@ -3,8 +3,6 @@

 #include "qemu-common.h"

-#ifdef CONFIG_SLIRP
-
 struct Slirp;
 typedef struct Slirp Slirp;

@ -44,13 +42,4 @@ void slirp_socket_recv(Slirp *slirp, struct in_addr guest_addr,
 size_t slirp_socket_can_recv(Slirp *slirp, struct in_addr guest_addr,
                             int guest_port);

-#else /* !CONFIG_SLIRP */
-
-static inline void slirp_select_fill(int *pnfds, fd_set *readfds,
-                                     fd_set *writefds, fd_set *xfds) { }
-
-static inline void slirp_select_poll(fd_set *readfds, fd_set *writefds,
-                                     fd_set *xfds, int select_error) { }
-#endif /* !CONFIG_SLIRP */
-
 #endif
--- a/sysemu.h
+++ b/sysemu.h
@ -8,6 +8,7 @@
 #include "qemu-timer.h"
 #include "qapi-types.h"
 #include "notify.h"
+#include "main-loop.h"

 /* vl.c */

@ -64,8 +65,6 @@ void do_info_snapshots(Monitor *mon);

 void qemu_announce_self(void);

-int main_loop_wait(int nonblocking);
-
 bool qemu_savevm_state_blocked(Monitor *mon);
 int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable,
                            int shared);
--- a/vl.c
+++ b/vl.c
@ -148,6 +148,7 @@ int main(int argc, char **argv)
 #include "qemu-objects.h"
 #include "qemu-options.h"
 #include "qmp-commands.h"
+#include "main-loop.h"
 #ifdef CONFIG_VIRTFS
 #include "fsdev/qemu-fsdev.h"
 #endif
@ -1425,142 +1426,51 @@ void qemu_system_vmstop_request(RunState state)
    qemu_notify_event();
 }

-static GPollFD poll_fds[1024 * 2]; /* this is probably overkill */
-static int n_poll_fds;
-static int max_priority;
-
-static void glib_select_fill(int *max_fd, fd_set *rfds, fd_set *wfds,
-                             fd_set *xfds, struct timeval *tv)
-{
-    GMainContext *context = g_main_context_default();
-    int i;
-    int timeout = 0, cur_timeout;
-
-    g_main_context_prepare(context, &max_priority);
-
-    n_poll_fds = g_main_context_query(context, max_priority, &timeout,
-                                      poll_fds, ARRAY_SIZE(poll_fds));
-    g_assert(n_poll_fds <= ARRAY_SIZE(poll_fds));
-
-    for (i = 0; i < n_poll_fds; i++) {
-        GPollFD *p = &poll_fds[i];
-
-        if ((p->events & G_IO_IN)) {
-            FD_SET(p->fd, rfds);
-            *max_fd = MAX(*max_fd, p->fd);
-        }
-        if ((p->events & G_IO_OUT)) {
-            FD_SET(p->fd, wfds);
-            *max_fd = MAX(*max_fd, p->fd);
-        }
-        if ((p->events & G_IO_ERR)) {
-            FD_SET(p->fd, xfds);
-            *max_fd = MAX(*max_fd, p->fd);
-        }
-    }
-
-    cur_timeout = (tv->tv_sec * 1000) + ((tv->tv_usec + 500) / 1000);
-    if (timeout >= 0 && timeout < cur_timeout) {
-        tv->tv_sec = timeout / 1000;
-        tv->tv_usec = (timeout % 1000) * 1000;
-    }
-}
-
-static void glib_select_poll(fd_set *rfds, fd_set *wfds, fd_set *xfds,
-                             bool err)
-{
-    GMainContext *context = g_main_context_default();
-
-    if (!err) {
-        int i;
-
-        for (i = 0; i < n_poll_fds; i++) {
-            GPollFD *p = &poll_fds[i];
-
-            if ((p->events & G_IO_IN) && FD_ISSET(p->fd, rfds)) {
-                p->revents |= G_IO_IN;
-            }
-            if ((p->events & G_IO_OUT) && FD_ISSET(p->fd, wfds)) {
-                p->revents |= G_IO_OUT;
-            }
-            if ((p->events & G_IO_ERR) && FD_ISSET(p->fd, xfds)) {
-                p->revents |= G_IO_ERR;
-            }
-        }
-    }
-
-    if (g_main_context_check(context, max_priority, poll_fds, n_poll_fds)) {
-        g_main_context_dispatch(context);
-    }
-}
-
-int main_loop_wait(int nonblocking)
-{
-    fd_set rfds, wfds, xfds;
-    int ret, nfds;
-    struct timeval tv;
-    int timeout;
-
-    if (nonblocking)
-        timeout = 0;
-    else {
-        timeout = qemu_calculate_timeout();
-        qemu_bh_update_timeout(&timeout);
-    }
-
-    os_host_main_loop_wait(&timeout);
-
-    tv.tv_sec = timeout / 1000;
-    tv.tv_usec = (timeout % 1000) * 1000;
-
-    /* poll any events */
-    /* XXX: separate device handlers from system ones */
-    nfds = -1;
-    FD_ZERO(&rfds);
-    FD_ZERO(&wfds);
-    FD_ZERO(&xfds);
-
-    qemu_iohandler_fill(&nfds, &rfds, &wfds, &xfds);
-    slirp_select_fill(&nfds, &rfds, &wfds, &xfds);
-    glib_select_fill(&nfds, &rfds, &wfds, &xfds, &tv);
-
-    if (timeout > 0) {
-        qemu_mutex_unlock_iothread();
-    }
-
-    ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv);
-
-    if (timeout > 0) {
-        qemu_mutex_lock_iothread();
-    }
-
-    qemu_iohandler_poll(&rfds, &wfds, &xfds, ret);
-    slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));
-    glib_select_poll(&rfds, &wfds, &xfds, (ret < 0));
-
-    qemu_run_all_timers();
-
-    /* Check bottom-halves last in case any of the earlier events triggered
-       them.  */
-    qemu_bh_poll();
-
-    return ret;
-}
-
 qemu_irq qemu_system_powerdown;

+static bool main_loop_should_exit(void)
+{
+    RunState r;
+    if (qemu_debug_requested()) {
+        vm_stop(RUN_STATE_DEBUG);
+    }
+    if (qemu_shutdown_requested()) {
+        qemu_kill_report();
+        monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
+        if (no_shutdown) {
+            vm_stop(RUN_STATE_SHUTDOWN);
+        } else {
+            return true;
+        }
+    }
+    if (qemu_reset_requested()) {
+        pause_all_vcpus();
+        cpu_synchronize_all_states();
+        qemu_system_reset(VMRESET_REPORT);
+        resume_all_vcpus();
+        if (runstate_check(RUN_STATE_INTERNAL_ERROR) ||
+            runstate_check(RUN_STATE_SHUTDOWN)) {
+            runstate_set(RUN_STATE_PAUSED);
+        }
+    }
+    if (qemu_powerdown_requested()) {
+        monitor_protocol_event(QEVENT_POWERDOWN, NULL);
+        qemu_irq_raise(qemu_system_powerdown);
+    }
+    if (qemu_vmstop_requested(&r)) {
+        vm_stop(r);
+    }
+    return false;
+}
+
 static void main_loop(void)
 {
    bool nonblocking;
-    int last_io __attribute__ ((unused)) = 0;
+    int last_io = 0;
 #ifdef CONFIG_PROFILER
    int64_t ti;
 #endif
-    RunState r;
-
-    qemu_main_loop_start();
-
-    for (;;) {
+    do {
        nonblocking = !kvm_enabled() && last_io > 0;
 #ifdef CONFIG_PROFILER
        ti = profile_getclock();
@ -1569,38 +1479,7 @@ static void main_loop(void)
 #ifdef CONFIG_PROFILER
        dev_time += profile_getclock() - ti;
 #endif
-
-        if (qemu_debug_requested()) {
-            vm_stop(RUN_STATE_DEBUG);
-        }
-        if (qemu_shutdown_requested()) {
-            qemu_kill_report();
-            monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
-            if (no_shutdown) {
-                vm_stop(RUN_STATE_SHUTDOWN);
-            } else
-                break;
-        }
-        if (qemu_reset_requested()) {
-            pause_all_vcpus();
-            cpu_synchronize_all_states();
-            qemu_system_reset(VMRESET_REPORT);
-            resume_all_vcpus();
-            if (runstate_check(RUN_STATE_INTERNAL_ERROR) ||
-                runstate_check(RUN_STATE_SHUTDOWN)) {
-                runstate_set(RUN_STATE_PAUSED);
-            }
-        }
-        if (qemu_powerdown_requested()) {
-            monitor_protocol_event(QEVENT_POWERDOWN, NULL);
-            qemu_irq_raise(qemu_system_powerdown);
-        }
-        if (qemu_vmstop_requested(&r)) {
-            vm_stop(r);
-        }
-    }
-    bdrv_close_all();
-    pause_all_vcpus();
+    } while (!main_loop_should_exit());
 }

 static void version(void)
@ -2311,6 +2190,7 @@ int main(int argc, char **argv, char **envp)
    runstate_init();

    init_clocks();
+    rtc_clock = host_clock;

    qemu_cache_utils_init(envp);

@ -3298,6 +3178,7 @@ int main(int argc, char **argv, char **envp)

    configure_accelerator();

+    qemu_init_cpu_loop();
    if (qemu_init_main_loop()) {
        fprintf(stderr, "qemu_init_main_loop failed\n");
        exit(1);
@ -3564,8 +3445,10 @@ int main(int argc, char **argv, char **envp)

    os_setup_post();

+    resume_all_vcpus();
    main_loop();
-    quit_timers();
+    bdrv_close_all();
+    pause_all_vcpus();
    net_cleanup();
    res_free();