linux/drivers/lguest/core.c

374 lines
10 KiB
C
Raw Normal View History

/*P:400
* This contains run_guest() which actually calls into the Host<->Guest
* Switcher and analyzes the return, such as determining if the Guest wants the
* Host to do something. This file also contains useful helper routines.
:*/
#include <linux/module.h>
#include <linux/stringify.h>
#include <linux/stddef.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <asm/paravirt.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/poll.h>
#include <asm/asm-offsets.h>
#include "lg.h"
unsigned long switcher_addr;
struct page **lg_switcher_pages;
static struct vm_struct *switcher_vma;
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
/*H:010
* We need to set up the Switcher at a high virtual address. Remember the
* Switcher is a few hundred bytes of assembler code which actually changes the
* CPU to run the Guest, and then changes back to the Host when a trap or
* interrupt happens.
*
* The Switcher code must be at the same virtual address in the Guest as the
* Host since it will be running as the switchover occurs.
*
* Trying to map memory at a particular address is an unusual thing to do, so
* it's not a simple one-liner.
*/
static __init int map_switcher(void)
{
int i, err;
struct page **pagep;
/*
* Map the Switcher in to high memory.
*
* It turns out that if we choose the address 0xFFC00000 (4MB under the
* top virtual address), it makes setting up the page tables really
* easy.
*/
/* We assume Switcher text fits into a single page. */
if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
end_switcher_text - start_switcher_text);
return -EINVAL;
}
/*
* We allocate an array of struct page pointers. map_vm_area() wants
* this, rather than just an array of pages.
*/
lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
* TOTAL_SWITCHER_PAGES,
GFP_KERNEL);
if (!lg_switcher_pages) {
err = -ENOMEM;
goto out;
}
/*
* Now we actually allocate the pages. The Guest will see these pages,
* so we make sure they're zeroed.
*/
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (!lg_switcher_pages[i]) {
err = -ENOMEM;
goto free_some_pages;
}
}
/*
* We place the Switcher underneath the fixmap area, which is the
* highest virtual address we can get. This is important, since we
* tell the Guest it can't access this memory, so we want its ceiling
* as high as possible.
*/
switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
/*
* Now we reserve the "virtual memory area" we want. We might
* not get it in theory, but in practice it's worked so far.
* The end address needs +1 because __get_vm_area allocates an
* extra guard page, so we need space for that.
*/
switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
VM_ALLOC, switcher_addr, switcher_addr
+ (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
if (!switcher_vma) {
err = -ENOMEM;
printk("lguest: could not map switcher pages high\n");
goto free_pages;
}
/*
* This code actually sets up the pages we've allocated to appear at
* switcher_addr. map_vm_area() takes the vma we allocated above, the
* kind of pages we're mapping (kernel pages), and a pointer to our
* array of struct pages. It increments that pointer, but we don't
* care.
*/
pagep = lg_switcher_pages;
err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
if (err) {
printk("lguest: map_vm_area failed: %i\n", err);
goto free_vma;
}
/*
* Now the Switcher is mapped at the right address, we can't fail!
* Copy in the compiled-in Switcher code (from x86/switcher_32.S).
*/
memcpy(switcher_vma->addr, start_switcher_text,
end_switcher_text - start_switcher_text);
printk(KERN_INFO "lguest: mapped switcher at %p\n",
switcher_vma->addr);
/* And we succeeded... */
return 0;
free_vma:
vunmap(switcher_vma->addr);
free_pages:
i = TOTAL_SWITCHER_PAGES;
free_some_pages:
for (--i; i >= 0; i--)
__free_pages(lg_switcher_pages[i], 0);
kfree(lg_switcher_pages);
out:
return err;
}
/*:*/
/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
static void unmap_switcher(void)
{
unsigned int i;
/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
vunmap(switcher_vma->addr);
/* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
__free_pages(lg_switcher_pages[i], 0);
kfree(lg_switcher_pages);
}
/*H:032
* Dealing With Guest Memory.
*
* Before we go too much further into the Host, we need to grok the routines
* we use to deal with Guest memory.
*
* When the Guest gives us (what it thinks is) a physical address, we can use
* the normal copy_from_user() & copy_to_user() on the corresponding place in
* the memory region allocated by the Launcher.
*
* But we can't trust the Guest: it might be trying to access the Launcher
* code. We have to check that the range is below the pfn_limit the Launcher
* gave us. We have to make sure that addr + len doesn't give us a false
* positive by overflowing, too.
*/
bool lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len)
{
return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
}
/*
* This routine copies memory from the Guest. Here we can see how useful the
* kill_lguest() routine we met in the Launcher can be: we return a random
* value (all zeroes) instead of needing to return an error.
*/
void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
{
if (!lguest_address_ok(cpu->lg, addr, bytes)
|| copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
/* copy_from_user should do this, but as we rely on it... */
memset(b, 0, bytes);
kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
}
}
/* This is the write (copy into Guest) version. */
void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
unsigned bytes)
{
if (!lguest_address_ok(cpu->lg, addr, bytes)
|| copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
}
/*:*/
/*H:030
* Let's jump straight to the the main loop which runs the Guest.
* Remember, this is called by the Launcher reading /dev/lguest, and we keep
* going around and around until something interesting happens.
*/
int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
{
/* We stop running once the Guest is dead. */
while (!cpu->lg->dead) {
unsigned int irq;
bool more;
/* First we run any hypercalls the Guest wants done. */
if (cpu->hcall)
do_hypercalls(cpu);
/*
* It's possible the Guest did a NOTIFY hypercall to the
* Launcher.
*/
if (cpu->pending_notify) {
/*
* Does it just needs to write to a registered
* eventfd (ie. the appropriate virtqueue thread)?
*/
if (!send_notify_to_eventfd(cpu)) {
/* OK, we tell the main Launcher. */
if (put_user(cpu->pending_notify, user))
return -EFAULT;
return sizeof(cpu->pending_notify);
}
}
/*
* All long-lived kernel loops need to check with this horrible
* thing called the freezer. If the Host is trying to suspend,
* it stops us.
*/
try_to_freeze();
/* Check for signals */
if (signal_pending(current))
return -ERESTARTSYS;
/*
* Check if there are any interrupts which can be delivered now:
* if so, this sets up the hander to be executed when we next
* run the Guest.
*/
irq = interrupt_pending(cpu, &more);
if (irq < LGUEST_IRQS)
try_deliver_interrupt(cpu, irq, more);
/*
* Just make absolutely sure the Guest is still alive. One of
* those hypercalls could have been fatal, for example.
*/
if (cpu->lg->dead)
break;
/*
* If the Guest asked to be stopped, we sleep. The Guest's
* clock timer will wake us.
*/
if (cpu->halted) {
set_current_state(TASK_INTERRUPTIBLE);
/*
* Just before we sleep, make sure no interrupt snuck in
* which we should be doing.
*/
if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
set_current_state(TASK_RUNNING);
else
schedule();
continue;
}
/*
* OK, now we're ready to jump into the Guest. First we put up
* the "Do Not Disturb" sign:
*/
local_irq_disable();
/* Actually run the Guest until something happens. */
lguest_arch_run_guest(cpu);
/* Now we're ready to be interrupted or moved to other CPUs */
local_irq_enable();
/* Now we deal with whatever happened to the Guest. */
lguest_arch_handle_trap(cpu);
}
/* Special case: Guest is 'dead' but wants a reboot. */
if (cpu->lg->dead == ERR_PTR(-ERESTART))
return -ERESTART;
/* The Guest is dead => "No such file or directory" */
return -ENOENT;
}
/*H:000
* Welcome to the Host!
*
* By this point your brain has been tickled by the Guest code and numbed by
* the Launcher code; prepare for it to be stretched by the Host code. This is
* the heart. Let's begin at the initialization routine for the Host's lg
* module.
*/
static int __init init(void)
{
int err;
/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
if (get_kernel_rpl() != 0) {
printk("lguest is afraid of being a guest\n");
return -EPERM;
}
/* First we put the Switcher up in very high virtual memory. */
err = map_switcher();
if (err)
goto out;
/* We might need to reserve an interrupt vector. */
err = init_interrupts();
if (err)
goto unmap;
/* /dev/lguest needs to be registered. */
err = lguest_device_init();
if (err)
goto free_interrupts;
/* Finally we do some architecture-specific setup. */
lguest_arch_host_init();
/* All good! */
return 0;
free_interrupts:
free_interrupts();
unmap:
unmap_switcher();
out:
return err;
}
/* Cleaning up is just the same code, backwards. With a little French. */
static void __exit fini(void)
{
lguest_device_remove();
free_interrupts();
unmap_switcher();
lguest_arch_host_fini();
}
/*:*/
/*
* The Host side of lguest can be a module. This is a nice way for people to
* play with it.
*/
module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");