Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (45 commits)
      Use "struct boot_params" in example launcher
      Loading bzImage directly.
      Revert lguest magic and use hook in head.S
      Update lguest documentation to reflect the new virtual block device name.
      generalize lgread_u32/lgwrite_u32.
      Example launcher handle guests not being ready for input
      Update example launcher for virtio
      Lguest support for Virtio
      Remove old lguest I/O infrastructure.
      Remove old lguest bus and drivers.
      Virtio helper routines for a descriptor ringbuffer implementation
      Module autoprobing support for virtio drivers.
      Virtio console driver
      Block driver using virtio.
      Net driver using virtio
      Virtio interface
      Boot with virtual == physical to get closer to native Linux.
      Allow guest to specify syscall vector to use.
      Rename "cr3" to "gpgdir" to avoid x86-specific naming.
      Pagetables to use normal kernel types
      ...
commit 0d6810091c
@@ -1,28 +1,8 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.

-# For those people that have a separate object dir, look there for .config
-KBUILD_OUTPUT := ../..
-ifdef O
-  ifeq ("$(origin O)", "command line")
-    KBUILD_OUTPUT := $(O)
-  endif
-endif
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-include $(KBUILD_OUTPUT)/.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
-
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
 LDLIBS:=-lz
-# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
-# not others (eg. FC7).
-LDFLAGS+=-static
-all: lguest.lds lguest
-
-# The linker script on x86 is so complex the only way of creating one
-# which will link our binary in the right place is to mangle the
-# default one.
-lguest.lds:
-	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
+
+all: lguest

 clean:
-	rm -f lguest.lds lguest
+	rm -f lguest
(One file diff — the rewritten example launcher, Documentation/lguest/lguest.c — is suppressed because it is too large.)
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for
 Linux developers and users to experiment with virtualization with the
 minimum of complexity.  Nonetheless, it should have sufficient
 features to make it useful for specific tasks, and, of course, you are
-encouraged to fork and enhance it.
+encouraged to fork and enhance it (see drivers/lguest/README).

 Features:

@@ -23,19 +23,30 @@ Developer features:

 Running Lguest:

-- Lguest runs the same kernel as guest and host.  You can configure
-  them differently, but usually it's easiest not to.
+- The easiest way to run lguest is to use same kernel as guest and host.
+  You can configure them differently, but usually it's easiest not to.

   You will need to configure your kernel with the following options:

-  CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
-  CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
-  CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
-  CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
-  CONFIG_LGUEST=y/m ("Linux hypervisor example code")
+  "General setup":
+     "Prompt for development and/or incomplete code/drivers" = Y
+        (CONFIG_EXPERIMENTAL=y)

-  and I recommend:
-  CONFIG_HZ=100 ("Timer frequency")[2]
+  "Processor type and features":
+     "Paravirtualized guest support" = Y
+        "Lguest guest support" = Y
+     "High Memory Support" = off/4GB
+     "Alignment value to which kernel should be aligned" = 0x100000
+        (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
+         CONFIG_PHYSICAL_ALIGN=0x100000)
+
+  "Device Drivers":
+     "Network device support"
+        "Universal TUN/TAP device driver support" = M/Y
+           (CONFIG_TUN=m)
+     "Virtualization"
+        "Linux hypervisor example code" = M/Y
+           (CONFIG_LGUEST=m)

 - A tool called "lguest" is available in this directory: type "make"
   to build it.  If you didn't build your kernel in-tree, use "make
@@ -51,14 +62,17 @@ Running Lguest:
   dd if=/dev/zero of=rootfile bs=1M count=2048
   qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d

+  Make sure that you install a getty on /dev/hvc0 if you want to log in on the
+  console!
+
 - "modprobe lg" if you built it as a module.

 - Run an lguest as root:

-      Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
+      Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda

    Explanation:
-    64m: the amount of memory to use.
+    64: the amount of memory to use, in MB.

     vmlinux: the kernel image found in the top of your build directory.  You
        can also use a standard bzImage.
@@ -66,10 +80,10 @@ Running Lguest:
     --tunnet=192.168.19.1: configures a "tap" device for networking with this
        IP address.

-    --block=rootfile: a file or block device which becomes /dev/lgba
+    --block=rootfile: a file or block device which becomes /dev/vda
        inside the guest.

-    root=/dev/lgba: this (and anything else on the command line) are
+    root=/dev/vda: this (and anything else on the command line) are
        kernel boot parameters.

 - Configuring networking.  I usually have the host masquerade, using
@@ -99,31 +113,7 @@ Running Lguest:
   "--sharenet=<filename>": any two guests using the same file are on
   the same network.  This file is created if it does not exist.

-Lguest I/O model:
+There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest

-Lguest uses a simplified DMA model plus shared memory for I/O.  Guests
-can communicate with each other if they share underlying memory
-(usually by the lguest program mmaping the same file), but they can
-use any non-shared memory to communicate with the lguest process.
-
-Guests can register DMA buffers at any key (must be a valid physical
-address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
-hypercall.  "dmabufs" is the physical address of an array of "num"
-"struct lguest_dma": each contains a used_len, and an array of
-physical addresses and lengths.  When a transfer occurs, the
-"used_len" field of one of the buffers which has used_len 0 will be
-set to the length transferred and the irq will fire.
-
-Using an irq value of 0 unbinds the dma buffers.
-
-To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
-and the bytes used is written to the used_len field.  This can be 0 if
-no one else has bound a DMA buffer to that key or some other error.
-DMA buffers bound by the same guest are ignored.
-
-Cheers!
+Good luck!
 Rusty Russell rusty@rustcorp.com.au.

 [1] These are on various places on the TODO list, waiting for you to
     get annoyed enough at the limitation to fix it.
 [2] Lguest is not yet tickless when idle.  See [1].
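(In practice, the "getty on /dev/hvc0" advice above comes down to one line in the guest's /etc/inittab. A minimal sketch, assuming a classic sysvinit guest with agetty installed — the "h0" id, runlevels and baud rate here are illustrative, not part of the patch:

    h0:2345:respawn:/sbin/agetty 38400 hvc0 linux
)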
@@ -227,28 +227,40 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
 	  If in doubt, say "Y".

 config PARAVIRT
-	bool "Paravirtualization support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool
+	depends on !(X86_VISWS || X86_VOYAGER)
 	help
-	  Paravirtualization is a way of running multiple instances of
-	  Linux on the same machine, under a hypervisor.  This option
-	  changes the kernel so it can modify itself when it is run
-	  under a hypervisor, improving performance significantly.
-	  However, when run without a hypervisor the kernel is
-	  theoretically slower.  If in doubt, say N.
+	  This changes the kernel so it can modify itself when it is run
+	  under a hypervisor, potentially improving performance significantly
+	  over full virtualization.  However, when run without a hypervisor
+	  the kernel is theoretically slower and slightly larger.
+
+menuconfig PARAVIRT_GUEST
+	bool "Paravirtualized guest support"
+	help
+	  Say Y here to get to see options related to running Linux under
+	  various hypervisors.  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and disabled.
+
+if PARAVIRT_GUEST
+
+source "arch/x86/xen/Kconfig"

 config VMI
-	bool "VMI Paravirt-ops support"
-	depends on PARAVIRT
+	bool "VMI Guest support"
+	select PARAVIRT
+	depends on !(X86_VISWS || X86_VOYAGER)
 	help
 	  VMI provides a paravirtualized interface to the VMware ESX server
 	  (it could be used by other hypervisors in theory too, but is not
 	  at the moment), by linking the kernel to a GPL-ed ROM module
 	  provided by the hypervisor.

+source "arch/x86/lguest/Kconfig"
+
+endif
+
 config ACPI_SRAT
 	bool
 	default y
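(Once these options land, an lguest guest kernel's .config carries a fragment along these lines — a sketch assembled from the symbols introduced in this merge, not itself part of the patch:

    CONFIG_EXPERIMENTAL=y
    CONFIG_PARAVIRT_GUEST=y
    CONFIG_PARAVIRT=y            # selected by CONFIG_LGUEST_GUEST
    CONFIG_LGUEST_GUEST=y
    CONFIG_VIRTIO=y              # selected by CONFIG_LGUEST_GUEST
    CONFIG_VIRTIO_RING=y
    CONFIG_VIRTIO_CONSOLE=y
    CONFIG_VIRTIO_BLK=m          # the virtio block driver added below
    # CONFIG_HIGHMEM64G is not set
)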
@@ -99,6 +99,9 @@ core-$(CONFIG_X86_ES7000)	:= arch/x86/mach-es7000/
 # Xen paravirtualization support
 core-$(CONFIG_XEN) += arch/x86/xen/

+# lguest paravirtualization support
+core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
+
 # default subarch .h files
 mflags-y += -Iinclude/asm-x86/mach-default
|
@ -136,6 +136,7 @@ void foo(void)
|
||||
#ifdef CONFIG_LGUEST_GUEST
|
||||
BLANK();
|
||||
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
|
||||
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
|
||||
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
|
||||
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
|
||||
OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
|
||||
|
14
arch/x86/lguest/Kconfig
Normal file
14
arch/x86/lguest/Kconfig
Normal file
@ -0,0 +1,14 @@
|
||||
config LGUEST_GUEST
|
||||
bool "Lguest guest support"
|
||||
select PARAVIRT
|
||||
depends on !X86_PAE
|
||||
select VIRTIO
|
||||
select VIRTIO_RING
|
||||
select VIRTIO_CONSOLE
|
||||
help
|
||||
Lguest is a tiny in-kernel hypervisor. Selecting this will
|
||||
allow your kernel to boot under lguest. This option will increase
|
||||
your kernel size by about 6k. If in doubt, say N.
|
||||
|
||||
If you say Y here, make sure you say Y (or M) to the virtio block
|
||||
and net drivers which lguest needs.
|
arch/x86/lguest/Makefile (new file, 1 line):

obj-y		:= i386_head.o boot.o
@@ -55,7 +55,7 @@
 #include <linux/clockchips.h>
 #include <linux/lguest.h>
 #include <linux/lguest_launcher.h>
-#include <linux/lguest_bus.h>
+#include <linux/virtio_console.h>
 #include <asm/paravirt.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -65,6 +65,7 @@
 #include <asm/e820.h>
 #include <asm/mce.h>
 #include <asm/io.h>
+#include <asm/i387.h>

 /*G:010 Welcome to the Guest!
  *
@@ -85,9 +86,10 @@ struct lguest_data lguest_data = {
 	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
 	.noirq_start = (u32)lguest_noirq_start,
 	.noirq_end = (u32)lguest_noirq_end,
+	.kernel_address = PAGE_OFFSET,
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
+	.syscall_vec = SYSCALL_VECTOR,
 };
-struct lguest_device_desc *lguest_devices;
 static cycle_t clock_base;

 /*G:035 Notice the lazy_hcall() above, rather than hcall().  This is our first
@@ -146,10 +148,10 @@ void async_hcall(unsigned long call,
 		/* Table full, so do normal hcall which will flush table. */
 		hcall(call, arg1, arg2, arg3);
 	} else {
-		lguest_data.hcalls[next_call].eax = call;
-		lguest_data.hcalls[next_call].edx = arg1;
-		lguest_data.hcalls[next_call].ebx = arg2;
-		lguest_data.hcalls[next_call].ecx = arg3;
+		lguest_data.hcalls[next_call].arg0 = call;
+		lguest_data.hcalls[next_call].arg1 = arg1;
+		lguest_data.hcalls[next_call].arg2 = arg2;
+		lguest_data.hcalls[next_call].arg3 = arg3;
 		/* Arguments must all be written before we mark it to go */
 		wmb();
 		lguest_data.hcall_status[next_call] = 0;
@@ -160,46 +162,6 @@ void async_hcall(unsigned long call,
 }
 /*:*/

-/* Wrappers for the SEND_DMA and BIND_DMA hypercalls.  This is mainly because
- * Jeff Garzik complained that __pa() should never appear in drivers, and this
- * helps remove most of them.  But also, it wraps some ugliness. */
-void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
-{
-	/* The hcall might not write this if something goes wrong */
-	dma->used_len = 0;
-	hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);
-}
-
-int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
-		    unsigned int num, u8 irq)
-{
-	/* This is the only hypercall which actually wants 5 arguments, and we
-	 * only support 4.  Fortunately the interrupt number is always less
-	 * than 256, so we can pack it with the number of dmas in the final
-	 * argument. */
-	if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))
-		return -ENOMEM;
-	return 0;
-}
-
-/* Unbinding is the same hypercall as binding, but with 0 num & irq. */
-void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)
-{
-	hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);
-}
-
-/* For guests, device memory can be used as normal memory, so we cast away the
- * __iomem to quieten sparse. */
-void *lguest_map(unsigned long phys_addr, unsigned long pages)
-{
-	return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
-}
-
-void lguest_unmap(void *addr)
-{
-	iounmap((__force void __iomem *)addr);
-}
-
 /*G:033
  * Here are our first native-instruction replacements: four functions for
  * interrupt control.
@@ -680,6 +642,7 @@ static struct clocksource lguest_clock = {
 	.mask		= CLOCKSOURCE_MASK(64),
 	.mult		= 1 << 22,
 	.shift		= 22,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };

 /* The "scheduler clock" is just our real clock, adjusted to start at zero */
@@ -761,11 +724,9 @@ static void lguest_time_init(void)
 	 * the TSC, otherwise it's a dumb nanosecond-resolution clock.  Either
 	 * way, the "rating" is initialized so high that it's always chosen
 	 * over any other clocksource. */
-	if (lguest_data.tsc_khz) {
+	if (lguest_data.tsc_khz)
 		lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
 							 lguest_clock.shift);
-		lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
-	}
 	clock_base = lguest_clock_read();
 	clocksource_register(&lguest_clock);

@@ -889,6 +850,23 @@ static __init char *lguest_memory_setup(void)
 	return "LGUEST";
 }

+/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to
+ * produce console output. */
+static __init int early_put_chars(u32 vtermno, const char *buf, int count)
+{
+	char scratch[17];
+	unsigned int len = count;
+
+	if (len > sizeof(scratch) - 1)
+		len = sizeof(scratch) - 1;
+	scratch[len] = '\0';
+	memcpy(scratch, buf, len);
+	hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0);
+
+	/* This routine returns the number of bytes actually written. */
+	return len;
+}
+
 /*G:050
  * Patching (Powerfully Placating Performance Pedants)
  *
@@ -950,18 +928,8 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
 /*G:030 Once we get to lguest_init(), we know we're a Guest.  The pv_ops
  * structures in the kernel provide points for (almost) every routine we have
  * to override to avoid privileged instructions. */
-__init void lguest_init(void *boot)
+__init void lguest_init(void)
 {
-	/* Copy boot parameters first: the Launcher put the physical location
-	 * in %esi, and head.S converted that to a virtual address and handed
-	 * it to us.  We use "__memcpy" because "memcpy" sometimes tries to do
-	 * tricky things to go faster, and we're not ready for that. */
-	__memcpy(&boot_params, boot, PARAM_SIZE);
-	/* The boot parameters also tell us where the command-line is: save
-	 * that, too. */
-	__memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
-		 COMMAND_LINE_SIZE);
-
 	/* We're under lguest, paravirt is enabled, and we're running at
 	 * privilege level 1, not 0 as normal. */
 	pv_info.name = "lguest";
@@ -1033,11 +1001,7 @@ __init void lguest_init(void *boot)

 	/*G:070 Now we've seen all the paravirt_ops, we return to
 	 * lguest_init() where the rest of the fairly chaotic boot setup
-	 * occurs.
-	 *
-	 * The Host expects our first hypercall to tell it where our "struct
-	 * lguest_data" is, so we do that first. */
-	hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+	 * occurs. */

 	/* The native boot code sets up initial page tables immediately after
 	 * the kernel itself, and sets init_pg_tables_end so they're not
@@ -1050,11 +1014,6 @@ __init void lguest_init(void *boot)
 	 * the normal data segment to get through booting. */
 	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");

-	/* Clear the part of the kernel data which is expected to be zero.
-	 * Normally it will be anyway, but if we're loading from a bzImage with
-	 * CONFIG_RELOCATABLE=y, the relocations will be sitting here. */
-	memset(__bss_start, 0, __bss_stop - __bss_start);
-
 	/* The Host uses the top of the Guest's virtual address space for the
 	 * Host<->Guest Switcher, and it tells us how much it needs in
 	 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
@@ -1092,6 +1051,9 @@ __init void lguest_init(void *boot)
 	 * adapted for lguest's use. */
 	add_preferred_console("hvc", 0, NULL);

+	/* Register our very early console. */
+	virtio_cons_early_init(early_put_chars);
+
 	/* Last of all, we set the power management poweroff hook to point to
 	 * the Guest routine to power off. */
 	pm_power_off = lguest_power_off;
@@ -1,25 +1,47 @@
 #include <linux/linkage.h>
 #include <linux/lguest.h>
 #include <asm/lguest_hcall.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/processor-flags.h>

-/*G:020 This is where we begin: we have a magic signature which the launcher
- * looks for.  The plan is that the Linux boot protocol will be extended with a
- * "platform type" field which will guide us here from the normal entry point,
- * but for the moment this suffices.  The normal boot code uses %esi for the
- * boot header, so we do too.  We convert it to a virtual address by adding
- * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax).
+/*G:020 This is where we begin: head.S notes that the boot header's platform
+ * type field is "1" (lguest), so calls us here.  The boot header is in %esi.
+ *
+ * WARNING: be very careful here!  We're running at addresses equal to physical
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
+ * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
+ * data.
  *
  * The .section line puts this code in .init.text so it will be discarded after
  * boot. */
 .section .init.text, "ax", @progbits
-.ascii "GenuineLguest"
-	/* Set up initial stack. */
-	movl $(init_thread_union+THREAD_SIZE),%esp
-	movl %esi, %eax
-	addl $__PAGE_OFFSET, %eax
-	jmp lguest_init
+ENTRY(lguest_entry)
+	/* Make initial hypercall now, so we can set up the pagetables. */
+	movl $LHCALL_LGUEST_INIT, %eax
+	movl $lguest_data - __PAGE_OFFSET, %edx
+	int $LGUEST_TRAP_ENTRY
+
+	/* The Host put the toplevel pagetable in lguest_data.pgdir.  The movsl
+	 * instruction uses %esi implicitly. */
+	movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
+
+	/* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
+	 * This means the first 128M of kernel memory will be mapped at
+	 * PAGE_OFFSET where the kernel expects to run.  This will get it far
+	 * enough through boot to switch to its own pagetables. */
+	movl $32, %ecx
+	movl %esi, %edi
+	addl $((__PAGE_OFFSET >> 22) * 4), %edi
+	rep
+	movsl
+
+	/* Set up the initial stack so we can run C code. */
+	movl $(init_thread_union+THREAD_SIZE),%esp
+
+	/* Jumps are relative, and we're running __PAGE_OFFSET too low at the
+	 * moment. */
+	jmp lguest_init+__PAGE_OFFSET

 /*G:055 We create a macro which puts the assembler code between lgstart_ and
  * lgend_ markers.  These templates are put in the .text section: they can't be
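(The hypercall ABI used by this entry code is visible in the code itself: %eax carries the call number, %edx the first argument, and int $LGUEST_TRAP_ENTRY traps to the Host; the old eax/edx/ebx/ecx field names in async_hcall() above — replaced by arg0..arg3 in this merge — confirm the remaining argument registers. A sketch of what the synchronous hcall() wrapper used throughout boot.c looks like under those assumptions; illustrative, not a quote of the real lguest header:

    #include <linux/stringify.h>

    static inline unsigned long hcall(unsigned long call, unsigned long arg1,
                                      unsigned long arg2, unsigned long arg3)
    {
            /* Trap to the Host: %eax is the call number and the return
             * value; %edx, %ebx and %ecx carry the three arguments. */
            asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
                         : "=a"(call)
                         : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
                         : "memory");
            return call;
    }
)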
@@ -3,8 +3,9 @@
 #

 config XEN
-	bool "Enable support for Xen hypervisor"
-	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
+	bool "Xen guest support"
+	select PARAVIRT
+	depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
 	help
 	  This is the Linux Xen port.  Enabling this will allow the
 	  kernel to boot in a paravirtualized environment under the
@@ -94,5 +94,5 @@ source "drivers/kvm/Kconfig"

 source "drivers/uio/Kconfig"

-source "drivers/lguest/Kconfig"
+source "drivers/virtio/Kconfig"
 endmenu
@@ -91,3 +91,4 @@ obj-$(CONFIG_HID)		+= hid/
 obj-$(CONFIG_PPC_PS3)		+= ps3/
 obj-$(CONFIG_OF)		+= of/
 obj-$(CONFIG_SSB)		+= ssb/
+obj-$(CONFIG_VIRTIO)		+= virtio/
@@ -425,4 +425,10 @@ config XEN_BLKDEV_FRONTEND
 	  block device driver.  It communicates with a back-end driver
 	  in another domain which drives the actual block device.

+config VIRTIO_BLK
+	tristate "Virtio block driver (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && VIRTIO
+	---help---
+	  This is the virtual block driver for lguest.  Say Y or M.
+
 endif # BLK_DEV
@@ -25,10 +25,10 @@ obj-$(CONFIG_SUNVDC)		+= sunvdc.o
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
 obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
+obj-$(CONFIG_VIRTIO_BLK)	+= virtio_blk.o

 obj-$(CONFIG_VIODASD)		+= viodasd.o
 obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
 obj-$(CONFIG_BLK_DEV_UB)	+= ub.o

 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
-obj-$(CONFIG_LGUEST_BLOCK)	+= lguest_blk.o
drivers/block/lguest_blk.c (deleted file, 421 lines):

/*D:400
 * The Guest block driver
 *
 * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
 * The mechanism is simple: we place the information about the request in the
 * device page, then use SEND_DMA (containing the data for a write, or an empty
 * "ping" DMA for a read).
 :*/
/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
//#define DEBUG
#include <linux/init.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/lguest_bus.h>

static char next_block_index = 'a';

/*D:420 Here is the structure which holds all the information we need about
 * each Guest block device.
 *
 * I'm sure at this stage, you're wondering "hey, where was the adventure I was
 * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
 * my blog".  I think Real adventures have boring bits, too, and you're in the
 * middle of one.  But it gets better.  Just not quite yet. */
struct blockdev
{
	/* The block queue infrastructure wants a spinlock: it is held while it
	 * calls our block request function.  We grab it in our interrupt
	 * handler so the responses don't mess with new requests. */
	spinlock_t lock;

	/* The disk structure registered with kernel. */
	struct gendisk *disk;

	/* The major device number for this disk, and the interrupt.  We only
	 * really keep them here for completeness; we'd need them if we
	 * supported device unplugging. */
	int major;
	int irq;

	/* The physical address of this device's memory page */
	unsigned long phys_addr;
	/* The mapped memory page for convenient access. */
	struct lguest_block_page *lb_page;

	/* We only have a single request outstanding at a time: this is it. */
	struct lguest_dma dma;
	struct request *req;
};

/*D:495 We originally used end_request() throughout the driver, but it turns
 * out that end_request() is deprecated, and doesn't actually end the request
 * (which seems like a good reason to deprecate it!).  It simply ends the first
 * bio.  So if we had 3 bios in a "struct request" we would do all 3,
 * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
 * work as we needed to do.
 *
 * This reinforced to me that I do not understand the block layer.
 *
 * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
 * request.  This improved disk speed by 130%. */
static void end_entire_request(struct request *req, int uptodate)
{
	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
		BUG();
	add_disk_randomness(req->rq_disk);
	blkdev_dequeue_request(req);
	end_that_request_last(req, uptodate);
}

/* I'm told there are only two stories in the world worth telling: love and
 * hate.  So there used to be a love scene here like this:
 *
 *  Launcher: We could make beautiful I/O together, you and I.
 *  Guest: My, that's a big disk!
 *
 * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */

/*D:490 This is the interrupt handler, called when a block read or write has
 * been completed for us. */
static irqreturn_t lgb_irq(int irq, void *_bd)
{
	/* We handed our "struct blockdev" as the argument to request_irq(), so
	 * it is passed through to us here.  This tells us which device we're
	 * dealing with in case we have more than one. */
	struct blockdev *bd = _bd;
	unsigned long flags;

	/* We weren't doing anything?  Strange, but could happen if we shared
	 * interrupts (we don't!). */
	if (!bd->req) {
		pr_debug("No work!\n");
		return IRQ_NONE;
	}

	/* Not done yet?  That's equally strange. */
	if (!bd->lb_page->result) {
		pr_debug("No result!\n");
		return IRQ_NONE;
	}

	/* We have to grab the lock before ending the request. */
	spin_lock_irqsave(&bd->lock, flags);
	/* "result" is 1 for success, 2 for failure: end_entire_request() wants
	 * to know whether this succeeded or not. */
	end_entire_request(bd->req, bd->lb_page->result == 1);
	/* Clear out request, it's done. */
	bd->req = NULL;
	/* Reset incoming DMA for next time. */
	bd->dma.used_len = 0;
	/* Ready for more reads or writes */
	blk_start_queue(bd->disk->queue);
	spin_unlock_irqrestore(&bd->lock, flags);

	/* The interrupt was for us, we dealt with it. */
	return IRQ_HANDLED;
}

/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
 * each of which contains "struct bio_vec"s, each of which contains a page, an
 * offset and a length.
 *
 * Fortunately there are iterators to help us walk through the "struct
 * request".  Even more fortunately, there were plenty of places to steal the
 * code from.  We pack the "struct request" into our "struct lguest_dma" and
 * return the total length. */
static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
{
	unsigned int i = 0, len = 0;
	struct req_iterator iter;
	struct bio_vec *bvec;

	rq_for_each_segment(bvec, req, iter) {
		/* We told the block layer not to give us too many. */
		BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
		/* If we had a zero-length segment, it would look like
		 * the end of the data referred to by the "struct
		 * lguest_dma", so make sure that doesn't happen. */
		BUG_ON(!bvec->bv_len);
		/* Convert page & offset to a physical address */
		dma->addr[i] = page_to_phys(bvec->bv_page)
			+ bvec->bv_offset;
		dma->len[i] = bvec->bv_len;
		len += bvec->bv_len;
		i++;
	}
	/* If the array isn't full, we mark the end with a 0 length */
	if (i < LGUEST_MAX_DMA_SECTIONS)
		dma->len[i] = 0;
	return len;
}

/* This creates an empty DMA, useful for prodding the Host without sending data
 * (ie. when we want to do a read) */
static void empty_dma(struct lguest_dma *dma)
{
	dma->len[0] = 0;
}

/*D:470 Setting up a request is fairly easy: */
static void setup_req(struct blockdev *bd,
		      int type, struct request *req, struct lguest_dma *dma)
{
	/* The type is 1 (write) or 0 (read). */
	bd->lb_page->type = type;
	/* The sector on disk where the read or write starts. */
	bd->lb_page->sector = req->sector;
	/* The result is initialized to 0 (unfinished). */
	bd->lb_page->result = 0;
	/* The current request (so we can end it in the interrupt handler). */
	bd->req = req;
	/* The number of bytes: returned as a side-effect of req_to_dma(),
	 * which packs the block layer's "struct request" into our "struct
	 * lguest_dma" */
	bd->lb_page->bytes = req_to_dma(req, dma);
}

/*D:450 Write is pretty straightforward: we pack the request into a "struct
 * lguest_dma", then use SEND_DMA to send the request. */
static void do_write(struct blockdev *bd, struct request *req)
{
	struct lguest_dma send;

	pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
	setup_req(bd, 1, req, &send);

	lguest_send_dma(bd->phys_addr, &send);
}

/* Read is similar to write, except we pack the request into our receive
 * "struct lguest_dma" and send through an empty DMA just to tell the Host that
 * there's a request pending. */
static void do_read(struct blockdev *bd, struct request *req)
{
	struct lguest_dma ping;

	pr_debug("lgb: READ sector %li\n", (long)req->sector);
	setup_req(bd, 0, req, &bd->dma);

	empty_dma(&ping);
	lguest_send_dma(bd->phys_addr, &ping);
}

/*D:440 This is where requests come in: we get handed the request queue and are
 * expected to pull a "struct request" off it until we've finished them or
 * we're waiting for a reply: */
static void do_lgb_request(struct request_queue *q)
{
	struct blockdev *bd;
	struct request *req;

again:
	/* This sometimes returns NULL even on the very first time around.  I
	 * wonder if it's something to do with letting elves handle the request
	 * queue... */
	req = elv_next_request(q);
	if (!req)
		return;

	/* We attached the struct blockdev to the disk: get it back */
	bd = req->rq_disk->private_data;
	/* Sometimes we get repeated requests after blk_stop_queue(), but we
	 * can only handle one at a time. */
	if (bd->req)
		return;

	/* We only do reads and writes: no tricky business! */
	if (!blk_fs_request(req)) {
		pr_debug("Got non-command 0x%08x\n", req->cmd_type);
		req->errors++;
		end_entire_request(req, 0);
		goto again;
	}

	if (rq_data_dir(req) == WRITE)
		do_write(bd, req);
	else
		do_read(bd, req);

	/* We've put out the request, so stop any more coming in until we get
	 * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
	blk_stop_queue(q);
}

/*D:430 This is the "struct block_device_operations" we attach to the disk at
 * the end of lguestblk_probe().  It doesn't seem to want much. */
static struct block_device_operations lguestblk_fops = {
	.owner = THIS_MODULE,
};

/*D:425 Setting up a disk device seems to involve a lot of code.  I'm not sure
 * quite why.  I do know that the IDE code sent two or three of the maintainers
 * insane, perhaps this is the fringe of the same disease?
 *
 * As in the console code, the probe function gets handed the generic
 * lguest_device from lguest_bus.c: */
static int lguestblk_probe(struct lguest_device *lgdev)
{
	struct blockdev *bd;
	int err;
	int irqflags = IRQF_SHARED;

	/* First we allocate our own "struct blockdev" and initialize the easy
	 * fields. */
	bd = kmalloc(sizeof(*bd), GFP_KERNEL);
	if (!bd)
		return -ENOMEM;

	spin_lock_init(&bd->lock);
	bd->irq = lgdev_irq(lgdev);
	bd->req = NULL;
	bd->dma.used_len = 0;
	bd->dma.len[0] = 0;
	/* The descriptor in the lguest_devices array provided by the Host
	 * gives the Guest the physical page number of the device's page. */
	bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);

	/* We use lguest_map() to get a pointer to the device page */
	bd->lb_page = lguest_map(bd->phys_addr, 1);
	if (!bd->lb_page) {
		err = -ENOMEM;
		goto out_free_bd;
	}

	/* We need a major device number: 0 means "assign one dynamically". */
	bd->major = register_blkdev(0, "lguestblk");
	if (bd->major < 0) {
		err = bd->major;
		goto out_unmap;
	}

	/* This allocates a "struct gendisk" where we pack all the information
	 * about the disk which the rest of Linux sees.  The argument is the
	 * number of minor devices desired: we need one minor for the main
	 * disk, and one for each partition.  Of course, we can't possibly know
	 * how many partitions are on the disk (add_disk does that).
	 */
	bd->disk = alloc_disk(16);
	if (!bd->disk) {
		err = -ENOMEM;
		goto out_unregister_blkdev;
	}

	/* Every disk needs a queue for requests to come in: we set up the
	 * queue with a callback function (the core of our driver) and the lock
	 * to use. */
	bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
	if (!bd->disk->queue) {
		err = -ENOMEM;
		goto out_put_disk;
	}

	/* We can only handle a certain number of pointers in our SEND_DMA
	 * call, so we set that with blk_queue_max_hw_segments().  This is not
	 * to be confused with blk_queue_max_phys_segments() of course!  I
	 * know, who could possibly confuse the two?
	 *
	 * Well, it's simple to tell them apart: this one seems to work and the
	 * other one didn't. */
	blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);

	/* Due to technical limitations of our Host (and simple coding) we
	 * can't have a single buffer which crosses a page boundary.  Tell it
	 * here.  This means that our maximum request size is 16
	 * (LGUEST_MAX_DMA_SECTIONS) pages. */
	blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);

	/* We name our disk: this becomes the device name when udev does its
	 * magic thing and creates the device node, such as /dev/lgba.
	 * next_block_index is a global which starts at 'a'.  Unfortunately
	 * this simple increment logic means that the 27th disk will be called
	 * "/dev/lgb{".  In that case, I recommend having at least 29 disks, so
	 * your /dev directory will be balanced. */
	sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);

	/* We look to the device descriptor again to see if this device's
	 * interrupts are expected to be random.  If they are, we tell the irq
	 * subsystem.  At the moment this bit is always set. */
	if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
		irqflags |= IRQF_SAMPLE_RANDOM;

	/* Now we have the name and irqflags, we can request the interrupt; we
	 * give it the "struct blockdev" we have set up to pass to lgb_irq()
	 * when there is an interrupt. */
	err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
	if (err)
		goto out_cleanup_queue;

	/* We bind our one-entry DMA pool to the key for this block device so
	 * the Host can reply to our requests.  The key is equal to the
	 * physical address of the device's page, which is conveniently
	 * unique. */
	err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
	if (err)
		goto out_free_irq;

	/* We finish our disk initialization and add the disk to the system. */
	bd->disk->major = bd->major;
	bd->disk->first_minor = 0;
	bd->disk->private_data = bd;
	bd->disk->fops = &lguestblk_fops;
	/* This is initialized to the disk size by the Launcher. */
	set_capacity(bd->disk, bd->lb_page->num_sectors);
	add_disk(bd->disk);

	printk(KERN_INFO "%s: device %i at major %d\n",
	       bd->disk->disk_name, lgdev->index, bd->major);

	/* We don't need to keep the "struct blockdev" around, but if we ever
	 * implemented device removal, we'd need this. */
	lgdev->private = bd;
	return 0;

out_free_irq:
	free_irq(bd->irq, bd);
out_cleanup_queue:
	blk_cleanup_queue(bd->disk->queue);
out_put_disk:
	put_disk(bd->disk);
out_unregister_blkdev:
	unregister_blkdev(bd->major, "lguestblk");
out_unmap:
	lguest_unmap(bd->lb_page);
out_free_bd:
	kfree(bd);
	return err;
}

/*D:410 The boilerplate code for registering the lguest block driver is just
 * like the console: */
static struct lguest_driver lguestblk_drv = {
	.name = "lguestblk",
	.owner = THIS_MODULE,
	.device_type = LGUEST_DEVICE_T_BLOCK,
	.probe = lguestblk_probe,
};

static __init int lguestblk_init(void)
{
	return register_lguest_driver(&lguestblk_drv);
}
module_init(lguestblk_init);

MODULE_DESCRIPTION("Lguest block driver");
MODULE_LICENSE("GPL");
drivers/block/virtio_blk.c (new file, 308 lines):

//#define DEBUG
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>

static unsigned char virtblk_index = 'a';
struct virtio_blk
{
	spinlock_t lock;

	struct virtio_device *vdev;
	struct virtqueue *vq;

	/* The disk structure for the kernel. */
	struct gendisk *disk;

	/* Request tracking. */
	struct list_head reqs;

	mempool_t *pool;

	/* Scatterlist: can be too big for stack. */
	struct scatterlist sg[3+MAX_PHYS_SEGMENTS];
};

struct virtblk_req
{
	struct list_head list;
	struct request *req;
	struct virtio_blk_outhdr out_hdr;
	struct virtio_blk_inhdr in_hdr;
};

static bool blk_done(struct virtqueue *vq)
{
	struct virtio_blk *vblk = vq->vdev->priv;
	struct virtblk_req *vbr;
	unsigned int len;
	unsigned long flags;

	spin_lock_irqsave(&vblk->lock, flags);
	while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
		int uptodate;
		switch (vbr->in_hdr.status) {
		case VIRTIO_BLK_S_OK:
			uptodate = 1;
			break;
		case VIRTIO_BLK_S_UNSUPP:
			uptodate = -ENOTTY;
			break;
		default:
			uptodate = 0;
			break;
		}

		end_dequeued_request(vbr->req, uptodate);
		list_del(&vbr->list);
		mempool_free(vbr, vblk->pool);
	}
	/* In case queue is stopped waiting for more buffers. */
	blk_start_queue(vblk->disk->queue);
	spin_unlock_irqrestore(&vblk->lock, flags);
	return true;
}

static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
		   struct request *req)
{
	unsigned long num, out, in;
	struct virtblk_req *vbr;

	vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
	if (!vbr)
		/* When another request finishes we'll try again. */
		return false;

	vbr->req = req;
	if (blk_fs_request(vbr->req)) {
		vbr->out_hdr.type = 0;
		vbr->out_hdr.sector = vbr->req->sector;
		vbr->out_hdr.ioprio = vbr->req->ioprio;
	} else if (blk_pc_request(vbr->req)) {
		vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
		vbr->out_hdr.sector = 0;
		vbr->out_hdr.ioprio = vbr->req->ioprio;
	} else {
		/* We don't put anything else in the queue. */
		BUG();
	}

	if (blk_barrier_rq(vbr->req))
		vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;

	/* We have to zero this, otherwise blk_rq_map_sg gets upset. */
	memset(vblk->sg, 0, sizeof(vblk->sg));
	sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr));
	num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
	sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr));

	if (rq_data_dir(vbr->req) == WRITE) {
		vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
		out = 1 + num;
		in = 1;
	} else {
		vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
		out = 1;
		in = 1 + num;
	}

	if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) {
		mempool_free(vbr, vblk->pool);
		return false;
	}

	list_add_tail(&vbr->list, &vblk->reqs);
	return true;
}

static void do_virtblk_request(struct request_queue *q)
{
	struct virtio_blk *vblk = NULL;
	struct request *req;
	unsigned int issued = 0;

	while ((req = elv_next_request(q)) != NULL) {
		vblk = req->rq_disk->private_data;
		BUG_ON(req->nr_phys_segments > ARRAY_SIZE(vblk->sg));

		/* If this request fails, stop queue and wait for something to
		   finish to restart it. */
		if (!do_req(q, vblk, req)) {
			blk_stop_queue(q);
			break;
		}
		blkdev_dequeue_request(req);
		issued++;
	}

	if (issued)
		vblk->vq->vq_ops->kick(vblk->vq);
}

static int virtblk_ioctl(struct inode *inode, struct file *filp,
			 unsigned cmd, unsigned long data)
{
	return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk->queue,
			      inode->i_bdev->bd_disk, cmd,
			      (void __user *)data);
}

static struct block_device_operations virtblk_fops = {
	.ioctl = virtblk_ioctl,
	.owner = THIS_MODULE,
};

static int virtblk_probe(struct virtio_device *vdev)
{
	struct virtio_blk *vblk;
	int err, major;
	void *token;
	unsigned int len;
	u64 cap;
	u32 v;

	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
	if (!vblk) {
		err = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&vblk->reqs);
	spin_lock_init(&vblk->lock);
	vblk->vdev = vdev;

	/* We expect one virtqueue, for output. */
	vblk->vq = vdev->config->find_vq(vdev, blk_done);
	if (IS_ERR(vblk->vq)) {
		err = PTR_ERR(vblk->vq);
		goto out_free_vblk;
	}

	vblk->pool = mempool_create_kmalloc_pool(1, sizeof(struct virtblk_req));
	if (!vblk->pool) {
		err = -ENOMEM;
		goto out_free_vq;
	}

	major = register_blkdev(0, "virtblk");
	if (major < 0) {
		err = major;
		goto out_mempool;
	}

	/* FIXME: How many partitions?  How long is a piece of string? */
	vblk->disk = alloc_disk(1 << 4);
	if (!vblk->disk) {
		err = -ENOMEM;
		goto out_unregister_blkdev;
	}

	vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
	if (!vblk->disk->queue) {
		err = -ENOMEM;
		goto out_put_disk;
	}

	sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++);
	vblk->disk->major = major;
	vblk->disk->first_minor = 0;
	vblk->disk->private_data = vblk;
	vblk->disk->fops = &virtblk_fops;

	/* If barriers are supported, tell block layer that queue is ordered */
	token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len);
	if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER))
		blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);

	err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap);
	if (err) {
		dev_err(&vdev->dev, "Bad/missing capacity in config\n");
		goto out_put_disk;
	}

	/* If capacity is too big, truncate with warning. */
	if ((sector_t)cap != cap) {
		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
			 (unsigned long long)cap);
		cap = (sector_t)-1;
	}
	set_capacity(vblk->disk, cap);

	err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v);
	if (!err)
		blk_queue_max_segment_size(vblk->disk->queue, v);
	else if (err != -ENOENT) {
		dev_err(&vdev->dev, "Bad SIZE_MAX in config\n");
		goto out_put_disk;
	}

	err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v);
	if (!err)
		blk_queue_max_hw_segments(vblk->disk->queue, v);
	else if (err != -ENOENT) {
		dev_err(&vdev->dev, "Bad SEG_MAX in config\n");
		goto out_put_disk;
	}

	add_disk(vblk->disk);
	return 0;

out_put_disk:
	put_disk(vblk->disk);
out_unregister_blkdev:
	unregister_blkdev(major, "virtblk");
out_mempool:
	mempool_destroy(vblk->pool);
out_free_vq:
	vdev->config->del_vq(vblk->vq);
out_free_vblk:
	kfree(vblk);
out:
	return err;
}

static void virtblk_remove(struct virtio_device *vdev)
{
	struct virtio_blk *vblk = vdev->priv;
	int major = vblk->disk->major;

	BUG_ON(!list_empty(&vblk->reqs));
	blk_cleanup_queue(vblk->disk->queue);
	put_disk(vblk->disk);
	unregister_blkdev(major, "virtblk");
	mempool_destroy(vblk->pool);
	kfree(vblk);
}

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static struct virtio_driver virtio_blk = {
	.driver.name = KBUILD_MODNAME,
	.driver.owner = THIS_MODULE,
	.id_table = id_table,
	.probe = virtblk_probe,
	.remove = __devexit_p(virtblk_remove),
};

static int __init init(void)
{
	return register_virtio_driver(&virtio_blk);
}

static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_blk);
}
module_init(init);
module_exit(fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");
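(The driver above is also a compact illustration of the virtqueue API from the "Virtio interface" commit in this merge: a driver adds scatter-gather buffers with add_buf(), kicks the other side, and reclaims completed buffers with get_buf() from its virtqueue callback. Stripped of block-layer detail, the pattern is roughly as follows — a sketch reusing the vq_ops names visible above; submit/complete_request are hypothetical placeholders:

    /* Queue a request: "out" buffers we filled, "in" buffers the host fills;
     * "cookie" is returned to us by get_buf() when the request completes. */
    if (vq->vq_ops->add_buf(vq, sg, out, in, cookie) == 0)
            vq->vq_ops->kick(vq);   /* tell the other side to process the ring */

    /* Later, in the virtqueue callback: reclaim finished buffers. */
    while ((cookie = vq->vq_ops->get_buf(vq, &len)) != NULL)
            complete_request(cookie, len);
)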
@@ -613,6 +613,10 @@ config HVC_XEN
 	help
 	  Xen virtual console device driver

+config VIRTIO_CONSOLE
+	bool
+	select HVC_DRIVER
+
 config HVCS
 	tristate "IBM Hypervisor Virtual Console Server support"
 	depends on PPC_PSERIES
@@ -42,7 +42,6 @@ obj-$(CONFIG_SYNCLINK_GT)	+= synclink_gt.o
 obj-$(CONFIG_N_HDLC)		+= n_hdlc.o
 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
 obj-$(CONFIG_SX)		+= sx.o generic_serial.o
-obj-$(CONFIG_LGUEST_GUEST)	+= hvc_lguest.o
 obj-$(CONFIG_RIO)		+= rio/ generic_serial.o
 obj-$(CONFIG_HVC_CONSOLE)	+= hvc_vio.o hvsi.o
 obj-$(CONFIG_HVC_ISERIES)	+= hvc_iseries.o
@@ -50,6 +49,7 @@ obj-$(CONFIG_HVC_RTAS)	+= hvc_rtas.o
 obj-$(CONFIG_HVC_BEAT)	+= hvc_beat.o
 obj-$(CONFIG_HVC_DRIVER)	+= hvc_console.o
 obj-$(CONFIG_HVC_XEN)		+= hvc_xen.o
+obj-$(CONFIG_VIRTIO_CONSOLE)	+= virtio_console.o
 obj-$(CONFIG_RAW_DRIVER)	+= raw.o
 obj-$(CONFIG_SGI_SNSC)	+= snsc.o snsc_event.o
 obj-$(CONFIG_MSPEC)		+= mspec.o
drivers/char/hvc_lguest.c (deleted file, 177 lines):

/*D:300
 * The Guest console driver
 *
 * This is a trivial console driver: we use lguest's DMA mechanism to send
 * bytes out, and register a DMA buffer to receive bytes in.  It is assumed to
 * be present and available from the very beginning of boot.
 *
 * Writing console drivers is one of the few remaining Dark Arts in Linux.
 * Fortunately for us, the path of virtual consoles has been well-trodden by
 * the PowerPC folks, who wrote "hvc_console.c" to generically support any
 * virtual console.  We use that infrastructure which only requires us to write
 * the basic put_chars and get_chars functions and call the right register
 * functions.
 :*/

/*M:002 The console can be flooded: while the Guest is processing input the
 * Host can send more.  Buffering in the Host could alleviate this, but it is a
 * difficult problem in general. :*/
/* Copyright (C) 2006 Rusty Russell, IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <linux/err.h>
#include <linux/init.h>
#include <linux/lguest_bus.h>
#include <asm/paravirt.h>
#include "hvc_console.h"

/*D:340 This is our single console input buffer, with associated "struct
 * lguest_dma" referring to it.  Note the 0-terminated length array, and the
 * use of physical address for the buffer itself. */
static char inbuf[256];
static struct lguest_dma cons_input = { .used_len = 0,
					.addr[0] = __pa(inbuf),
					.len[0] = sizeof(inbuf),
					.len[1] = 0 };

/*D:310 The put_chars() callback is pretty straightforward.
 *
 * First we put the pointer and length in a "struct lguest_dma": we only have
 * one pointer, so we set the second length to 0.  Then we use SEND_DMA to send
 * the data to (Host) buffers attached to the console key.  Usually a device's
 * key is a physical address within the device's memory, but because the
 * console device doesn't have any associated physical memory, we use the
 * LGUEST_CONSOLE_DMA_KEY constant (aka 0). */
static int put_chars(u32 vtermno, const char *buf, int count)
{
	struct lguest_dma dma;

	/* FIXME: DMA buffers in a "struct lguest_dma" are not allowed
	 * to go over page boundaries.  This never seems to happen,
	 * but if it did we'd need to fix this code. */
	dma.len[0] = count;
	dma.len[1] = 0;
	dma.addr[0] = __pa(buf);

	lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &dma);
	/* We're expected to return the amount of data we wrote: all of it. */
	return count;
}

/*D:350 get_chars() is the callback from the hvc_console infrastructure when
 * an interrupt is received.
 *
 * Firstly we see if our buffer has been filled: if not, we return.  The rest
 * of the code deals with the fact that the hvc_console() infrastructure only
 * asks us for 16 bytes at a time.  We keep a "cons_offset" variable for
 * partially-read buffers. */
static int get_chars(u32 vtermno, char *buf, int count)
{
	static int cons_offset;

	/* Nothing left to see here... */
	if (!cons_input.used_len)
		return 0;

	/* You want more than we have to give?  Well, try wanting less! */
	if (cons_input.used_len - cons_offset < count)
		count = cons_input.used_len - cons_offset;

	/* Copy across to their buffer and increment offset. */
	memcpy(buf, inbuf + cons_offset, count);
	cons_offset += count;

	/* Finished?  Zero offset, and reset cons_input so Host will use it
	 * again. */
	if (cons_offset == cons_input.used_len) {
		cons_offset = 0;
		cons_input.used_len = 0;
	}
	return count;
}
/*:*/

static struct hv_ops lguest_cons = {
	.get_chars = get_chars,
	.put_chars = put_chars,
};

/*D:320 Console drivers are initialized very early so boot messages can go
 * out.  At this stage, the console is output-only.  Our driver checks we're a
 * Guest, and if so hands hvc_instantiate() the console number (0), priority
 * (0), and the struct hv_ops containing the put_chars() function. */
static int __init cons_init(void)
{
	if (strcmp(pv_info.name, "lguest") != 0)
		return 0;

	return hvc_instantiate(0, 0, &lguest_cons);
}
console_initcall(cons_init);

/*D:370 To set up and manage our virtual console, we call hvc_alloc() and
 * stash the result in the private pointer of the "struct lguest_device".
 * Since we never remove the console device we never need this pointer again,
 * but using ->private is considered good form, and you never know who's going
 * to copy your driver.
 *
 * Once the console is set up, we bind our input buffer ready for input. */
static int lguestcons_probe(struct lguest_device *lgdev)
{
	int err;

	/* The first argument of hvc_alloc() is the virtual console number, so
	 * we use zero.  The second argument is the interrupt number.
	 *
	 * The third argument is a "struct hv_ops" containing the put_chars()
	 * and get_chars() pointers.  The final argument is the output buffer
	 * size: we use 256 and expect the Host to have room for us to send
	 * that much. */
	lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256);
	if (IS_ERR(lgdev->private))
		return PTR_ERR(lgdev->private);

	/* We bind a single DMA buffer at key LGUEST_CONSOLE_DMA_KEY.
	 * "cons_input" is that statically-initialized global DMA buffer we saw
	 * above, and we also give the interrupt we want. */
	err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1,
			      lgdev_irq(lgdev));
	if (err)
		printk("lguest console: failed to bind buffer.\n");
	return err;
}
/* Note the use of lgdev_irq() for the interrupt number.  We tell hvc_alloc()
 * to expect input when this interrupt is triggered, and then tell
 * lguest_bind_dma() that is the interrupt to send us when input comes in. */

/*D:360 From now on the console driver follows standard Guest driver form:
 * register_lguest_driver() registers the device type and probe function, and
 * the probe function sets up the device.
 *
 * The standard "struct lguest_driver": */
static struct lguest_driver lguestcons_drv = {
	.name = "lguestcons",
	.owner = THIS_MODULE,
	.device_type = LGUEST_DEVICE_T_CONSOLE,
	.probe = lguestcons_probe,
};

/* The standard init function */
static int __init hvc_lguest_init(void)
{
	return register_lguest_driver(&lguestcons_drv);
}
module_init(hvc_lguest_init);
225
drivers/char/virtio_console.c
Normal file
@ -0,0 +1,225 @@
/*D:300
 * The Guest console driver
 *
 * Writing console drivers is one of the few remaining Dark Arts in Linux.
 * Fortunately for us, the path of virtual consoles has been well-trodden by
 * the PowerPC folks, who wrote "hvc_console.c" to generically support any
 * virtual console.  We use that infrastructure which only requires us to write
 * the basic put_chars and get_chars functions and call the right register
 * functions.
 :*/

/*M:002 The console can be flooded: while the Guest is processing input the
 * Host can send more.  Buffering in the Host could alleviate this, but it is a
 * difficult problem in general. :*/
/* Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <linux/err.h>
#include <linux/init.h>
#include <linux/virtio.h>
#include <linux/virtio_console.h>
#include "hvc_console.h"

/*D:340 These represent our input and output console queues, and the virtio
 * operations for them. */
static struct virtqueue *in_vq, *out_vq;
static struct virtio_device *vdev;

/* This is our input buffer, and how much data is left in it. */
static unsigned int in_len;
static char *in, *inbuf;

/* The operations for our console. */
static struct hv_ops virtio_cons;

/*D:310 The put_chars() callback is pretty straightforward.
 *
 * We turn the characters into a scatter-gather list, add it to the output
 * queue and then kick the Host.  Then we sit here waiting for it to finish:
 * inefficient in theory, but in practice implementations will do it
 * immediately (lguest's Launcher does). */
static int put_chars(u32 vtermno, const char *buf, int count)
{
        struct scatterlist sg[1];
        unsigned int len;

        /* This is a convenient routine to initialize a single-elem sg list */
        sg_init_one(sg, buf, count);

        /* add_buf wants a token to identify this buffer: we hand it any
         * non-NULL pointer, since there's only ever one buffer. */
        if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) {
                /* Tell Host to go! */
                out_vq->vq_ops->kick(out_vq);
                /* Chill out until it's done with the buffer. */
                while (!out_vq->vq_ops->get_buf(out_vq, &len))
                        cpu_relax();
        }

        /* We're expected to return the amount of data we wrote: all of it. */
        return count;
}

/* Create a scatter-gather list representing our input buffer and put it in the
 * queue. */
static void add_inbuf(void)
{
        struct scatterlist sg[1];
        sg_init_one(sg, inbuf, PAGE_SIZE);

        /* We should always be able to add one buffer to an empty queue. */
        if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) != 0)
                BUG();
        in_vq->vq_ops->kick(in_vq);
}

/*D:350 get_chars() is the callback from the hvc_console infrastructure when
 * an interrupt is received.
 *
 * Most of the code deals with the fact that the hvc_console() infrastructure
 * only asks us for 16 bytes at a time.  We keep the "in" pointer and "in_len"
 * count for partially-read buffers. */
static int get_chars(u32 vtermno, char *buf, int count)
{
        /* If we don't have an input queue yet, we can't get input. */
        BUG_ON(!in_vq);

        /* No buffer?  Try to get one. */
        if (!in_len) {
                in = in_vq->vq_ops->get_buf(in_vq, &in_len);
                if (!in)
                        return 0;
        }

        /* You want more than we have to give?  Well, try wanting less! */
        if (in_len < count)
                count = in_len;

        /* Copy across to their buffer and increment offset. */
        memcpy(buf, in, count);
        in += count;
        in_len -= count;

        /* Finished?  Re-register buffer so Host will use it again. */
        if (in_len == 0)
                add_inbuf();

        return count;
}
/*:*/

/*D:320 Console drivers are initialized very early so boot messages can go out,
 * so we do things slightly differently from the generic virtio initialization
 * of the net and block drivers.
 *
 * At this stage, the console is output-only.  It's too early to set up a
 * virtqueue, so we let the drivers do some boutique early-output thing. */
int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
{
        virtio_cons.put_chars = put_chars;
        return hvc_instantiate(0, 0, &virtio_cons);
}
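
For context, here is a minimal sketch of the Guest side of this early hookup.
The function and hypercall names are hypothetical stand-ins, not the exact
names the lguest Guest code uses:

/* Sketch only: a paravirtualized Guest wiring up the early console.
 * lguest_early_put_chars and early_console_hcall are illustrative. */
static int lguest_early_put_chars(u32 vtermno, const char *buf, int count)
{
        /* Hand the Host a physical pointer to print, via some hypercall. */
        early_console_hcall(__pa(buf), count);
        return count;
}

static void __init setup_early_virtio_console(void)
{
        /* After this, printk output flows long before virtqueues exist. */
        virtio_cons_early_init(lguest_early_put_chars);
}
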
/*D:370 Once we're further in boot, we get probed like any other virtio device.
 * At this stage we set up the output virtqueue.
 *
 * To set up and manage our virtual console, we call hvc_alloc().  Since we
 * never remove the console device we never need this pointer again.
 *
 * Finally we put our input buffer in the input queue, ready to receive. */
static int virtcons_probe(struct virtio_device *dev)
{
        int err;
        struct hvc_struct *hvc;

        vdev = dev;

        /* This is the scratch page we use to receive console input */
        inbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!inbuf) {
                err = -ENOMEM;
                goto fail;
        }

        /* Find the input queue. */
        /* FIXME: This is why we want to wean off hvc: we do nothing
         * when input comes in. */
        in_vq = vdev->config->find_vq(vdev, NULL);
        if (IS_ERR(in_vq)) {
                err = PTR_ERR(in_vq);
                goto free;
        }

        out_vq = vdev->config->find_vq(vdev, NULL);
        if (IS_ERR(out_vq)) {
                err = PTR_ERR(out_vq);
                goto free_in_vq;
        }

        /* Start using the new console output. */
        virtio_cons.get_chars = get_chars;
        virtio_cons.put_chars = put_chars;

        /* The first argument of hvc_alloc() is the virtual console number, so
         * we use zero.  The second argument is the interrupt number; we
         * currently leave this as zero: it would be better not to use the
         * hvc mechanism and fix this (FIXME!).
         *
         * The third argument is a "struct hv_ops" containing the put_chars()
         * and get_chars() pointers.  The final argument is the output buffer
         * size: we can do any size, so we put PAGE_SIZE here. */
        hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE);
        if (IS_ERR(hvc)) {
                err = PTR_ERR(hvc);
                goto free_out_vq;
        }

        /* Register the input buffer the first time. */
        add_inbuf();
        return 0;

free_out_vq:
        vdev->config->del_vq(out_vq);
free_in_vq:
        vdev->config->del_vq(in_vq);
free:
        kfree(inbuf);
fail:
        return err;
}

static struct virtio_device_id id_table[] = {
        { VIRTIO_ID_CONSOLE, VIRTIO_DEV_ANY_ID },
        { 0 },
};

static struct virtio_driver virtio_console = {
        .driver.name = KBUILD_MODNAME,
        .driver.owner = THIS_MODULE,
        .id_table = id_table,
        .probe = virtcons_probe,
};

static int __init init(void)
{
        return register_virtio_driver(&virtio_console);
}
module_init(init);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio console driver");
MODULE_LICENSE("GPL");
@ -47,4 +47,8 @@ config KVM_AMD
          Provides support for KVM on AMD processors equipped with the AMD-V
          (SVM) extensions.

# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/lguest/Kconfig

endif # VIRTUALIZATION
@ -1,7 +1,6 @@
config LGUEST
        tristate "Linux hypervisor example code"
        depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE && FUTEX
        select LGUEST_GUEST
        depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX && !(X86_VISWS || X86_VOYAGER)
        select HVC_DRIVER
        ---help---
          This is a very simple module which allows you to run
@ -18,13 +17,3 @@ config LGUEST_GUEST
          The guest needs code built-in, even if the host has lguest
          support as a module.  The drivers are tiny, so we build them
          in too.

config LGUEST_NET
        tristate
        default y
        depends on LGUEST_GUEST && NET

config LGUEST_BLOCK
        tristate
        default y
        depends on LGUEST_GUEST && BLOCK
@ -1,10 +1,12 @@
# Guest requires the paravirt_ops replacement and the bus driver.
obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o
# Guest requires the device configuration and probing code.
obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o

# Host requires the other files, which can be a module.
obj-$(CONFIG_LGUEST) += lg.o
lg-y := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
        segments.o io.o lguest_user.o switcher.o
lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
        segments.o lguest_user.o

lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o

Preparation Preparation!: PREFIX=P
Guest: PREFIX=G
@ -11,58 +11,20 @@
#include <linux/vmalloc.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
#include <asm/poll.h>
#include <asm/highmem.h>
#include <asm/asm-offsets.h>
#include <asm/i387.h>
#include "lg.h"

/* Found in switcher.S */
extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
extern unsigned long default_idt_entries[];

/* Every guest maps the core switcher code. */
#define SHARED_SWITCHER_PAGES \
        DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
/* Pages for switcher itself, then two pages per cpu */
#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)

/* We map at -4M for ease of mapping into the guest (one PTE page). */
#define SWITCHER_ADDR 0xFFC00000
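
That constant is worth a quick sanity check.  A throwaway sketch (plain
userspace C, not part of the driver) of the arithmetic behind "one PTE page":

#include <assert.h>

int main(void)
{
        unsigned long long four_gb = 1ULL << 32;
        unsigned long long four_mb = 4ULL << 20;

        /* SWITCHER_ADDR is exactly 4MB below the top of the 32-bit
         * address space... */
        assert(0xFFC00000ULL == four_gb - four_mb);
        /* ...and with 4K pages, one page table of 1024 PTEs maps exactly
         * 4MB, so the whole Switcher region fits in a single PTE page. */
        assert(1024ULL * 4096 == four_mb);
        return 0;
}
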
static struct vm_struct *switcher_vma;
static struct page **switcher_page;

static int cpu_had_pge;
static struct {
        unsigned long offset;
        unsigned short segment;
} lguest_entry;

/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
static DEFINE_PER_CPU(struct lguest *, last_guest);

/* FIXME: Make dynamic. */
#define MAX_LGUEST_GUESTS 16
struct lguest lguests[MAX_LGUEST_GUESTS];

/* Offset from where switcher.S was compiled to where we've copied it */
static unsigned long switcher_offset(void)
{
        return SWITCHER_ADDR - (unsigned long)start_switcher_text;
}

/* This cpu's struct lguest_pages. */
static struct lguest_pages *lguest_pages(unsigned int cpu)
{
        return &(((struct lguest_pages *)
                  (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
}

/*H:010 We need to set up the Switcher at a high virtual address.  Remember the
 * Switcher is a few hundred bytes of assembler code which actually changes the
@ -73,9 +35,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
 * Host since it will be running as the switchover occurs.
 *
 * Trying to map memory at a particular address is an unusual thing to do, so
 * it's not a simple one-liner.  We also set up the per-cpu parts of the
 * Switcher here.
 */
 * it's not a simple one-liner. */
static __init int map_switcher(void)
{
        int i, err;
@ -132,90 +92,11 @@ static __init int map_switcher(void)
                goto free_vma;
        }

        /* Now the switcher is mapped at the right address, we can't fail!
         * Copy in the compiled-in Switcher code (from switcher.S). */
        /* Now the Switcher is mapped at the right address, we can't fail!
         * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */
        memcpy(switcher_vma->addr, start_switcher_text,
               end_switcher_text - start_switcher_text);

        /* Most of the switcher.S doesn't care that it's been moved; on Intel,
         * jumps are relative, and it doesn't access any references to external
         * code or data.
         *
         * The only exception is the interrupt handlers in switcher.S: their
         * addresses are placed in a table (default_idt_entries), so we need to
         * update the table with the new addresses.  switcher_offset() is a
         * convenience function which returns the distance between the builtin
         * switcher code and the high-mapped copy we just made. */
        for (i = 0; i < IDT_ENTRIES; i++)
                default_idt_entries[i] += switcher_offset();

        /*
         * Set up the Switcher's per-cpu areas.
         *
         * Each CPU gets two pages of its own within the high-mapped region
         * (aka. "struct lguest_pages").  Much of this can be initialized now,
         * but some depends on what Guest we are running (which is set up in
         * copy_in_guest_info()).
         */
        for_each_possible_cpu(i) {
                /* lguest_pages() returns this CPU's two pages. */
                struct lguest_pages *pages = lguest_pages(i);
                /* This is a convenience pointer to make the code fit one
                 * statement to a line. */
                struct lguest_ro_state *state = &pages->state;

                /* The Global Descriptor Table: the Host has a different one
                 * for each CPU.  We keep a descriptor for the GDT which says
                 * where it is and how big it is (the stored size is actually
                 * the offset of the last byte, hence the "-1"). */
                state->host_gdt_desc.size = GDT_SIZE-1;
                state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);

                /* All CPUs on the Host use the same Interrupt Descriptor
                 * Table, so we just use store_idt(), which gets this CPU's IDT
                 * descriptor. */
                store_idt(&state->host_idt_desc);

                /* The descriptors for the Guest's GDT and IDT can be filled
                 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
                 * ->guest_idt before actually running the Guest. */
                state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
                state->guest_idt_desc.address = (long)&state->guest_idt;
                state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
                state->guest_gdt_desc.address = (long)&state->guest_gdt;

                /* We know where we want the stack to be when the Guest enters
                 * the switcher: in pages->regs.  The stack grows downwards in
                 * address, so we start it at the end of that structure. */
                state->guest_tss.esp0 = (long)(&pages->regs + 1);
                /* And this is the GDT entry to use for the stack: we keep a
                 * couple of special LGUEST entries. */
                state->guest_tss.ss0 = LGUEST_DS;

                /* x86 can have a finegrained bitmap which indicates what I/O
                 * ports the process can use.  We set it to the end of our
                 * structure, meaning "none". */
                state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);

                /* Some GDT entries are the same across all Guests, so we can
                 * set them up now. */
                setup_default_gdt_entries(state);
                /* Most IDT entries are the same for all Guests, too. */
                setup_default_idt_entries(state, default_idt_entries);

                /* The Host needs to be able to use the LGUEST segments on this
                 * CPU, too, so put them in the Host GDT. */
                get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
                get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
        }

        /* In the Switcher, we want the %cs segment register to use the
         * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
         * it will be undisturbed when we switch.  To change %cs and jump we
         * need this structure to feed to Intel's "lcall" instruction. */
        lguest_entry.offset = (long)switch_to_guest + switcher_offset();
        lguest_entry.segment = LGUEST_CS;

        printk(KERN_INFO "lguest: mapped switcher at %p\n",
               switcher_vma->addr);
        /* And we succeeded... */
@ -247,86 +128,12 @@ static void unmap_switcher(void)
                __free_pages(switcher_page[i], 0);
}

/*H:130 Our Guest is usually so well behaved; it never tries to do things it
 * isn't allowed to.  Unfortunately, Linux's paravirtual infrastructure isn't
 * quite complete, because it doesn't contain replacements for the Intel I/O
 * instructions.  As a result, the Guest sometimes fumbles across one during
 * the boot process as it probes for various things which are usually attached
 * to a PC.
 *
 * When the Guest uses one of these instructions, we get trap #13 (General
 * Protection Fault) and come here.  We see if it's one of those troublesome
 * instructions and skip over it.  We return true if we did. */
static int emulate_insn(struct lguest *lg)
{
        u8 insn;
        unsigned int insnlen = 0, in = 0, shift = 0;
        /* The eip contains the *virtual* address of the Guest's instruction:
         * guest_pa just subtracts the Guest's page_offset. */
        unsigned long physaddr = guest_pa(lg, lg->regs->eip);

        /* The guest_pa() function only works for Guest kernel addresses, but
         * that's all we're trying to do anyway. */
        if (lg->regs->eip < lg->page_offset)
                return 0;

        /* Decoding x86 instructions is icky. */
        lgread(lg, &insn, physaddr, 1);

        /* 0x66 is an "operand prefix".  It means it's using the upper 16 bits
         * of the eax register. */
        if (insn == 0x66) {
                shift = 16;
                /* The instruction is 1 byte so far, read the next byte. */
                insnlen = 1;
                lgread(lg, &insn, physaddr + insnlen, 1);
        }

        /* We can ignore the lower bit for the moment and decode the 4 opcodes
         * we need to emulate. */
        switch (insn & 0xFE) {
        case 0xE4: /* in     <next byte>,%al */
                insnlen += 2;
                in = 1;
                break;
        case 0xEC: /* in     (%dx),%al */
                insnlen += 1;
                in = 1;
                break;
        case 0xE6: /* out    %al,<next byte> */
                insnlen += 2;
                break;
        case 0xEE: /* out    %al,(%dx) */
                insnlen += 1;
                break;
        default:
                /* OK, we don't know what this is, can't emulate. */
                return 0;
        }

        /* If it was an "IN" instruction, they expect the result to be read
         * into %eax, so we change %eax.  We always return all-ones, which
         * traditionally means "there's nothing there". */
        if (in) {
                /* Lower bit tells us whether it's a 16 or 32 bit access */
                if (insn & 0x1)
                        lg->regs->eax = 0xFFFFFFFF;
                else
                        lg->regs->eax |= (0xFFFF << shift);
        }
        /* Finally, we've "done" the instruction, so move past it. */
        lg->regs->eip += insnlen;
        /* Success! */
        return 1;
}
/*:*/
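
To make the decode concrete, here is a throwaway walk-through (plain
userspace C mirroring the switch above) of the Guest probing a port with
"inb $0x71,%al", byte sequence E4 71:

#include <assert.h>

int main(void)
{
        unsigned char insn = 0xE4;      /* "inb $imm8,%al": opcode + imm8 */
        unsigned int insnlen = 0, in = 0, shift = 0, eax = 0;

        switch (insn & 0xFE) {
        case 0xE4:
                insnlen += 2;           /* skip opcode and the port byte */
                in = 1;
                break;
        }
        if (in) {
                if (insn & 0x1)
                        eax = 0xFFFFFFFF;          /* 16/32-bit form */
                else
                        eax |= (0xFFFF << shift);  /* 8-bit form, as above */
        }
        /* eip would now advance by insnlen (2), and %al reads as all-ones:
         * the traditional "there's nothing there". */
        assert(insnlen == 2 && (eax & 0xFF) == 0xFF);
        return 0;
}
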
/*L:305
 * Dealing With Guest Memory.
 *
 * When the Guest gives us (what it thinks is) a physical address, we can use
 * the normal copy_from_user() & copy_to_user() on that address: remember,
 * Guest physical == Launcher virtual.
 * the normal copy_from_user() & copy_to_user() on the corresponding place in
 * the memory region allocated by the Launcher.
 *
 * But we can't trust the Guest: it might be trying to access the Launcher
 * code.  We have to check that the range is below the pfn_limit the Launcher
@ -338,148 +145,27 @@ int lguest_address_ok(const struct lguest *lg,
        return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
}
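
The second clause guards against address wraparound.  A throwaway sketch
(userspace C, hypothetical numbers) of the 32-bit case it catches:

#include <assert.h>

int main(void)
{
        /* addr just below 4GB plus a modest len wraps around zero... */
        unsigned int addr = 0xFFFFF000, len = 0x2000;
        unsigned int sum = addr + len;          /* wraps to 0xFFF */

        /* ...so the pfn_limit comparison alone would wrongly pass, but
         * (addr+len >= addr) is false and the access is rejected. */
        assert(sum < addr);
        return 0;
}
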
/* This is a convenient routine to get a 32-bit value from the Guest (a very
 * common operation).  Here we can see how useful the kill_lguest() routine we
 * met in the Launcher can be: we return a random value (0) instead of needing
 * to return an error. */
u32 lgread_u32(struct lguest *lg, unsigned long addr)
{
        u32 val = 0;

        /* Don't let them access lguest binary. */
        if (!lguest_address_ok(lg, addr, sizeof(val))
            || get_user(val, (u32 __user *)addr) != 0)
                kill_guest(lg, "bad read address %#lx", addr);
        return val;
}

/* Same thing for writing a value. */
void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val)
{
        if (!lguest_address_ok(lg, addr, sizeof(val))
            || put_user(val, (u32 __user *)addr) != 0)
                kill_guest(lg, "bad write address %#lx", addr);
}

/* This routine is more generic, and copies a range of Guest bytes into a
 * buffer.  If the copy_from_user() fails, we fill the buffer with zeroes, so
 * the caller doesn't end up using uninitialized kernel memory. */
void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
/* This routine copies memory from the Guest.  Here we can see how useful the
 * kill_lguest() routine we met in the Launcher can be: we return a random
 * value (all zeroes) instead of needing to return an error. */
void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes)
{
        if (!lguest_address_ok(lg, addr, bytes)
            || copy_from_user(b, (void __user *)addr, bytes) != 0) {
            || copy_from_user(b, lg->mem_base + addr, bytes) != 0) {
                /* copy_from_user should do this, but as we rely on it... */
                memset(b, 0, bytes);
                kill_guest(lg, "bad read address %#lx len %u", addr, bytes);
        }
}

/* Similarly, our generic routine to copy into a range of Guest bytes. */
void lgwrite(struct lguest *lg, unsigned long addr, const void *b,
             unsigned bytes)
/* This is the write (copy into guest) version. */
void __lgwrite(struct lguest *lg, unsigned long addr, const void *b,
               unsigned bytes)
{
        if (!lguest_address_ok(lg, addr, bytes)
            || copy_to_user((void __user *)addr, b, bytes) != 0)
            || copy_to_user(lg->mem_base + addr, b, bytes) != 0)
                kill_guest(lg, "bad write address %#lx len %u", addr, bytes);
}
/* (end of memory access helper routines) :*/
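
The mem_base change is the heart of this hunk: Guest physical address 0 no
longer coincides with Launcher virtual address 0, it sits at an offset inside
the Launcher's mapping.  A one-function sketch of the translation the new
helpers index by:

/* Sketch only: the addressing model behind the __lgread/__lgwrite change. */
static inline char *guest_to_launcher(char *mem_base, unsigned long guest_phys)
{
        /* Old world: launcher_addr == guest_phys.
         * New world: launcher_addr == mem_base + guest_phys. */
        return mem_base + guest_phys;
}
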

static void set_ts(void)
{
        u32 cr0;

        cr0 = read_cr0();
        if (!(cr0 & 8))
                write_cr0(cr0|8);
}

/*S:010
 * We are getting close to the Switcher.
 *
 * Remember that each CPU has two pages which are visible to the Guest when it
 * runs on that CPU.  This has to contain the state for that Guest: we copy the
 * state in just before we run the Guest.
 *
 * Each Guest has "changed" flags which indicate what has changed in the Guest
 * since it last ran.  We saw this set in interrupts_and_traps.c and
 * segments.c.
 */
static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
{
        /* Copying all this data can be quite expensive.  We usually run the
         * same Guest we ran last time (and that Guest hasn't run anywhere else
         * meanwhile).  If that's not the case, we pretend everything in the
         * Guest has changed. */
        if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
                __get_cpu_var(last_guest) = lg;
                lg->last_pages = pages;
                lg->changed = CHANGED_ALL;
        }

        /* These copies are pretty cheap, so we do them unconditionally: */
        /* Save the current Host top-level page directory. */
        pages->state.host_cr3 = __pa(current->mm->pgd);
        /* Set up the Guest's page tables to see this CPU's pages (and no
         * other CPU's pages). */
        map_switcher_in_guest(lg, pages);
        /* Set up the two "TSS" members which tell the CPU what stack to use
         * for traps which go directly into the Guest (ie. traps at privilege
         * level 1). */
        pages->state.guest_tss.esp1 = lg->esp1;
        pages->state.guest_tss.ss1 = lg->ss1;

        /* Copy direct-to-Guest trap entries. */
        if (lg->changed & CHANGED_IDT)
                copy_traps(lg, pages->state.guest_idt, default_idt_entries);

        /* Copy all GDT entries which the Guest can change. */
        if (lg->changed & CHANGED_GDT)
                copy_gdt(lg, pages->state.guest_gdt);
        /* If only the TLS entries have changed, copy them. */
        else if (lg->changed & CHANGED_GDT_TLS)
                copy_gdt_tls(lg, pages->state.guest_gdt);

        /* Mark the Guest as unchanged for next time. */
        lg->changed = 0;
}

/* Finally: the code to actually call into the Switcher to run the Guest. */
static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
{
        /* This is a dummy value we need for GCC's sake. */
        unsigned int clobber;

        /* Copy the guest-specific information into this CPU's "struct
         * lguest_pages". */
        copy_in_guest_info(lg, pages);

        /* Set the trap number to 256 (impossible value).  If we fault while
         * switching to the Guest (bad segment registers or bug), this will
         * cause us to abort the Guest. */
        lg->regs->trapnum = 256;

        /* Now: we push the "eflags" register on the stack, then do an "lcall".
         * This is how we change from using the kernel code segment to using
         * the dedicated lguest code segment, as well as jumping into the
         * Switcher.
         *
         * The lcall also pushes the old code segment (KERNEL_CS) onto the
         * stack, then the address of this call.  This stack layout happens to
         * exactly match the stack of an interrupt... */
        asm volatile("pushf; lcall *lguest_entry"
                     /* This is how we tell GCC that %eax ("a") and %ebx ("b")
                      * are changed by this routine.  The "=" means output. */
                     : "=a"(clobber), "=b"(clobber)
                     /* %eax contains the pages pointer.  ("0" refers to the
                      * 0-th argument above, ie "a").  %ebx contains the
                      * physical address of the Guest's top-level page
                      * directory. */
                     : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
                     /* We tell gcc that all these registers could change,
                      * which means we don't have to save and restore them in
                      * the Switcher. */
                     : "memory", "%edx", "%ecx", "%edi", "%esi");
}
/*:*/
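
The "lcall *lguest_entry" form is the memory-indirect far call: it loads %cs
and %eip from a 6-byte pointer in memory.  A sketch of that pointer's shape,
matching the lguest_entry struct defined earlier, and of the frame the
pushf/lcall pair leaves behind:

/* Sketch only: the 6-byte far pointer "lcall *ptr" consumes on i386. */
struct far_ptr {
        unsigned long offset;   /* new %eip: switch_to_guest, high-mapped */
        unsigned short segment; /* new %cs: LGUEST_CS */
} __attribute__((packed));

/* After "pushf; lcall *lguest_entry" the stack holds, top down:
 *      return %eip   (pushed by lcall)
 *      old %cs       (pushed by lcall)
 *      %eflags       (pushed by pushf)
 * which is exactly the frame a same-privilege interrupt builds, so the
 * Switcher can come back with a plain iret. */
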

/*H:030 Let's jump straight to the main loop which runs the Guest.
@ -489,22 +175,16 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
{
        /* We stop running once the Guest is dead. */
        while (!lg->dead) {
                /* We need to initialize this, otherwise gcc complains.  It's
                 * not (yet) clever enough to see that it's initialized when we
                 * need it. */
                unsigned int cr2 = 0; /* Damn gcc */
                /* First we run any hypercalls the Guest wants done. */
                if (lg->hcall)
                        do_hypercalls(lg);

                /* First we run any hypercalls the Guest wants done: either in
                 * the hypercall ring in "struct lguest_data", or directly by
                 * using int 31 (LGUEST_TRAP_ENTRY). */
                do_hypercalls(lg);
                /* It's possible the Guest did a SEND_DMA hypercall to the
                /* It's possible the Guest did a NOTIFY hypercall to the
                 * Launcher, in which case we return from the read() now. */
                if (lg->dma_is_pending) {
                        if (put_user(lg->pending_dma, user) ||
                            put_user(lg->pending_key, user+1))
                if (lg->pending_notify) {
                        if (put_user(lg->pending_notify, user))
                                return -EFAULT;
                        return sizeof(unsigned long)*2;
                        return sizeof(lg->pending_notify);
                }

                /* Check for signals */
@ -542,144 +222,20 @@ int run_guest(struct lguest *lg, unsigned long __user *user)
                 * the "Do Not Disturb" sign: */
                local_irq_disable();

                /* Remember the awfully-named TS bit?  If the Guest has asked
                 * to set it we set it now, so we can trap and pass that trap
                 * to the Guest if it uses the FPU. */
                if (lg->ts)
                        set_ts();

                /* SYSENTER is an optimized way of doing system calls.  We
                 * can't allow it because it always jumps to privilege level 0.
                 * A normal Guest won't try it because we don't advertise it in
                 * CPUID, but a malicious Guest (or malicious Guest userspace
                 * program) could, so we tell the CPU to disable it before
                 * running the Guest. */
                if (boot_cpu_has(X86_FEATURE_SEP))
                        wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);

                /* Now we actually run the Guest.  It will pop back out when
                 * something interesting happens, and we can examine its
                 * registers to see what it was doing. */
                run_guest_once(lg, lguest_pages(raw_smp_processor_id()));

                /* The "regs" pointer contains two extra entries which are not
                 * really registers: a trap number which says what interrupt or
                 * trap made the switcher code come back, and an error code
                 * which some traps set. */

                /* If the Guest page faulted, then the cr2 register will tell
                 * us the bad virtual address.  We have to grab this now,
                 * because once we re-enable interrupts an interrupt could
                 * fault and thus overwrite cr2, or we could even move off to a
                 * different CPU. */
                if (lg->regs->trapnum == 14)
                        cr2 = read_cr2();
                /* Similarly, if we took a trap because the Guest used the FPU,
                 * we have to restore the FPU it expects to see. */
                else if (lg->regs->trapnum == 7)
                        math_state_restore();

                /* Restore SYSENTER if it's supposed to be on. */
                if (boot_cpu_has(X86_FEATURE_SEP))
                        wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
                /* Actually run the Guest until something happens. */
                lguest_arch_run_guest(lg);

                /* Now we're ready to be interrupted or moved to other CPUs */
                local_irq_enable();

                /* OK, so what happened? */
                switch (lg->regs->trapnum) {
                case 13: /* We've intercepted a GPF. */
                        /* Check if this was one of those annoying IN or OUT
                         * instructions which we need to emulate.  If so, we
                         * just go back into the Guest after we've done it. */
                        if (lg->regs->errcode == 0) {
                                if (emulate_insn(lg))
                                        continue;
                        }
                        break;
                case 14: /* We've intercepted a page fault. */
                        /* The Guest accessed a virtual address that wasn't
                         * mapped.  This happens a lot: we don't actually set
                         * up most of the page tables for the Guest at all when
                         * we start: as it runs it asks for more and more, and
                         * we set them up as required.  In this case, we don't
                         * even tell the Guest that the fault happened.
                         *
                         * The errcode tells whether this was a read or a
                         * write, and whether kernel or userspace code. */
                        if (demand_page(lg, cr2, lg->regs->errcode))
                                continue;

                        /* OK, it's really not there (or not OK): the Guest
                         * needs to know.  We write out the cr2 value so it
                         * knows where the fault occurred.
                         *
                         * Note that if the Guest were really messed up, this
                         * could happen before it's done the INITIALIZE
                         * hypercall, so lg->lguest_data will be NULL, so
                         * &lg->lguest_data->cr2 will be address 8.  Writing
                         * into that address won't hurt the Host at all,
                         * though. */
                        if (put_user(cr2, &lg->lguest_data->cr2))
                                kill_guest(lg, "Writing cr2");
                        break;
                case 7: /* We've intercepted a Device Not Available fault. */
                        /* If the Guest doesn't want to know, we already
                         * restored the Floating Point Unit, so we just
                         * continue without telling it. */
                        if (!lg->ts)
                                continue;
                        break;
                case 32 ... 255:
                        /* These values mean a real interrupt occurred, in
                         * which case the Host handler has already been run.
                         * We just do a friendly check if another process
                         * should now be run, then fall through to loop
                         * around: */
                        cond_resched();
                case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
                        continue;
                }

                /* If we get here, it's a trap the Guest wants to know
                 * about. */
                if (deliver_trap(lg, lg->regs->trapnum))
                        continue;

                /* If the Guest doesn't have a handler (either it hasn't
                 * registered any yet, or it's one of the faults we don't let
                 * it handle), it dies with a cryptic error message. */
                kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
                           lg->regs->trapnum, lg->regs->eip,
                           lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode);
                /* Now we deal with whatever happened to the Guest. */
                lguest_arch_handle_trap(lg);
        }

        /* The Guest is dead => "No such file or directory" */
        return -ENOENT;
}

/* Now we can look at each of the routines this calls, in increasing order of
 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
 * deliver_trap() and demand_page().  After all those, we'll be ready to
 * examine the Switcher, and our philosophical understanding of the Host/Guest
 * duality will be complete. :*/

int find_free_guest(void)
{
        unsigned int i;
        for (i = 0; i < MAX_LGUEST_GUESTS; i++)
                if (!lguests[i].tsk)
                        return i;
        return -1;
}

static void adjust_pge(void *on)
{
        if (on)
                write_cr4(read_cr4() | X86_CR4_PGE);
        else
                write_cr4(read_cr4() & ~X86_CR4_PGE);
}

/*H:000
 * Welcome to the Host!
 *
@ -701,72 +257,50 @@ static int __init init(void)
        /* First we put the Switcher up in very high virtual memory. */
        err = map_switcher();
        if (err)
                return err;
                goto out;

        /* Now we set up the pagetable implementation for the Guests. */
        err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
        if (err) {
                unmap_switcher();
                return err;
        }
        if (err)
                goto unmap;

        /* The I/O subsystem needs some things initialized. */
        lguest_io_init();
        /* We might need to reserve an interrupt vector. */
        err = init_interrupts();
        if (err)
                goto free_pgtables;

        /* /dev/lguest needs to be registered. */
        err = lguest_device_init();
        if (err) {
                free_pagetables();
                unmap_switcher();
                return err;
        }
        if (err)
                goto free_interrupts;

        /* Finally, we need to turn off "Page Global Enable".  PGE is an
         * optimization where page table entries are specially marked to show
         * they never change.  The Host kernel marks all the kernel pages this
         * way because it's always present, even when userspace is running.
         *
         * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
         * switch to the Guest kernel.  If you don't disable this on all CPUs,
         * you'll get really weird bugs that you'll chase for two days.
         *
         * I used to turn PGE off every time we switched to the Guest and back
         * on when we return, but that slowed the Switcher down noticeably. */

        /* We don't need the complexity of CPUs coming and going while we're
         * doing this. */
        lock_cpu_hotplug();
        if (cpu_has_pge) { /* We have a broader idea of "global". */
                /* Remember that this was originally set (for cleanup). */
                cpu_had_pge = 1;
                /* adjust_pge is a helper function which sets or unsets the PGE
                 * bit on its CPU, depending on the argument (0 == unset). */
                on_each_cpu(adjust_pge, (void *)0, 0, 1);
                /* Turn off the feature in the global feature set. */
                clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
        }
        unlock_cpu_hotplug();
        /* Finally we do some architecture-specific setup. */
        lguest_arch_host_init();

        /* All good! */
        return 0;

free_interrupts:
        free_interrupts();
free_pgtables:
        free_pagetables();
unmap:
        unmap_switcher();
out:
        return err;
}

/* Cleaning up is just the same code, backwards.  With a little French. */
static void __exit fini(void)
{
        lguest_device_remove();
        free_interrupts();
        free_pagetables();
        unmap_switcher();

        /* If we had PGE before we started, turn it back on now. */
        lock_cpu_hotplug();
        if (cpu_had_pge) {
                set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
                /* adjust_pge's argument "1" means set PGE. */
                on_each_cpu(adjust_pge, (void *)1, 0, 1);
        }
        unlock_cpu_hotplug();
        lguest_arch_host_fini();
}
/*:*/

/* The Host side of lguest can be a module.  This is a nice way for people to
 * play with it. */
@ -25,17 +25,13 @@
#include <linux/mm.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <irq_vectors.h>
#include "lg.h"

/*H:120 This is the core hypercall routine: where the Guest gets what it
 * wants.  Or gets killed.  Or, in the case of LHCALL_CRASH, both.
 *
 * Remember from the Guest: %eax == which call to make, and the arguments are
 * packed into %edx, %ebx and %ecx if needed. */
static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
/*H:120 This is the core hypercall routine: where the Guest gets what it wants.
 * Or gets killed.  Or, in the case of LHCALL_CRASH, both. */
static void do_hcall(struct lguest *lg, struct hcall_args *args)
{
        switch (regs->eax) {
        switch (args->arg0) {
        case LHCALL_FLUSH_ASYNC:
                /* This call does nothing, except by breaking out of the Guest
                 * it makes us process all the asynchronous hypercalls. */
@ -51,7 +47,7 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
                char msg[128];
                /* If the lgread fails, it will call kill_guest() itself; the
                 * kill_guest() with the message will be ignored. */
                lgread(lg, msg, regs->edx, sizeof(msg));
                __lgread(lg, msg, args->arg1, sizeof(msg));
                msg[sizeof(msg)-1] = '\0';
                kill_guest(lg, "CRASH: %s", msg);
                break;
@ -59,67 +55,49 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
        case LHCALL_FLUSH_TLB:
                /* FLUSH_TLB comes in two flavors, depending on the
                 * argument: */
                if (regs->edx)
                if (args->arg1)
                        guest_pagetable_clear_all(lg);
                else
                        guest_pagetable_flush_user(lg);
                break;
        case LHCALL_BIND_DMA:
                /* BIND_DMA really wants four arguments, but it's the only call
                 * which does.  So the Guest packs the number of buffers and
                 * the interrupt number into the final argument, and we decode
                 * it here.  This can legitimately fail, since we currently
                 * place a limit on the number of DMA pools a Guest can have.
                 * So we return true or false from this call. */
                regs->eax = bind_dma(lg, regs->edx, regs->ebx,
                                     regs->ecx >> 8, regs->ecx & 0xFF);
                break;

        /* All these calls simply pass the arguments through to the right
         * routines. */
        case LHCALL_SEND_DMA:
                send_dma(lg, regs->edx, regs->ebx);
                break;
        case LHCALL_LOAD_GDT:
                load_guest_gdt(lg, regs->edx, regs->ebx);
                break;
        case LHCALL_LOAD_IDT_ENTRY:
                load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
                break;
        case LHCALL_NEW_PGTABLE:
                guest_new_pagetable(lg, regs->edx);
                guest_new_pagetable(lg, args->arg1);
                break;
        case LHCALL_SET_STACK:
                guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
                guest_set_stack(lg, args->arg1, args->arg2, args->arg3);
                break;
        case LHCALL_SET_PTE:
                guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx));
                guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3));
                break;
        case LHCALL_SET_PMD:
                guest_set_pmd(lg, regs->edx, regs->ebx);
                break;
        case LHCALL_LOAD_TLS:
                guest_load_tls(lg, regs->edx);
                guest_set_pmd(lg, args->arg1, args->arg2);
                break;
        case LHCALL_SET_CLOCKEVENT:
                guest_set_clockevent(lg, regs->edx);
                guest_set_clockevent(lg, args->arg1);
                break;

        case LHCALL_TS:
                /* This sets the TS flag, as we saw used in run_guest(). */
                lg->ts = regs->edx;
                lg->ts = args->arg1;
                break;
        case LHCALL_HALT:
                /* Similarly, this sets the halted flag for run_guest(). */
                lg->halted = 1;
                break;
        case LHCALL_NOTIFY:
                lg->pending_notify = args->arg1;
                break;
        default:
                kill_guest(lg, "Bad hypercall %li\n", regs->eax);
                if (lguest_arch_do_hcall(lg, args))
                        kill_guest(lg, "Bad hypercall %li\n", args->arg0);
        }
}
/*:*/

/* Asynchronous hypercalls are easy: we just look in the array in the Guest's
 * "struct lguest_data" and see if there are any new ones marked "ready".
/*H:124 Asynchronous hypercalls are easy: we just look in the array in the
 * Guest's "struct lguest_data" to see if any new ones are marked "ready".
 *
 * We are careful to do these in order: obviously we respect the order the
 * Guest put them in the ring, but we also promise the Guest that they will
@ -134,10 +112,9 @@ static void do_async_hcalls(struct lguest *lg)
        if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
                return;

        /* We process "struct lguest_data"s hcalls[] ring once. */
        for (i = 0; i < ARRAY_SIZE(st); i++) {
                struct lguest_regs regs;
                struct hcall_args args;
                /* We remember where we were up to from last time.  This makes
                 * sure that the hypercalls are done in the order the Guest
                 * places them in the ring. */
@ -152,18 +129,16 @@ static void do_async_hcalls(struct lguest *lg)
                if (++lg->next_hcall == LHCALL_RING_SIZE)
                        lg->next_hcall = 0;

                /* We copy the hypercall arguments into a fake register
                 * structure.  This makes life simple for do_hcall(). */
                if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
                    || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
                    || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
                    || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) {
                /* Copy the hypercall arguments into a local copy of
                 * the hcall_args struct. */
                if (copy_from_user(&args, &lg->lguest_data->hcalls[n],
                                   sizeof(struct hcall_args))) {
                        kill_guest(lg, "Fetching async hypercalls");
                        break;
                }

                /* Do the hypercall, same as a normal one. */
                do_hcall(lg, &regs);
                do_hcall(lg, &args);

                /* Mark the hypercall done. */
                if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
@ -171,9 +146,9 @@ static void do_async_hcalls(struct lguest *lg)
                        break;
                }

                /* Stop doing hypercalls if we've just done a DMA to the
                 * Launcher: it needs to service this first. */
                if (lg->dma_is_pending)
                /* Stop doing hypercalls if they want to notify the Launcher:
                 * it needs to service this first. */
                if (lg->pending_notify)
                        break;
        }
}
@ -182,76 +157,35 @@ static void do_async_hcalls(struct lguest *lg)
 * Guest makes a hypercall, we end up here to set things up: */
static void initialize(struct lguest *lg)
{
        u32 tsc_speed;

        /* You can't do anything until you're initialized.  The Guest knows the
         * rules, so we're unforgiving here. */
        if (lg->regs->eax != LHCALL_LGUEST_INIT) {
                kill_guest(lg, "hypercall %li before LGUEST_INIT",
                           lg->regs->eax);
        if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
                kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0);
                return;
        }

        /* We insist that the Time Stamp Counter exist and doesn't change with
         * cpu frequency.  Some devious chip manufacturers decided that TSC
         * changes could be handled in software.  I decided that time going
         * backwards might be good for benchmarks, but it's bad for users.
         *
         * We also insist that the TSC be stable: the kernel detects unreliable
         * TSCs for its own purposes, and we use that here. */
        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
                tsc_speed = tsc_khz;
        else
                tsc_speed = 0;

        /* The pointer to the Guest's "struct lguest_data" is the only
         * argument. */
        lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
        /* If we check the address they gave is OK now, we can simply
         * copy_to_user/from_user from now on rather than using lgread/lgwrite.
         * I put this in to show that I'm not immune to writing stupid
         * optimizations. */
        if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
        if (lguest_arch_init_hypercalls(lg))
                kill_guest(lg, "bad guest page %p", lg->lguest_data);
                return;
        }

        /* The Guest tells us where we're not to deliver interrupts by putting
         * the range of addresses into "struct lguest_data". */
        if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
            || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
        /* We tell the Guest that it can't use the top 4MB of virtual
         * addresses used by the Switcher. */
            || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
            || put_user(tsc_speed, &lg->lguest_data->tsc_khz)
        /* We also give the Guest a unique id, as used in lguest_net.c. */
            || put_user(lg->guestid, &lg->lguest_data->guestid))
            || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
                kill_guest(lg, "bad guest page %p", lg->lguest_data);

        /* We write the current time into the Guest's data page once now. */
        write_timestamp(lg);

        /* page_tables.c will also do some setup. */
        page_table_guest_data_init(lg);

        /* This is the one case where the above accesses might have been the
         * first write to a Guest page.  This may have caused a copy-on-write
         * fault, but the Guest might be referring to the old (read-only)
         * page. */
        guest_pagetable_clear_all(lg);
}
/* Now we've examined the hypercall code; our Guest can make requests.  There
 * is one other way we can do things for the Guest, as we see in
 * emulate_insn(). */

/*H:110 Tricky point: we mark the hypercall as "done" once we've done it.
 * Normally we don't need to do this: the Guest will run again and update the
 * trap number before we come back around the run_guest() loop to
 * do_hypercalls().
 *
 * However, if we are signalled or the Guest sends DMA to the Launcher, that
 * loop will exit without running the Guest.  When it comes back it would try
 * to re-run the hypercall. */
static void clear_hcall(struct lguest *lg)
{
        lg->regs->trapnum = 255;
}

/*H:100
 * Hypercalls
@ -261,16 +195,12 @@ static void clear_hcall(struct lguest *lg)
 */
void do_hypercalls(struct lguest *lg)
{
        /* Not initialized yet? */
        /* Not initialized yet?  This hypercall must do it. */
        if (unlikely(!lg->lguest_data)) {
                /* Did the Guest make a hypercall?  We might have come back for
                 * some other reason (an interrupt, a different trap). */
                if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
                        /* Set up the "struct lguest_data" */
                        initialize(lg);
                        /* The hypercall is done. */
                        clear_hcall(lg);
                }
                /* Set up the "struct lguest_data" */
                initialize(lg);
                /* Hcall is done. */
                lg->hcall = NULL;
                return;
        }

@ -280,12 +210,21 @@ void do_hypercalls(struct lguest *lg)
        do_async_hcalls(lg);

        /* If we stopped reading the hypercall ring because the Guest did a
         * SEND_DMA to the Launcher, we want to return now.  Otherwise if the
         * Guest asked us to do a hypercall, we do it. */
        if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
                do_hcall(lg, lg->regs);
                /* The hypercall is done. */
                clear_hcall(lg);
         * NOTIFY to the Launcher, we want to return now.  Otherwise we do
         * the hypercall. */
        if (!lg->pending_notify) {
                do_hcall(lg, lg->hcall);
                /* Tricky point: we reset the hcall pointer to mark the
                 * hypercall as "done".  We use the hcall pointer rather than
                 * the trap number to indicate a hypercall is pending.
                 * Normally it doesn't matter: the Guest will run again and
                 * update the trap number before we come back here.
                 *
                 * However, if we are signalled or the Guest sends DMA to the
                 * Launcher, the run_guest() loop will exit without running the
                 * Guest.  When it comes back it would try to re-run the
                 * hypercall. */
                lg->hcall = NULL;
        }
}

@ -295,6 +234,6 @@ void write_timestamp(struct lguest *lg)
{
        struct timespec now;
        ktime_get_real_ts(&now);
        if (put_user(now, &lg->lguest_data->time))
        if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec)))
                kill_guest(lg, "Writing timestamp");
}
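
The put_user() to copy_to_user() switch here is presumably because put_user()
is meant for simple scalar types, while struct timespec is an aggregate of two
longs; aggregate copies belong to copy_to_user().  A throwaway userspace
illustration of the same shape of copy:

#include <string.h>
#include <time.h>

int main(void)
{
        struct timespec now = { 0, 0 };
        unsigned char page[sizeof(now)];

        /* memcpy() stands in for copy_to_user(): a sized aggregate copy,
         * rather than a typed scalar store. */
        memcpy(page, &now, sizeof(struct timespec));
        return 0;
}
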
@ -12,8 +12,14 @@
 * them first, so we also have a way of "reflecting" them into the Guest as if
 * they had been delivered to it directly. :*/
#include <linux/uaccess.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include "lg.h"

/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
static unsigned int syscall_vector = SYSCALL_VECTOR;
module_param(syscall_vector, uint, 0444);
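
As a usage note (hypothetical invocation): since the parameter is declared
with mode 0444 it can only be set at module load time, e.g. "modprobe lg
syscall_vector=64" to let a Plan 9 style guest keep its trap 64, and it is
then visible read-only under /sys/module/lg/parameters/.
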
|
||||
/* The address of the interrupt handler is split into two bits: */
|
||||
static unsigned long idt_address(u32 lo, u32 hi)
|
||||
{
|
||||
@ -39,7 +45,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
|
||||
{
|
||||
/* Stack grows upwards: move stack then write value. */
|
||||
*gstack -= 4;
|
||||
lgwrite_u32(lg, *gstack, val);
|
||||
lgwrite(lg, *gstack, u32, val);
|
||||
}
|
||||
|
||||
/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
|
||||
@ -56,8 +62,9 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
|
||||
* it). */
|
||||
static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
|
||||
{
|
||||
unsigned long gstack;
|
||||
unsigned long gstack, origstack;
|
||||
u32 eflags, ss, irq_enable;
|
||||
unsigned long virtstack;
|
||||
|
||||
/* There are two cases for interrupts: one where the Guest is already
|
||||
* in the kernel, and a more complex one where the Guest is in
|
||||
@ -65,8 +72,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
|
||||
if ((lg->regs->ss&0x3) != GUEST_PL) {
|
||||
/* The Guest told us their kernel stack with the SET_STACK
|
||||
* hypercall: both the virtual address and the segment */
|
||||
gstack = guest_pa(lg, lg->esp1);
|
||||
virtstack = lg->esp1;
|
||||
ss = lg->ss1;
|
||||
|
||||
origstack = gstack = guest_pa(lg, virtstack);
|
||||
/* We push the old stack segment and pointer onto the new
|
||||
* stack: when the Guest does an "iret" back from the interrupt
|
||||
* handler the CPU will notice they're dropping privilege
|
||||
@ -75,8 +84,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
|
||||
push_guest_stack(lg, &gstack, lg->regs->esp);
|
||||
} else {
|
||||
/* We're staying on the same Guest (kernel) stack. */
|
||||
gstack = guest_pa(lg, lg->regs->esp);
|
||||
virtstack = lg->regs->esp;
|
||||
ss = lg->regs->ss;
|
||||
|
||||
origstack = gstack = guest_pa(lg, virtstack);
|
||||
}
|
||||
|
||||
/* Remember that we never let the Guest actually disable interrupts, so
|
||||
@ -102,7 +113,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
|
||||
/* Now we've pushed all the old state, we change the stack, the code
|
||||
* segment and the address to execute. */
|
||||
lg->regs->ss = ss;
|
||||
lg->regs->esp = gstack + lg->page_offset;
|
||||
lg->regs->esp = virtstack + (gstack - origstack);
|
||||
lg->regs->cs = (__KERNEL_CS|GUEST_PL);
|
||||
lg->regs->eip = idt_address(lo, hi);
|
||||
|
||||
@ -165,7 +176,7 @@ void maybe_do_interrupt(struct lguest *lg)
	/* Look at the IDT entry the Guest gave us for this interrupt. The
	 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
	 * over them. */
	idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
	idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
	/* If they don't have a handler (yet?), we just ignore it */
	if (idt_present(idt->a, idt->b)) {
		/* OK, mark it no longer pending and deliver it. */
@ -183,6 +194,47 @@ void maybe_do_interrupt(struct lguest *lg)
	 * timer interrupt. */
	write_timestamp(lg);
}
/*:*/

/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent
 * me a patch, so we support that too. It'd be a big step for lguest if half
 * the Plan 9 user base were to start using it.
 *
 * Actually now I think of it, it's possible that Ron *is* half the Plan 9
 * userbase. Oh well. */
static bool could_be_syscall(unsigned int num)
{
	/* Normal Linux SYSCALL_VECTOR or reserved vector? */
	return num == SYSCALL_VECTOR || num == syscall_vector;
}

/* The syscall vector it wants must be unused by Host. */
bool check_syscall_vector(struct lguest *lg)
{
	u32 vector;

	if (get_user(vector, &lg->lguest_data->syscall_vec))
		return false;

	return could_be_syscall(vector);
}

int init_interrupts(void)
{
	/* If they want some strange system call vector, reserve it now */
	if (syscall_vector != SYSCALL_VECTOR
	    && test_and_set_bit(syscall_vector, used_vectors)) {
		printk("lg: couldn't reserve syscall %u\n", syscall_vector);
		return -EBUSY;
	}
	return 0;
}

void free_interrupts(void)
{
	if (syscall_vector != SYSCALL_VECTOR)
		clear_bit(syscall_vector, used_vectors);
}

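The check ties both sides of the shared page together: the Guest advertises its preferred trap in lguest_data.syscall_vec, and the Host accepts it only if could_be_syscall() agrees, ie. it is SYSCALL_VECTOR or the vector reserved above at module load. A sketch of that handshake (the Guest-side line is hypothetical early-boot code; whether the Host kills the guest with exactly this message is also an assumption):

/* Guest side (hypothetical setup code): ask for Plan 9's vector. */
lguest_data.syscall_vec = 0x40;

/* Host side: reject anything could_be_syscall() won't allow. */
if (!check_syscall_vector(lg))
	kill_guest(lg, "bad syscall vector");
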
/*H:220 Now we've got the routines to deliver interrupts, delivering traps
 * like page fault is easy. The only trick is that Intel decided that some
@ -197,14 +249,14 @@ int deliver_trap(struct lguest *lg, unsigned int num)
{
	/* Trap numbers are always 8 bit, but we set an impossible trap number
	 * for traps inside the Switcher, so check that here. */
	if (num >= ARRAY_SIZE(lg->idt))
	if (num >= ARRAY_SIZE(lg->arch.idt))
		return 0;

	/* Early on the Guest hasn't set the IDT entries (or maybe it put a
	 * bogus one in): if we fail here, the Guest will be killed. */
	if (!idt_present(lg->idt[num].a, lg->idt[num].b))
	if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))
		return 0;
	set_guest_interrupt(lg, lg->idt[num].a, lg->idt[num].b, has_err(num));
	set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num));
	return 1;
}

@ -218,28 +270,20 @@ int deliver_trap(struct lguest *lg, unsigned int num)
 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
 * the other hypervisors would tease it.
 *
 * This routine determines if a trap can be delivered directly. */
static int direct_trap(const struct lguest *lg,
		       const struct desc_struct *trap,
		       unsigned int num)
 * This routine indicates if a particular trap number could be delivered
 * directly. */
static int direct_trap(unsigned int num)
{
	/* Hardware interrupts don't go to the Guest at all (except system
	 * call). */
	if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
	if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
		return 0;

	/* The Host needs to see page faults (for shadow paging and to save the
	 * fault address), general protection faults (in/out emulation) and
	 * device not available (TS handling), and of course, the hypercall
	 * trap. */
	if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY)
		return 0;

	/* Only trap gates (type 15) can go direct to the Guest. Interrupt
	 * gates (type 14) disable interrupts as they are entered, which we
	 * never let the Guest do. Not present entries (type 0x0) also can't
	 * go direct, of course 8) */
	return idt_type(trap->a, trap->b) == 0xF;
	return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
}
/*:*/

@ -348,15 +392,11 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
	 * to copy this again. */
	lg->changed |= CHANGED_IDT;

	/* The IDT which we keep in "struct lguest" only contains 32 entries
	 * for the traps and LGUEST_IRQS (32) entries for interrupts. We
	 * ignore attempts to set handlers for higher interrupt numbers, except
	 * for the system call "interrupt" at 128: we have a special IDT entry
	 * for that. */
	if (num < ARRAY_SIZE(lg->idt))
		set_trap(lg, &lg->idt[num], num, lo, hi);
	else if (num == SYSCALL_VECTOR)
		set_trap(lg, &lg->syscall_idt, num, lo, hi);
	/* Check that the Guest doesn't try to step outside the bounds. */
	if (num >= ARRAY_SIZE(lg->arch.idt))
		kill_guest(lg, "Setting idt entry %u", num);
	else
		set_trap(lg, &lg->arch.idt[num], num, lo, hi);
}

/* The default entry for each interrupt points into the Switcher routines which
@ -399,20 +439,21 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,

	/* We can simply copy the direct traps, otherwise we use the default
	 * ones in the Switcher: they will return to the Host. */
	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
		if (direct_trap(lg, &lg->idt[i], i))
			idt[i] = lg->idt[i];
	for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) {
		/* If no Guest can ever override this trap, leave it alone. */
		if (!direct_trap(i))
			continue;

		/* Only trap gates (type 15) can go direct to the Guest.
		 * Interrupt gates (type 14) disable interrupts as they are
		 * entered, which we never let the Guest do. Not present
		 * entries (type 0x0) also can't go direct, of course. */
		if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF)
			idt[i] = lg->arch.idt[i];
		else
			/* Reset it to the default. */
			default_idt_entry(&idt[i], i, def[i]);
	}

	/* Don't forget the system call trap! The IDT entries for other
	 * interrupts never change, so no need to copy them. */
	i = SYSCALL_VECTOR;
	if (direct_trap(lg, &lg->syscall_idt, i))
		idt[i] = lg->syscall_idt;
	else
		default_idt_entry(&idt[i], i, def[i]);
}

void guest_set_clockevent(struct lguest *lg, unsigned long delta)

@ -1,626 +0,0 @@
/*P:300 The I/O mechanism in lguest is simple yet flexible, allowing the Guest
 * to talk to the Launcher or directly to another Guest. It uses familiar
 * concepts of DMA and interrupts, plus some neat code stolen from
 * futexes... :*/

/* Copyright (C) 2006 Rusty Russell IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <linux/types.h>
#include <linux/futex.h>
#include <linux/jhash.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/uaccess.h>
#include "lg.h"

/*L:300
 * I/O
 *
 * Getting data in and out of the Guest is quite an art. There are numerous
 * ways to do it, and they all suck differently. We try to keep things fairly
 * close to "real" hardware so our Guest's drivers don't look like an alien
 * visitation in the middle of the Linux code, and yet make sure that Guests
 * can talk directly to other Guests, not just the Launcher.
 *
 * To do this, the Guest gives us a key when it binds or sends DMA buffers.
 * The key corresponds to a "physical" address inside the Guest (ie. a virtual
 * address inside the Launcher process). We don't, however, use this key
 * directly.
 *
 * We want Guests which share memory to be able to DMA to each other: two
 * Launchers can mmap the same file, then the Guests can communicate.
 * Fortunately, the futex code provides us with a way to get a "union
 * futex_key" corresponding to the memory lying at a virtual address: if the
 * two processes share memory, the "union futex_key" for that memory will match
 * even if the memory is mapped at different addresses in each. So we always
 * convert the keys to "union futex_key"s to compare them.
 *
 * Before we dive into this though, we need to look at another set of helper
 * routines used throughout the Host kernel code to access Guest memory.
 :*/
static struct list_head dma_hash[61];

/* An unfortunate side effect of the Linux double-linked list implementation is
 * that there's no good way to statically initialize an array of linked
 * lists. */
void lguest_io_init(void)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
		INIT_LIST_HEAD(&dma_hash[i]);
}

/* FIXME: allow multi-page lengths. */
static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
{
	unsigned int i;

	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
		if (!dma->len[i])
			return 1;
		if (!lguest_address_ok(lg, dma->addr[i], dma->len[i]))
			goto kill;
		if (dma->len[i] > PAGE_SIZE)
			goto kill;
		/* We could do over a page, but is it worth it? */
		if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
			goto kill;
	}
	return 1;

kill:
	kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]);
	return 0;
}

/*L:330 This is our hash function, using the wonderful Jenkins hash.
 *
 * The futex key is a union with three parts: an unsigned long word, a pointer,
 * and an int "offset". We could use jhash_2words() which takes three u32s.
 * (Ok, the hash functions are great: the naming sucks though).
 *
 * It's nice to be portable to 64-bit platforms, so we use the more generic
 * jhash2(), which takes an array of u32, the number of u32s, and an initial
 * u32 to roll in. This is uglier, but breaks down to almost the same code on
 * 32-bit platforms like this one.
 *
 * We want a position in the array, so we modulo ARRAY_SIZE(dma_hash) (ie. 61).
 */
static unsigned int hash(const union futex_key *key)
{
	return jhash2((u32*)&key->both.word,
		      (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
		      key->both.offset)
		% ARRAY_SIZE(dma_hash);
}

/* This is a convenience routine to compare two keys. It's a much bemoaned C
 * weakness that it doesn't allow '==' on structures or unions, so we have to
 * open-code it like this. */
static inline int key_eq(const union futex_key *a, const union futex_key *b)
{
	return (a->both.word == b->both.word
		&& a->both.ptr == b->both.ptr
		&& a->both.offset == b->both.offset);
}

/*L:360 OK, when we need to actually free up a Guest's DMA array we do several
 * things, so we have a convenient function to do it.
 *
 * The caller must hold a read lock on dmainfo owner's current->mm->mmap_sem
 * for the drop_futex_key_refs(). */
static void unlink_dma(struct lguest_dma_info *dmainfo)
{
	/* You locked this too, right? */
	BUG_ON(!mutex_is_locked(&lguest_lock));
	/* This is how we know that the entry is free. */
	dmainfo->interrupt = 0;
	/* Remove it from the hash table. */
	list_del(&dmainfo->list);
	/* Drop the references we were holding (to the inode or mm). */
	drop_futex_key_refs(&dmainfo->key);
}

/*L:350 This is the routine which we call when the Guest asks to unregister a
 * DMA array attached to a given key. Returns true if the array was found. */
static int unbind_dma(struct lguest *lg,
		      const union futex_key *key,
		      unsigned long dmas)
{
	int i, ret = 0;

	/* We don't bother with the hash table, just look through all this
	 * Guest's DMA arrays. */
	for (i = 0; i < LGUEST_MAX_DMA; i++) {
		/* In theory it could have more than one array on the same key,
		 * or one array on multiple keys, so we check both */
		if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
			unlink_dma(&lg->dma[i]);
			ret = 1;
			break;
		}
	}
	return ret;
}

/*L:340 BIND_DMA: this is the hypercall which sets up an array of "struct
 * lguest_dma" for receiving I/O.
 *
 * The Guest wants to bind an array of "struct lguest_dma"s to a particular key
 * to receive input. This only happens when the Guest is setting up a new
 * device, so it doesn't have to be very fast.
 *
 * It returns 1 on a successful registration (it can fail if we hit the limit
 * of registrations for this Guest).
 */
int bind_dma(struct lguest *lg,
	     unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt)
{
	unsigned int i;
	int ret = 0;
	union futex_key key;
	/* Futex code needs the mmap_sem. */
	struct rw_semaphore *fshared = &current->mm->mmap_sem;

	/* Invalid interrupt? (We could kill the guest here). */
	if (interrupt >= LGUEST_IRQS)
		return 0;

	/* We need to grab the Big Lguest Lock, because other Guests may be
	 * trying to look through this Guest's DMAs to send something while
	 * we're doing this. */
	mutex_lock(&lguest_lock);
	down_read(fshared);
	if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
		kill_guest(lg, "bad dma key %#lx", ukey);
		goto unlock;
	}

	/* We want to keep this key valid once we drop mmap_sem, so we have to
	 * hold a reference. */
	get_futex_key_refs(&key);

	/* If the Guest specified an interrupt of 0, that means they want to
	 * unregister this array of "struct lguest_dma"s. */
	if (interrupt == 0)
		ret = unbind_dma(lg, &key, dmas);
	else {
		/* Look through this Guest's dma array for an unused entry. */
		for (i = 0; i < LGUEST_MAX_DMA; i++) {
			/* If the interrupt is non-zero, the entry is already
			 * used. */
			if (lg->dma[i].interrupt)
				continue;

			/* OK, a free one! Fill in our details. */
			lg->dma[i].dmas = dmas;
			lg->dma[i].num_dmas = numdmas;
			lg->dma[i].next_dma = 0;
			lg->dma[i].key = key;
			lg->dma[i].guestid = lg->guestid;
			lg->dma[i].interrupt = interrupt;

			/* Now we add it to the hash table: the position
			 * depends on the futex key that we got. */
			list_add(&lg->dma[i].list, &dma_hash[hash(&key)]);
			/* Success! */
			ret = 1;
			goto unlock;
		}
	}
	/* If we didn't find a slot to put the key in, drop the reference
	 * again. */
	drop_futex_key_refs(&key);
unlock:
	/* Unlock and out. */
	up_read(fshared);
	mutex_unlock(&lguest_lock);
	return ret;
}

/*L:385 Note that our routines to access a different Guest's memory are called
 * lgread_other() and lgwrite_other(): these names emphasize that they are only
 * used when the Guest is *not* the current Guest.
 *
 * The interface for copying from another process's memory is called
 * access_process_vm(), with a final argument of 0 for a read, and 1 for a
 * write.
 *
 * We need lgread_other() to read the destination Guest's "struct lguest_dma"
 * array. */
static int lgread_other(struct lguest *lg,
			void *buf, u32 addr, unsigned bytes)
{
	if (!lguest_address_ok(lg, addr, bytes)
	    || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
		memset(buf, 0, bytes);
		kill_guest(lg, "bad address in registered DMA struct");
		return 0;
	}
	return 1;
}

/* "lgwrite()" to another Guest: used to update the destination "used_len" once
 * we've transferred data into the buffer. */
static int lgwrite_other(struct lguest *lg, u32 addr,
			 const void *buf, unsigned bytes)
{
	if (!lguest_address_ok(lg, addr, bytes)
	    || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
		!= bytes)) {
		kill_guest(lg, "bad address writing to registered DMA");
		return 0;
	}
	return 1;
}

/*L:400 This is the generic engine which copies from a source "struct
 * lguest_dma" from this Guest into another Guest's "struct lguest_dma". The
 * destination Guest's pages have already been mapped, as contained in the
 * pages array.
 *
 * If you're wondering if there's a nice "copy from one process to another"
 * routine, so was I. But Linux isn't really set up to copy between two
 * unrelated processes, so we have to write it ourselves.
 */
static u32 copy_data(struct lguest *srclg,
		     const struct lguest_dma *src,
		     const struct lguest_dma *dst,
		     struct page *pages[])
{
	unsigned int totlen, si, di, srcoff, dstoff;
	void *maddr = NULL;

	/* We return the total length transferred. */
	totlen = 0;

	/* We keep indexes into the source and destination "struct lguest_dma",
	 * and an offset within each region. */
	si = di = 0;
	srcoff = dstoff = 0;

	/* We loop until the source or destination is exhausted. */
	while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
	       && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
		/* We can only transfer the rest of the src buffer, or as much
		 * as will fit into the destination buffer. */
		u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);

		/* For systems using "highmem" we need to use kmap() to access
		 * the page we want. We often use the same page over and over,
		 * so rather than kmap() it on every loop, we set the maddr
		 * pointer to NULL when we need to move to the next
		 * destination page. */
		if (!maddr)
			maddr = kmap(pages[di]);

		/* Copy directly from (this Guest's) source address to the
		 * destination Guest's kmap()ed buffer. Note that maddr points
		 * to the start of the page: we need to add the offset of the
		 * destination address and offset within the buffer. */

		/* FIXME: This is not completely portable. I looked at
		 * copy_to_user_page(), and some arch's seem to need special
		 * flushes. x86 is fine. */
		if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
				   (void __user *)src->addr[si], len) != 0) {
			/* If a copy failed, it's the source's fault. */
			kill_guest(srclg, "bad address in sending DMA");
			totlen = 0;
			break;
		}

		/* Increment the total and src & dst offsets */
		totlen += len;
		srcoff += len;
		dstoff += len;

		/* Presumably we reached the end of the src or dest buffers: */
		if (srcoff == src->len[si]) {
			/* Move to the next buffer at offset 0 */
			si++;
			srcoff = 0;
		}
		if (dstoff == dst->len[di]) {
			/* We need to unmap that destination page and reset
			 * maddr ready for the next one. */
			kunmap(pages[di]);
			maddr = NULL;
			di++;
			dstoff = 0;
		}
	}

	/* If we still had a page mapped at the end, unmap now. */
	if (maddr)
		kunmap(pages[di]);

	return totlen;
}

/*L:390 This is how we transfer a "struct lguest_dma" from the source Guest
 * (the current Guest which called SEND_DMA) to another Guest. */
static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
		  struct lguest *dstlg, const struct lguest_dma *dst)
{
	int i;
	u32 ret;
	struct page *pages[LGUEST_MAX_DMA_SECTIONS];

	/* We check that both source and destination "struct lguest_dma"s are
	 * within the bounds of the source and destination Guests */
	if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
		return 0;

	/* We need to map the pages which correspond to each part of the
	 * destination buffer. */
	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
		if (dst->len[i] == 0)
			break;
		/* get_user_pages() is a complicated function, especially since
		 * we only want a single page. But it works, and returns the
		 * number of pages. Note that we're holding the destination's
		 * mmap_sem, as get_user_pages() requires. */
		if (get_user_pages(dstlg->tsk, dstlg->mm,
				   dst->addr[i], 1, 1, 1, pages+i, NULL)
		    != 1) {
			/* This means the destination gave us a bogus buffer */
			kill_guest(dstlg, "Error mapping DMA pages");
			ret = 0;
			goto drop_pages;
		}
	}

	/* Now copy the data until we run out of src or dst. */
	ret = copy_data(srclg, src, dst, pages);

drop_pages:
	while (--i >= 0)
		put_page(pages[i]);
	return ret;
}

/*L:380 Transferring data from one Guest to another is not as simple as I'd
 * like. We've found the "struct lguest_dma_info" bound to the same address as
 * the send, and we need to copy into it.
 *
 * This function returns true if the destination array was empty. */
static int dma_transfer(struct lguest *srclg,
			unsigned long udma,
			struct lguest_dma_info *dst)
{
	struct lguest_dma dst_dma, src_dma;
	struct lguest *dstlg;
	u32 i, dma = 0;

	/* From the "struct lguest_dma_info" we found in the hash, grab the
	 * Guest. */
	dstlg = &lguests[dst->guestid];
	/* Read in the source "struct lguest_dma" handed to SEND_DMA. */
	lgread(srclg, &src_dma, udma, sizeof(src_dma));

	/* We need the destination's mmap_sem, and we already hold the source's
	 * mmap_sem for the futex key lookup. Normally this would suggest that
	 * we could deadlock if the destination Guest was trying to send to
	 * this source Guest at the same time, which is another reason that all
	 * I/O is done under the big lguest_lock. */
	down_read(&dstlg->mm->mmap_sem);

	/* Look through the destination DMA array for an available buffer. */
	for (i = 0; i < dst->num_dmas; i++) {
		/* We keep a "next_dma" pointer which often helps us avoid
		 * looking at lots of previously-filled entries. */
		dma = (dst->next_dma + i) % dst->num_dmas;
		if (!lgread_other(dstlg, &dst_dma,
				  dst->dmas + dma * sizeof(struct lguest_dma),
				  sizeof(dst_dma))) {
			goto fail;
		}
		if (!dst_dma.used_len)
			break;
	}

	/* If we found a buffer, we do the actual data copy. */
	if (i != dst->num_dmas) {
		unsigned long used_lenp;
		unsigned int ret;

		ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
		/* Put used length in the source "struct lguest_dma"'s used_len
		 * field. It's a little tricky to figure out where that is,
		 * though. */
		lgwrite_u32(srclg,
			    udma+offsetof(struct lguest_dma, used_len), ret);
		/* Transferring 0 bytes is OK if the source buffer was empty. */
		if (ret == 0 && src_dma.len[0] != 0)
			goto fail;

		/* The destination Guest might be running on a different CPU:
		 * we have to make sure that it will see the "used_len" field
		 * change to non-zero *after* it sees the data we copied into
		 * the buffer. Hence a write memory barrier. */
		wmb();
		/* Figuring out where the destination's used_len field for this
		 * "struct lguest_dma" lives in the array is also a little ugly. */
		used_lenp = dst->dmas
			+ dma * sizeof(struct lguest_dma)
			+ offsetof(struct lguest_dma, used_len);
		lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
		/* Move the cursor for next time. */
		dst->next_dma++;
	}
	up_read(&dstlg->mm->mmap_sem);

	/* We trigger the destination interrupt, even if the destination was
	 * empty and we didn't transfer anything: this gives them a chance to
	 * wake up and refill. */
	set_bit(dst->interrupt, dstlg->irqs_pending);
	/* Wake up the destination process. */
	wake_up_process(dstlg->tsk);
	/* If we passed the last "struct lguest_dma", the receive had no
	 * buffers left. */
	return i == dst->num_dmas;

fail:
	up_read(&dstlg->mm->mmap_sem);
	return 0;
}

/*L:370 This is the counterpart to the BIND_DMA hypercall: the SEND_DMA
 * hypercall. We find out who's listening, and send to them. */
void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma)
{
	union futex_key key;
	int empty = 0;
	struct rw_semaphore *fshared = &current->mm->mmap_sem;

again:
	mutex_lock(&lguest_lock);
	down_read(fshared);
	/* Get the futex key for the key the Guest gave us */
	if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
		kill_guest(lg, "bad sending DMA key");
		goto unlock;
	}
	/* Since the key must be a multiple of 4, the futex key uses the lower
	 * bit of the "offset" field (which would always be 0) to indicate a
	 * mapping which is shared with other processes (ie. Guests). */
	if (key.shared.offset & 1) {
		struct lguest_dma_info *i;
		/* Look through the hash for other Guests. */
		list_for_each_entry(i, &dma_hash[hash(&key)], list) {
			/* Don't send to ourselves. */
			if (i->guestid == lg->guestid)
				continue;
			if (!key_eq(&key, &i->key))
				continue;

			/* If dma_transfer() tells us the destination has no
			 * available buffers, we increment "empty". */
			empty += dma_transfer(lg, udma, i);
			break;
		}
		/* If the destination is empty, we release our locks and
		 * give the destination Guest a brief chance to restock. */
		if (empty == 1) {
			/* Give any recipients one chance to restock. */
			up_read(&current->mm->mmap_sem);
			mutex_unlock(&lguest_lock);
			/* Next time, we won't try again. */
			empty++;
			goto again;
		}
	} else {
		/* Private mapping: Guest is sending to its Launcher. We set
		 * the "dma_is_pending" flag so that the main loop will exit
		 * and the Launcher's read() from /dev/lguest will return. */
		lg->dma_is_pending = 1;
		lg->pending_dma = udma;
		lg->pending_key = ukey;
	}
unlock:
	up_read(fshared);
	mutex_unlock(&lguest_lock);
}
/*:*/

void release_all_dma(struct lguest *lg)
{
	unsigned int i;

	BUG_ON(!mutex_is_locked(&lguest_lock));

	down_read(&lg->mm->mmap_sem);
	for (i = 0; i < LGUEST_MAX_DMA; i++) {
		if (lg->dma[i].interrupt)
			unlink_dma(&lg->dma[i]);
	}
	up_read(&lg->mm->mmap_sem);
}

/*M:007 We only return a single DMA buffer to the Launcher, but it would be
 * more efficient to return a pointer to the entire array of DMA buffers, which
 * it can cache and choose one whenever it wants.
 *
 * Currently the Launcher uses a write to /dev/lguest, and the return value is
 * the address of the DMA structure with the interrupt number placed in
 * dma->used_len. If we wanted to return the entire array, we'd need to return
 * the address, array size and interrupt number: this seems to require an
 * ioctl(). :*/

/*L:320 This routine looks for a DMA buffer registered by the Guest on the
 * given key (using the BIND_DMA hypercall). */
unsigned long get_dma_buffer(struct lguest *lg,
			     unsigned long ukey, unsigned long *interrupt)
{
	unsigned long ret = 0;
	union futex_key key;
	struct lguest_dma_info *i;
	struct rw_semaphore *fshared = &current->mm->mmap_sem;

	/* Take the Big Lguest Lock to stop other Guests sending this Guest DMA
	 * at the same time. */
	mutex_lock(&lguest_lock);
	/* To match between Guests sharing the same underlying memory we steal
	 * code from the futex infrastructure. This requires that we hold the
	 * "mmap_sem" for our process (the Launcher), and pass it to the futex
	 * code. */
	down_read(fshared);

	/* This can fail if it's not a valid address, or if the address is not
	 * divisible by 4 (the futex code needs that, we don't really). */
	if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) {
		kill_guest(lg, "bad registered DMA buffer");
		goto unlock;
	}
	/* Search the hash table for matching entries (the Launcher can only
	 * send to its own Guest for the moment, so the entry must be for this
	 * Guest) */
	list_for_each_entry(i, &dma_hash[hash(&key)], list) {
		if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
			unsigned int j;
			/* Look through the registered DMA array for an
			 * available buffer. */
			for (j = 0; j < i->num_dmas; j++) {
				struct lguest_dma dma;

				ret = i->dmas + j * sizeof(struct lguest_dma);
				lgread(lg, &dma, ret, sizeof(dma));
				if (dma.used_len == 0)
					break;
			}
			/* Store the interrupt the Guest wants when the buffer
			 * is used. */
			*interrupt = i->interrupt;
			break;
		}
	}
unlock:
	up_read(fshared);
	mutex_unlock(&lguest_lock);
	return ret;
}
/*:*/

/*L:410 This really has completed the Launcher. Not only have we now finished
 * the longest chapter in our journey, but this also means we are over halfway
 * through!
 *
 * Enough prevaricating around the bush: it is time for us to dive into the
 * core of the Host, in "make Host".
 */

@ -1,119 +1,25 @@
#ifndef _LGUEST_H
#define _LGUEST_H

#include <asm/desc.h>

#define GDT_ENTRY_LGUEST_CS	10
#define GDT_ENTRY_LGUEST_DS	11
#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)

#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/init.h>
#include <linux/stringify.h>
#include <linux/binfmts.h>
#include <linux/futex.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <asm/semaphore.h>
#include "irq_vectors.h"

#define GUEST_PL 1

struct lguest_regs
{
	/* Manually saved part. */
	unsigned long ebx, ecx, edx;
	unsigned long esi, edi, ebp;
	unsigned long gs;
	unsigned long eax;
	unsigned long fs, ds, es;
	unsigned long trapnum, errcode;
	/* Trap pushed part */
	unsigned long eip;
	unsigned long cs;
	unsigned long eflags;
	unsigned long esp;
	unsigned long ss;
};
#include <asm/lguest.h>

void free_pagetables(void);
int init_pagetables(struct page **switcher_page, unsigned int pages);

/* Full 4G segment descriptors, suitable for CS and DS. */
#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})

struct lguest_dma_info
{
	struct list_head list;
	union futex_key key;
	unsigned long dmas;
	u16 next_dma;
	u16 num_dmas;
	u16 guestid;
	u8 interrupt; 	/* 0 when not registered */
};

/*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He
 * reviewed the original code which used "u32" for all page table entries, and
 * insisted that it would be far clearer with explicit typing. I thought it
 * was overkill, but he was right: it is much clearer than it was before.
 *
 * We have separate types for the Guest's ptes & pgds and the shadow ptes &
 * pgds. There's already a Linux type for these (pte_t and pgd_t) but they
 * change depending on kernel config options (PAE). */

/* Each entry is identical: lower 12 bits of flags and upper 20 bits for the
 * "page frame number" (0 == first physical page, etc). They are different
 * types so the compiler will warn us if we mix them improperly. */
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} spgd_t;
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} spte_t;
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} gpgd_t;
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} gpte_t;

/* We have two convenient macros to convert a "raw" value as handed to us by
 * the Guest into the correct Guest PGD or PTE type. */
#define mkgpte(_val) ((gpte_t){.raw.val = _val})
#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
/*:*/

struct pgdir
{
	unsigned long cr3;
	spgd_t *pgdir;
};

/* This is a guest-specific page (mapped ro) into the guest. */
struct lguest_ro_state
{
	/* Host information we need to restore when we switch back. */
	u32 host_cr3;
	struct Xgt_desc_struct host_idt_desc;
	struct Xgt_desc_struct host_gdt_desc;
	u32 host_sp;

	/* Fields which are used when guest is running. */
	struct Xgt_desc_struct guest_idt_desc;
	struct Xgt_desc_struct guest_gdt_desc;
	struct i386_hw_tss guest_tss;
	struct desc_struct guest_idt[IDT_ENTRIES];
	struct desc_struct guest_gdt[GDT_ENTRIES];
	unsigned long gpgdir;
	pgd_t *pgdir;
};

/* We have two pages shared with guests, per cpu.  */
@ -141,9 +47,11 @@ struct lguest
	struct lguest_data __user *lguest_data;
	struct task_struct *tsk;
	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
	u16 guestid;
	u32 pfn_limit;
	u32 page_offset;
	/* This provides the offset to the base of guest-physical
	 * memory in the Launcher. */
	void __user *mem_base;
	unsigned long kernel_address;
	u32 cr2;
	int halted;
	int ts;
@ -151,6 +59,9 @@ struct lguest
	u32 esp1;
	u8 ss1;

	/* If a hypercall was asked for, this points to the arguments. */
	struct hcall_args *hcall;

	/* Do we need to stop what we're doing and return to userspace? */
	int break_out;
	wait_queue_head_t break_wq;
@ -167,24 +78,15 @@ struct lguest
	struct task_struct *wake;

	unsigned long noirq_start, noirq_end;
	int dma_is_pending;
	unsigned long pending_dma; /* struct lguest_dma */
	unsigned long pending_key; /* address they're sending to */
	unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */

	unsigned int stack_pages;
	u32 tsc_khz;

	struct lguest_dma_info dma[LGUEST_MAX_DMA];

	/* Dead? */
	const char *dead;

	/* The GDT entries copied into lguest_ro_state when running. */
	struct desc_struct gdt[GDT_ENTRIES];

	/* The IDT entries: some copied into lguest_ro_state when running. */
	struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS];
	struct desc_struct syscall_idt;
	struct lguest_arch arch;

	/* Virtual clock device */
	struct hrtimer hrt;
@ -193,19 +95,38 @@ struct lguest
	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
};

extern struct lguest lguests[];
extern struct mutex lguest_lock;

/* core.c: */
u32 lgread_u32(struct lguest *lg, unsigned long addr);
void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val);
void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len);
void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len);
int find_free_guest(void);
int lguest_address_ok(const struct lguest *lg,
		      unsigned long addr, unsigned long len);
void __lgread(struct lguest *, void *, unsigned long, unsigned);
void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);

/*L:306 Using memory-copy operations like that is usually inconvenient, so we
 * have the following helper macros which read and write a specific type (often
 * an unsigned long).
 *
 * This reads into a variable of the given type then returns that. */
#define lgread(lg, addr, type)						\
	({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; })

/* This checks that the variable is of the given type, then writes it out. */
#define lgwrite(lg, addr, type, val)				\
	do {							\
		typecheck(type, val);				\
		__lgwrite((lg), (addr), &(val), sizeof(val));	\
	} while(0)
/* (end of memory access helper routines) :*/
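Call sites elsewhere in this patch (push_guest_stack() above, for instance) all follow the same shape; a minimal usage sketch with made-up names:

/* Read a u32 from Guest address "addr", bump it, write it back.  lgread()
 * is a statement expression yielding the value; lgwrite() typecheck()s
 * "val" against the named type at compile time, so passing a u64 here
 * would fail to build rather than silently write the wrong width. */
u32 val = lgread(lg, addr, u32);
lgwrite(lg, addr, u32, val + 1);
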
|
||||
int run_guest(struct lguest *lg, unsigned long __user *user);
|
||||
|
||||
/* Helper macros to obtain the first 12 or the last 20 bits, this is only the
|
||||
* first step in the migration to the kernel types. pte_pfn is already defined
|
||||
* in the kernel. */
|
||||
#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
|
||||
#define pte_flags(x) (pte_val(x) & ~PAGE_MASK)
|
||||
#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
|
||||
|
||||
/* interrupts_and_traps.c: */
|
||||
void maybe_do_interrupt(struct lguest *lg);
|
||||
@ -219,6 +140,9 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
|
||||
const unsigned long *def);
|
||||
void guest_set_clockevent(struct lguest *lg, unsigned long delta);
|
||||
void init_clockdev(struct lguest *lg);
|
||||
bool check_syscall_vector(struct lguest *lg);
|
||||
int init_interrupts(void);
|
||||
void free_interrupts(void);
|
||||
|
||||
/* segments.c: */
|
||||
void setup_default_gdt_entries(struct lguest_ro_state *state);
|
||||
@ -232,28 +156,33 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt);
|
||||
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
|
||||
void free_guest_pagetable(struct lguest *lg);
|
||||
void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
|
||||
void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i);
|
||||
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
|
||||
void guest_pagetable_clear_all(struct lguest *lg);
|
||||
void guest_pagetable_flush_user(struct lguest *lg);
|
||||
void guest_set_pte(struct lguest *lg, unsigned long cr3,
|
||||
unsigned long vaddr, gpte_t val);
|
||||
void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
|
||||
unsigned long vaddr, pte_t val);
|
||||
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
|
||||
int demand_page(struct lguest *info, unsigned long cr2, int errcode);
|
||||
void pin_page(struct lguest *lg, unsigned long vaddr);
|
||||
unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
|
||||
void page_table_guest_data_init(struct lguest *lg);
|
||||
|
||||
/* <arch>/core.c: */
|
||||
void lguest_arch_host_init(void);
|
||||
void lguest_arch_host_fini(void);
|
||||
void lguest_arch_run_guest(struct lguest *lg);
|
||||
void lguest_arch_handle_trap(struct lguest *lg);
|
||||
int lguest_arch_init_hypercalls(struct lguest *lg);
|
||||
int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args);
|
||||
void lguest_arch_setup_regs(struct lguest *lg, unsigned long start);
|
||||
|
||||
/* <arch>/switcher.S: */
|
||||
extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
|
||||
|
||||
/* lguest_user.c: */
|
||||
int lguest_device_init(void);
|
||||
void lguest_device_remove(void);
|
||||
|
||||
/* io.c: */
|
||||
void lguest_io_init(void);
|
||||
int bind_dma(struct lguest *lg,
|
||||
unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt);
|
||||
void send_dma(struct lguest *info, unsigned long key, unsigned long udma);
|
||||
void release_all_dma(struct lguest *lg);
|
||||
unsigned long get_dma_buffer(struct lguest *lg, unsigned long key,
|
||||
unsigned long *interrupt);
|
||||
|
||||
/* hypercalls.c: */
|
||||
void do_hypercalls(struct lguest *lg);
|
||||
void write_timestamp(struct lguest *lg);
|
||||
@ -292,9 +221,5 @@ do { \
|
||||
} while(0)
|
||||
/* (End of aside) :*/
|
||||
|
||||
static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
|
||||
{
|
||||
return vaddr - lg->page_offset;
|
||||
}
|
||||
#endif /* __ASSEMBLY__ */
|
||||
#endif /* _LGUEST_H */
|
||||
|
@ -1,218 +0,0 @@
/*P:050 Lguest guests use a very simple bus for devices. It's a simple array
 * of device descriptors contained just above the top of normal memory. The
 * lguest bus is 80% tedious boilerplate code. :*/
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/lguest_bus.h>
#include <asm/io.h>
#include <asm/paravirt.h>

static ssize_t type_show(struct device *_dev,
			 struct device_attribute *attr, char *buf)
{
	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
	return sprintf(buf, "%hu", lguest_devices[dev->index].type);
}
static ssize_t features_show(struct device *_dev,
			     struct device_attribute *attr, char *buf)
{
	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
	return sprintf(buf, "%hx", lguest_devices[dev->index].features);
}
static ssize_t pfn_show(struct device *_dev,
			struct device_attribute *attr, char *buf)
{
	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
	return sprintf(buf, "%u", lguest_devices[dev->index].pfn);
}
static ssize_t status_show(struct device *_dev,
			   struct device_attribute *attr, char *buf)
{
	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
	return sprintf(buf, "%hx", lguest_devices[dev->index].status);
}
static ssize_t status_store(struct device *_dev, struct device_attribute *attr,
			    const char *buf, size_t count)
{
	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
	if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1)
		return -EINVAL;
	return count;
}
static struct device_attribute lguest_dev_attrs[] = {
	__ATTR_RO(type),
	__ATTR_RO(features),
	__ATTR_RO(pfn),
	__ATTR(status, 0644, status_show, status_store),
	__ATTR_NULL
};

/*D:130 The generic bus infrastructure requires a function which says whether a
 * device matches a driver. For us, it is simple: "struct lguest_driver"
 * contains a "device_type" field which indicates what type of device it can
 * handle, so we just cast the args and compare: */
static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
{
	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
	struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv);

	return (drv->device_type == lguest_devices[dev->index].type);
}
/*:*/

struct lguest_bus {
	struct bus_type bus;
	struct device dev;
};

static struct lguest_bus lguest_bus = {
	.bus = {
		.name  = "lguest",
		.match = lguest_dev_match,
		.dev_attrs = lguest_dev_attrs,
	},
	.dev = {
		.parent = NULL,
		.bus_id = "lguest",
	}
};

/*D:140 This is the callback which occurs once the bus infrastructure matches
 * up a device and driver, ie. in response to add_lguest_device() calling
 * device_register(), or register_lguest_driver() calling driver_register().
 *
 * At the moment it's always the latter: the devices are added first, since
 * scan_devices() is called from a "core_initcall", and the drivers themselves
 * called later as a normal "initcall". But it would work the other way too.
 *
 * So now we have the happy couple, we add the status bit to indicate that we
 * found a driver. If the driver truly loves the device, it will return
 * happiness from its probe function (ok, perhaps this wasn't my greatest
 * analogy), and we set the final "driver ok" bit so the Host sees it's all
 * green. */
static int lguest_dev_probe(struct device *_dev)
{
	int ret;
	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
	struct lguest_driver *drv = container_of(dev->dev.driver,
						 struct lguest_driver, drv);

	lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
	ret = drv->probe(dev);
	if (ret == 0)
		lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK;
	return ret;
}

/* The last part of the bus infrastructure is the function lguest drivers use
 * to register themselves. Firstly, we do nothing if there's no lguest bus
 * (ie. this is not a Guest), otherwise we fill in the embedded generic "struct
 * driver" fields and call the generic driver_register(). */
int register_lguest_driver(struct lguest_driver *drv)
{
	if (!lguest_devices)
		return 0;

	drv->drv.bus = &lguest_bus.bus;
	drv->drv.name = drv->name;
	drv->drv.owner = drv->owner;
	drv->drv.probe = lguest_dev_probe;

	return driver_register(&drv->drv);
}

/* At the moment we build all the drivers into the kernel because they're so
 * simple: 8144 bytes for all three of them as I type this. And as the console
 * really needs to be built in, it's actually only 3527 bytes for the network
 * and block drivers.
 *
 * If they get complex it will make sense for them to be modularized, so we
 * need to explicitly export the symbol.
 *
 * I don't think non-GPL modules make sense, so it's a GPL-only export.
 */
EXPORT_SYMBOL_GPL(register_lguest_driver);

/*D:120 This is the core of the lguest bus: actually adding a new device.
 * It's a separate function because it's neater that way, and because an
 * earlier version of the code supported hotplug and unplug. They were removed
 * early on because they were never used.
 *
 * As Andrew Tridgell says, "Untested code is buggy code".
 *
 * It's worth reading this carefully: we start with an index into the array of
 * "struct lguest_device_desc"s indicating the device which is new: */
static void add_lguest_device(unsigned int index)
{
	struct lguest_device *new;

	/* Each "struct lguest_device_desc" has a "status" field, which the
	 * Guest updates as the device is probed. In the worst case, the Host
	 * can look at these bits to tell what part of device setup failed,
	 * even if the console isn't available. */
	lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
	new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
	if (!new) {
		printk(KERN_EMERG "Cannot allocate lguest device %u\n", index);
		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
		return;
	}

	/* The "struct lguest_device" setup is pretty straight-forward example
	 * code. */
	new->index = index;
	new->private = NULL;
	memset(&new->dev, 0, sizeof(new->dev));
	new->dev.parent = &lguest_bus.dev;
	new->dev.bus = &lguest_bus.bus;
	sprintf(new->dev.bus_id, "%u", index);

	/* device_register() causes the bus infrastructure to look for a
	 * matching driver. */
	if (device_register(&new->dev) != 0) {
		printk(KERN_EMERG "Cannot register lguest device %u\n", index);
		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
		kfree(new);
	}
}

/*D:110 scan_devices() simply iterates through the device array. The type 0
 * is reserved to mean "no device", and anything else means we have found a
 * device: add it. */
static void scan_devices(void)
{
	unsigned int i;

	for (i = 0; i < LGUEST_MAX_DEVICES; i++)
		if (lguest_devices[i].type)
			add_lguest_device(i);
}

/*D:100 Fairly early in boot, lguest_bus_init() is called to set up the lguest
 * bus. We check that we are a Guest by checking paravirt_ops.name: there are
 * other ways of checking, but this seems most obvious to me.
 *
 * So we can access the array of "struct lguest_device_desc"s easily, we map
 * that memory and store the pointer in the global "lguest_devices". Then we
 * register the bus with the core. Doing two registrations seems clunky to me,
 * but it seems to be the correct sysfs incantation.
 *
 * Finally we call scan_devices() which adds all the devices found in the
 * "struct lguest_device_desc" array. */
static int __init lguest_bus_init(void)
{
	if (strcmp(pv_info.name, "lguest") != 0)
		return 0;

	/* Devices are in a single page above top of "normal" mem */
	lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);

	if (bus_register(&lguest_bus.bus) != 0
	    || device_register(&lguest_bus.dev) != 0)
		panic("lguest bus registration failed");

	scan_devices();
	return 0;
}
/* Do this after core stuff, before devices. */
postcore_initcall(lguest_bus_init);
373
drivers/lguest/lguest_device.c
Normal file
@ -0,0 +1,373 @@
/*P:050 Lguest guests use a very simple method to describe devices. It's a
 * series of device descriptors contained just above the top of normal
 * memory.
 *
 * We use the standard "virtio" device infrastructure, which provides us with a
 * console, a network and a block driver. Each one expects some configuration
 * information and a "virtqueue" mechanism to send and receive data. :*/
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/lguest_launcher.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/interrupt.h>
#include <linux/virtio_ring.h>
#include <linux/err.h>
#include <asm/io.h>
#include <asm/paravirt.h>
#include <asm/lguest_hcall.h>

/* The pointer to our (page) of device descriptions. */
static void *lguest_devices;

/* Unique numbering for lguest devices. */
static unsigned int dev_index;

/* For Guests, device memory can be used as normal memory, so we cast away the
 * __iomem to quieten sparse. */
static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
{
	return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
}

static inline void lguest_unmap(void *addr)
{
	iounmap((__force void __iomem *)addr);
}

/*D:100 Each lguest device is just a virtio device plus a pointer to its entry
 * in the lguest_devices page. */
struct lguest_device {
	struct virtio_device vdev;

	/* The entry in the lguest_devices page for this device. */
	struct lguest_device_desc *desc;
};

/* Since the virtio infrastructure hands us a pointer to the virtio_device all
 * the time, it helps to have a curt macro to get a pointer to the struct
 * lguest_device it's enclosed in. */
#define to_lgdev(vdev) container_of(vdev, struct lguest_device, vdev)

/*D:130
 * Device configurations
 *
 * The configuration information for a device consists of a series of fields.
 * The device will look for these fields during setup.
 *
 * For us these fields come immediately after that device's descriptor in the
 * lguest_devices page.
 *
 * Each field starts with a "type" byte, a "length" byte, then that number of
 * bytes of configuration information. The device descriptor tells us the
 * total configuration length so we know when we've reached the last field. */

/* type + length bytes */
#define FHDR_LEN 2

/* This finds the first field of a given type for a device's configuration. */
static void *lg_find(struct virtio_device *vdev, u8 type, unsigned int *len)
{
	struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
	int i;

	for (i = 0; i < desc->config_len; i += FHDR_LEN + desc->config[i+1]) {
		if (desc->config[i] == type) {
			/* Mark it used, so Host can know we looked at it, and
			 * also so we won't find the same one twice. */
			desc->config[i] |= 0x80;
			/* Remember, the second byte is the length. */
			*len = desc->config[i+1];
			/* We return a pointer to the field header. */
			return desc->config + i;
		}
	}

	/* Not found: return NULL for failure. */
	return NULL;
}

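As a concrete (entirely made-up) illustration of the field format lg_find() walks:

/* One field: type 1, length 8, then 8 payload bytes.  lg_find(vdev, 1, &len)
 * would return &config[0], set len = 8, and OR 0x80 into config[0] so the
 * same field is never handed out twice. */
static u8 config[] = { 1, 8, /* payload: */ 0, 1, 2, 3, 4, 5, 6, 7 };
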
/* Once they've found a field, getting a copy of it is easy. */
|
||||
static void lg_get(struct virtio_device *vdev, void *token,
|
||||
void *buf, unsigned len)
|
||||
{
|
||||
/* Check they didn't ask for more than the length of the field! */
|
||||
BUG_ON(len > ((u8 *)token)[1]);
|
||||
memcpy(buf, token + FHDR_LEN, len);
|
||||
}
|
||||
|
||||
/* Setting the contents is also trivial. */
|
||||
static void lg_set(struct virtio_device *vdev, void *token,
|
||||
const void *buf, unsigned len)
|
||||
{
|
||||
BUG_ON(len > ((u8 *)token)[1]);
|
||||
memcpy(token + FHDR_LEN, buf, len);
|
||||
}
|
||||
|
||||
/* The operations to get and set the status word just access the status field
|
||||
* of the device descriptor. */
|
||||
static u8 lg_get_status(struct virtio_device *vdev)
|
||||
{
|
||||
return to_lgdev(vdev)->desc->status;
|
||||
}
|
||||
|
||||
static void lg_set_status(struct virtio_device *vdev, u8 status)
|
||||
{
|
||||
to_lgdev(vdev)->desc->status = status;
|
||||
}
|
||||
|
||||
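From a driver's point of view the ops compose like this (a sketch; lvq_config is a stand-in name, and lg_find_vq() below does essentially the same dance):

/* Find a virtqueue field, then copy it out through the same ops table:
 * the opaque "token" from find() is what get()/set() consume, so drivers
 * never touch the descriptor page layout directly. */
struct lguest_vqconfig lvq_config;
unsigned int len;
void *token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len);

if (token && len == sizeof(lvq_config))
	vdev->config->get(vdev, token, &lvq_config, sizeof(lvq_config));
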
/*
 * Virtqueues
 *
 * The other piece of infrastructure virtio needs is a "virtqueue": a way of
 * the Guest device registering buffers for the other side to read from or
 * write into (ie. send and receive buffers).  Each device can have multiple
 * virtqueues: for example the console has one queue for sending and one for
 * receiving.
 *
 * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
 * already exists in virtio_ring.c.  We just need to connect it up.
 *
 * We start with the information we need to keep about each virtqueue.
 */

/*D:140 This is the information we remember about each virtqueue. */
struct lguest_vq_info
{
	/* A copy of the information contained in the device config. */
	struct lguest_vqconfig config;

	/* The address where we mapped the virtio ring, so we can unmap it. */
	void *pages;
};

/* When the virtio_ring code wants to prod the Host, it calls us here and we
 * make a hypercall.  We hand the page number of the virtqueue so the Host
 * knows which virtqueue we're talking about. */
static void lg_notify(struct virtqueue *vq)
{
	/* We store our virtqueue information in the "priv" pointer of the
	 * virtqueue structure. */
	struct lguest_vq_info *lvq = vq->priv;

	hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0);
}

/* This routine finds the first virtqueue described in the configuration of
 * this device and sets it up.
 *
 * This is kind of an ugly duckling.  It'd be nicer to have a standard
 * representation of a virtqueue in the configuration space, but it seems that
 * everyone wants to do it differently.  The KVM guys want the Guest to
 * allocate its own pages and tell the Host where they are, but for lguest it's
 * simpler for the Host to simply tell us where the pages are.
 *
 * So we provide devices with a "find virtqueue and set it up" function. */
static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
				    bool (*callback)(struct virtqueue *vq))
{
	struct lguest_vq_info *lvq;
	struct virtqueue *vq;
	unsigned int len;
	void *token;
	int err;

	/* Look for a field of the correct type to mark a virtqueue.  Note that
	 * if this succeeds, then the type will be changed so it won't be found
	 * again, and future lg_find_vq() calls will find the next
	 * virtqueue (if any). */
	token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len);
	if (!token)
		return ERR_PTR(-ENOENT);

	lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
	if (!lvq)
		return ERR_PTR(-ENOMEM);

	/* Note: we could use a configuration space inside here, just like we
	 * do for the device.  This would allow expansion in future, because
	 * our configuration system is designed to be extensible.  But this is
	 * way easier. */
	if (len != sizeof(lvq->config)) {
		dev_err(&vdev->dev, "Unexpected virtio config len %u\n", len);
		err = -EIO;
		goto free_lvq;
	}
	/* Make a copy of the "struct lguest_vqconfig" field.  We need a copy
	 * because the config space might not be aligned correctly. */
	vdev->config->get(vdev, token, &lvq->config, sizeof(lvq->config));

	/* Figure out how many pages the ring will take, and map that memory */
	lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
				DIV_ROUND_UP(vring_size(lvq->config.num),
					     PAGE_SIZE));
	if (!lvq->pages) {
		err = -ENOMEM;
		goto free_lvq;
	}

	/* OK, tell virtio_ring.c to set up a virtqueue now we know its size
	 * and we've got a pointer to its pages. */
	vq = vring_new_virtqueue(lvq->config.num, vdev, lvq->pages,
				 lg_notify, callback);
	if (!vq) {
		err = -ENOMEM;
		goto unmap;
	}

	/* Tell the interrupt for this virtqueue to go to the virtio_ring
	 * interrupt handler. */
	/* FIXME: We used to have a flag for the Host to tell us we could use
	 * the interrupt as a source of randomness: it'd be nice to have that
	 * back. */
	err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
			  vdev->dev.bus_id, vq);
	if (err)
		goto destroy_vring;

	/* Last of all we hook up our "struct lguest_vq_info" to the
	 * virtqueue's priv pointer. */
	vq->priv = lvq;
	return vq;

destroy_vring:
	vring_del_virtqueue(vq);
unmap:
	lguest_unmap(lvq->pages);
free_lvq:
	kfree(lvq);
	return ERR_PTR(err);
}
/*:*/

/* Cleaning up a virtqueue is easy */
static void lg_del_vq(struct virtqueue *vq)
{
	struct lguest_vq_info *lvq = vq->priv;

	/* Tell virtio_ring.c to free the virtqueue. */
	vring_del_virtqueue(vq);
	/* Unmap the pages containing the ring. */
	lguest_unmap(lvq->pages);
	/* Free our own queue information. */
	kfree(lvq);
}

/* The ops structure which hooks everything together. */
static struct virtio_config_ops lguest_config_ops = {
	.find = lg_find,
	.get = lg_get,
	.set = lg_set,
	.get_status = lg_get_status,
	.set_status = lg_set_status,
	.find_vq = lg_find_vq,
	.del_vq = lg_del_vq,
};

/* The root device for the lguest virtio devices.  This makes them appear as
 * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */
static struct device lguest_root = {
	.parent = NULL,
	.bus_id = "lguest",
};

/*D:120 This is the core of the lguest bus: actually adding a new device.
 * It's a separate function because it's neater that way, and because an
 * earlier version of the code supported hotplug and unplug.  They were removed
 * early on because they were never used.
 *
 * As Andrew Tridgell says, "Untested code is buggy code".
 *
 * It's worth reading this carefully: we start with a pointer to the new device
 * descriptor in the "lguest_devices" page. */
static void add_lguest_device(struct lguest_device_desc *d)
{
	struct lguest_device *ldev;

	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
	if (!ldev) {
		printk(KERN_EMERG "Cannot allocate lguest dev %u\n",
		       dev_index++);
		return;
	}

	/* This device's parent is the lguest/ dir. */
	ldev->vdev.dev.parent = &lguest_root;
	/* We have a unique device index thanks to the dev_index counter. */
	ldev->vdev.index = dev_index++;
	/* The device type comes straight from the descriptor.  There's also a
	 * device vendor field in the virtio_device struct, which we leave as
	 * 0. */
	ldev->vdev.id.device = d->type;
	/* We have a simple set of routines for querying the device's
	 * configuration information and setting its status. */
	ldev->vdev.config = &lguest_config_ops;
	/* And we remember the device's descriptor for lguest_config_ops. */
	ldev->desc = d;

	/* register_virtio_device() sets up the generic fields for the struct
	 * virtio_device and calls device_register().  This makes the bus
	 * infrastructure look for a matching driver. */
	if (register_virtio_device(&ldev->vdev) != 0) {
		printk(KERN_ERR "Failed to register lguest device %u\n",
		       ldev->vdev.index);
		kfree(ldev);
	}
}

/*D:110 scan_devices() simply iterates through the device page.  The type 0 is
 * reserved to mean "end of devices". */
static void scan_devices(void)
{
	unsigned int i;
	struct lguest_device_desc *d;

	/* We start at the page beginning, and skip over each entry. */
	for (i = 0; i < PAGE_SIZE; i += sizeof(*d) + d->config_len) {
		d = lguest_devices + i;

		/* Once we hit a zero, stop. */
		if (d->type == 0)
			break;

		add_lguest_device(d);
	}
}

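For clarity, here is a small standalone model of that scan. The struct below is a simplified stand-in for struct lguest_device_desc (the real one has more fields) and the two fake devices are invented; only the walk itself, descriptor plus config_len bytes with type 0 terminating, follows scan_devices() above:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096

/* Simplified stand-in for struct lguest_device_desc: just the two fields the
 * scan needs. */
struct desc {
	uint8_t type;
	uint8_t config_len;
	uint8_t config[];
};

int main(void)
{
	static uint8_t page[PAGE_SIZE];	/* zeroed: type 0 ends the scan */
	unsigned int i;

	/* Invent two devices: type 3 with 2 config bytes, type 1 with none. */
	page[0] = 3; page[1] = 2; page[2] = 0xaa; page[3] = 0xbb;
	page[4] = 1; page[5] = 0;

	for (i = 0; i < PAGE_SIZE; ) {
		struct desc *d = (struct desc *)&page[i];

		if (d->type == 0)
			break;
		printf("device type %u, %u config byte(s) at offset %u\n",
		       d->type, d->config_len, i);
		i += sizeof(*d) + d->config_len;
	}
	return 0;
}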
/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the
 * lguest device infrastructure.  We check that we are a Guest by checking
 * pv_info.name: there are other ways of checking, but this seems most
 * obvious to me.
 *
 * So we can access the "struct lguest_device_desc"s easily, we map that memory
 * and store the pointer in the global "lguest_devices".  Then we register a
 * root device from which all our devices will hang (this seems to be the
 * correct sysfs incantation).
 *
 * Finally we call scan_devices() which adds all the devices found in the
 * lguest_devices page. */
static int __init lguest_devices_init(void)
{
	if (strcmp(pv_info.name, "lguest") != 0)
		return 0;

	if (device_register(&lguest_root) != 0)
		panic("Could not register lguest root");

	/* Devices are in a single page above top of "normal" mem */
	lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);

	scan_devices();
	return 0;
}
/* We do this after core stuff, but before the drivers. */
postcore_initcall(lguest_devices_init);

/*D:150 At this point in the journey we used to wade through the lguest
 * devices themselves: net, block and console.  Since they're all now virtio
 * devices rather than lguest-specific, I've decided to ignore them.  Mostly,
 * they're kind of boring.  But this does mean you'll never experience the
 * thrill of reading the forbidden love scene buried deep in the block driver.
 *
 * "make Launcher" beckons, where we answer questions like "Where do Guests
 * come from?", and "What do you do when someone asks for optimization?". */
@ -1,73 +1,17 @@
/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
 * controls and communicates with the Guest.  For example, the first write will
 * tell us the memory size, pagetable, entry point and kernel address offset.
 * A read will run the Guest until a signal is pending (-EINTR), or the Guest
 * does a DMA out to the Launcher.  Writes are also used to get a DMA buffer
 * registered by the Guest and to send the Guest an interrupt. :*/
 * tell us the Guest's memory layout, pagetable, entry point and kernel address
 * offset.  A read will run the Guest until something happens, such as a signal
 * or the Guest doing a NOTIFY out to the Launcher. :*/
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include "lg.h"

/*L:030 setup_regs() doesn't really belong in this file, but it gives us an
 * early glimpse deeper into the Host so it's worth having here.
 *
 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
 * allocate the structure, so they will be 0. */
static void setup_regs(struct lguest_regs *regs, unsigned long start)
{
	/* There are four "segment" registers which the Guest needs to boot:
	 * The "code segment" register (cs) refers to the kernel code segment
	 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
	 * refer to the kernel data segment __KERNEL_DS.
	 *
	 * The privilege level is packed into the lower bits.  The Guest runs
	 * at privilege level 1 (GUEST_PL). */
	regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
	regs->cs = __KERNEL_CS|GUEST_PL;

	/* The "eflags" register contains miscellaneous flags.  Bit 1 (0x002)
	 * is supposed to always be "1".  Bit 9 (0x200) controls whether
	 * interrupts are enabled.  We always leave interrupts enabled while
	 * running the Guest. */
	regs->eflags = 0x202;

	/* The "Extended Instruction Pointer" register says where the Guest is
	 * running. */
	regs->eip = start;

	/* %esi points to our boot information, at physical address 0, so don't
	 * touch it. */
}

/*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a
 * DMA buffer.  This is done by writing LHREQ_GETDMA and the key to
 * /dev/lguest. */
static long user_get_dma(struct lguest *lg, const u32 __user *input)
{
	unsigned long key, udma, irq;

	/* Fetch the key they wrote to us. */
	if (get_user(key, input) != 0)
		return -EFAULT;
	/* Look for a free Guest DMA buffer bound to that key. */
	udma = get_dma_buffer(lg, key, &irq);
	if (!udma)
		return -ENOENT;

	/* We need to tell the Launcher what interrupt the Guest expects after
	 * the buffer is filled.  We stash it in udma->used_len. */
	lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);

	/* The (guest-physical) address of the DMA buffer is returned from
	 * the write(). */
	return udma;
}

/*L:315 To force the Guest to stop running and return to the Launcher, the
 * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest.  The
 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */
static int break_guest_out(struct lguest *lg, const u32 __user *input)
static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
{
	unsigned long on;

@ -90,9 +34,9 @@ static int break_guest_out(struct lguest *lg, const u32 __user *input)

/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
 * number to /dev/lguest. */
static int user_send_irq(struct lguest *lg, const u32 __user *input)
static int user_send_irq(struct lguest *lg, const unsigned long __user *input)
{
	u32 irq;
	unsigned long irq;

	if (get_user(irq, input) != 0)
		return -EFAULT;
@ -133,17 +77,19 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
		return len;
	}

	/* If we returned from read() last time because the Guest sent DMA,
	/* If we returned from read() last time because the Guest notified,
	 * clear the flag. */
	if (lg->dma_is_pending)
		lg->dma_is_pending = 0;
	if (lg->pending_notify)
		lg->pending_notify = 0;

	/* Run the Guest until something interesting happens. */
	return run_guest(lg, (unsigned long __user *)user);
}

/*L:020 The initialization write supplies 4 32-bit values (in addition to the
 * 32-bit LHREQ_INITIALIZE value).  These are:
/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
 * values (in addition to the LHREQ_INITIALIZE value).  These are:
 *
 * base: The start of the Guest-physical memory inside the Launcher memory.
 *
 * pfnlimit: The highest (Guest-physical) page number the Guest should be
 * allowed to access.  The Launcher has to live in Guest memory, so it sets
@ -153,23 +99,17 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
 * pagetables (which are set up by the Launcher).
 *
 * start: The first instruction to execute ("eip" in x86-speak).
 *
 * page_offset: The PAGE_OFFSET constant in the Guest kernel.  We should
 * probably wean the code off this, but it's a very useful constant!  Any
 * address above this is within the Guest kernel, and any kernel address can
 * quickly be converted from physical to virtual by adding PAGE_OFFSET.  It's
 * 0xC0000000 (3G) by default, but it's configurable at kernel build time.
 */
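As a rough illustration of this layout, a launcher's first write might look like the sketch below. The LHREQ_INITIALIZE numeric value and all the addresses are assumptions made for illustration only; a real launcher derives base, pfnlimit, pgdir and start from the guest image it has mapped:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* 5 pointer-sized values: the request code, then the 4 fields above.
	 * The request value and all addresses here are assumed. */
	unsigned long args[5];
	int fd = open("/dev/lguest", O_RDWR);

	if (fd < 0)
		return 1;
	args[0] = 0;		/* LHREQ_INITIALIZE (assumed to be 0) */
	args[1] = 0x8000000;	/* base: guest-physical 0 in launcher memory */
	args[2] = 0x10000;	/* pfnlimit: highest allowed page number */
	args[3] = 0x4000;	/* pgdir: guest-physical top-level pagetable */
	args[4] = 0x100000;	/* start: first eip to execute */
	return write(fd, args, sizeof(args)) < 0 ? 1 : 0;
}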
static int initialize(struct file *file, const u32 __user *input)
static int initialize(struct file *file, const unsigned long __user *input)
{
	/* "struct lguest" contains everything we (the Host) know about a
	 * Guest. */
	struct lguest *lg;
	int err, i;
	u32 args[4];
	int err;
	unsigned long args[4];

	/* We grab the Big Lguest lock, which protects the global array
	 * "lguests" and multiple simultaneous initializations. */
	/* We grab the Big Lguest lock, which protects against multiple
	 * simultaneous initializations. */
	mutex_lock(&lguest_lock);
	/* You can't initialize twice!  Close the device and start again... */
	if (file->private_data) {
@ -182,20 +122,15 @@ static int initialize(struct file *file, const u32 __user *input)
		goto unlock;
	}

	/* Find an unused guest. */
	i = find_free_guest();
	if (i < 0) {
		err = -ENOSPC;
	lg = kzalloc(sizeof(*lg), GFP_KERNEL);
	if (!lg) {
		err = -ENOMEM;
		goto unlock;
	}
	/* OK, we have an index into the "lguest" array: "lg" is a convenient
	 * pointer. */
	lg = &lguests[i];

	/* Populate the easy fields of our "struct lguest" */
	lg->guestid = i;
	lg->pfn_limit = args[0];
	lg->page_offset = args[3];
	lg->mem_base = (void __user *)(long)args[0];
	lg->pfn_limit = args[1];

	/* We need a complete page for the Guest registers: they are accessible
	 * to the Guest and we can only grant it access to whole pages. */
@ -210,17 +145,13 @@ static int initialize(struct file *file, const u32 __user *input)
	/* Initialize the Guest's shadow page tables, using the toplevel
	 * address the Launcher gave us.  This allocates memory, so can
	 * fail. */
	err = init_guest_pagetable(lg, args[1]);
	err = init_guest_pagetable(lg, args[2]);
	if (err)
		goto free_regs;

	/* Now we initialize the Guest's registers, handing it the start
	 * address. */
	setup_regs(lg->regs, args[2]);

	/* There are a couple of GDT entries the Guest expects when first
	 * booting. */
	setup_guest_gdt(lg);
	lguest_arch_setup_regs(lg, args[3]);

	/* The timer for lguest's clock needs initialization. */
	init_clockdev(lg);
@ -260,18 +191,19 @@ unlock:
/*L:010 The first operation the Launcher does must be a write.  All writes
 * start with a 32 bit number: for the first write this must be
 * LHREQ_INITIALIZE to set up the Guest.  After that the Launcher can use
 * writes of other values to get DMA buffers and send interrupts. */
static ssize_t write(struct file *file, const char __user *input,
 * writes of other values to send interrupts. */
static ssize_t write(struct file *file, const char __user *in,
		     size_t size, loff_t *off)
{
	/* Once the guest is initialized, we hold the "struct lguest" in the
	 * file private data. */
	struct lguest *lg = file->private_data;
	u32 req;
	const unsigned long __user *input = (const unsigned long __user *)in;
	unsigned long req;

	if (get_user(req, input) != 0)
		return -EFAULT;
	input += sizeof(req);
	input++;

	/* If you haven't initialized, you must do that first. */
	if (req != LHREQ_INITIALIZE && !lg)
@ -287,13 +219,11 @@ static ssize_t write(struct file *file, const char __user *input,

	switch (req) {
	case LHREQ_INITIALIZE:
		return initialize(file, (const u32 __user *)input);
	case LHREQ_GETDMA:
		return user_get_dma(lg, (const u32 __user *)input);
		return initialize(file, input);
	case LHREQ_IRQ:
		return user_send_irq(lg, (const u32 __user *)input);
		return user_send_irq(lg, input);
	case LHREQ_BREAK:
		return break_guest_out(lg, (const u32 __user *)input);
		return break_guest_out(lg, input);
	default:
		return -EINVAL;
	}
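To illustrate the post-initialization writes, here is a hedged sketch of how a launcher might poke a device interrupt into the Guest. The LHREQ_IRQ value is assumed to mirror the request enum in linux/lguest_launcher.h of this era; since user_send_irq() returns 0 on success, we only test for an error return:

#include <unistd.h>

/* lguest_fd is the already-open /dev/lguest file descriptor. */
static int send_guest_irq(int lguest_fd, unsigned long irq)
{
	unsigned long cmd[2] = { 2 /* LHREQ_IRQ: assumed value */, irq };

	return write(lguest_fd, cmd, sizeof(cmd)) < 0 ? -1 : 0;
}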
@ -319,8 +249,6 @@ static int close(struct inode *inode, struct file *file)
	mutex_lock(&lguest_lock);
	/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
	hrtimer_cancel(&lg->hrt);
	/* Free any DMA buffers the Guest had bound. */
	release_all_dma(lg);
	/* Free up the shadow page tables for the Guest. */
	free_guest_pagetable(lg);
	/* Now all the memory cleanups are done, it's safe to release the
@ -13,6 +13,7 @@
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include "lg.h"

/*M:008 We hold reference to pages, which prevents them from being swapped.
@ -44,44 +45,32 @@
 * (vii) Setting up the page tables initially.
 :*/

/* Pages are 4k long, and each page table entry is 4 bytes long, giving us 1024
 * (or 2^10) entries per page. */
#define PTES_PER_PAGE_SHIFT 10
#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)

/* 1024 entries in a page table page maps 1024 pages: 4MB.  The Switcher is
 * conveniently placed at the top 4MB, so it uses a separate, complete PTE
 * page. */
#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)

/* We actually need a separate PTE page for each CPU.  Remember that after the
 * Switcher code itself comes two pages for each CPU, and we don't want this
 * CPU's guest to see the pages of any other CPU. */
static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)

/*H:320 With our shadow and Guest types established, we need to deal with
 * them: the page table code is curly enough to need helper functions to keep
 * it clear and clean.
 *
 * The first helper takes a virtual address, and says which entry in the top
 * level page table deals with that address.  Since each top level entry deals
 * with 4M, this effectively divides by 4M. */
static unsigned vaddr_to_pgd_index(unsigned long vaddr)
{
	return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
}

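A quick worked example of that split, for an arbitrary sample address:

#include <stdio.h>

#define PAGE_SHIFT		12
#define PTES_PER_PAGE_SHIFT	10

int main(void)
{
	unsigned long vaddr = 0xC0123456UL;	/* arbitrary sample address */

	/* The top ten bits pick the PGD entry (one per 4MB), the next ten
	 * pick the PTE within that page, the final twelve are the offset. */
	printf("pgd index %lu, pte index %lu, offset 0x%lx\n",
	       vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT),
	       (vaddr >> PAGE_SHIFT) & ((1UL << PTES_PER_PAGE_SHIFT) - 1),
	       vaddr & ((1UL << PAGE_SHIFT) - 1));
	return 0;
}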
/* There are two functions which return pointers to the shadow (aka "real")
 * There are two functions which return pointers to the shadow (aka "real")
 * page tables.
 *
 * spgd_addr() takes the virtual address and returns a pointer to the top-level
 * page directory entry for that address.  Since we keep track of several page
 * tables, the "i" argument tells us which one we're interested in (it's
 * usually the current one). */
static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
{
	unsigned int index = vaddr_to_pgd_index(vaddr);
	unsigned int index = pgd_index(vaddr);

	/* We kill any Guest trying to touch the Switcher addresses. */
	if (index >= SWITCHER_PGD_INDEX) {
@ -95,28 +84,28 @@ static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
/* This routine then takes the PGD entry given above, which contains the
 * address of the PTE page.  It then returns a pointer to the PTE entry for the
 * given address. */
static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
{
	spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
	/* You should never call this if the PGD entry wasn't valid */
	BUG_ON(!(spgd.flags & _PAGE_PRESENT));
	return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
	return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE];
}

/* These two functions are just like the above two, except they access the
 * Guest page tables.  Hence they return a Guest address. */
static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
{
	unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
	return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t);
	unsigned int index = vaddr >> (PGDIR_SHIFT);
	return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t);
}

static unsigned long gpte_addr(struct lguest *lg,
			       gpgd_t gpgd, unsigned long vaddr)
			       pgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage = gpgd.pfn << PAGE_SHIFT;
	BUG_ON(!(gpgd.flags & _PAGE_PRESENT));
	return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
	return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
}

/*H:350 This routine takes a page number given by the Guest and converts it to
@ -149,53 +138,55 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
 * entry can be a little tricky.  The flags are (almost) the same, but the
 * Guest PTE contains a virtual page number: the CPU needs the real page
 * number. */
static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write)
{
	spte_t spte;
	unsigned long pfn;
	unsigned long pfn, base, flags;

	/* The Guest sets the global flag, because it thinks that it is using
	 * PGE.  We only told it to use PGE so it would tell us whether it was
	 * flushing a kernel mapping or a userspace mapping.  We don't actually
	 * use the global bit, so throw it away. */
	spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);

	/* The Guest's pages are offset inside the Launcher. */
	base = (unsigned long)lg->mem_base / PAGE_SIZE;

	/* We need a temporary "unsigned long" variable to hold the answer from
	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
	 * fit in spte.pfn.  get_pfn() finds the real physical number of the
	 * page, given the virtual number. */
	pfn = get_pfn(gpte.pfn, write);
	pfn = get_pfn(base + pte_pfn(gpte), write);
	if (pfn == -1UL) {
		kill_guest(lg, "failed to get page %u", gpte.pfn);
		kill_guest(lg, "failed to get page %lu", pte_pfn(gpte));
		/* When we destroy the Guest, we'll go through the shadow page
		 * tables and release_pte() them.  Make sure we don't think
		 * this one is valid! */
		spte.flags = 0;
		flags = 0;
	}
	/* Now we assign the page number, and our shadow PTE is complete. */
	spte.pfn = pfn;
	return spte;
	/* Now we assemble our shadow PTE from the page number and flags. */
	return pfn_pte(pfn, __pgprot(flags));
}

/*H:460 And to complete the chain, release_pte() looks like this: */
static void release_pte(spte_t pte)
static void release_pte(pte_t pte)
{
	/* Remember that get_user_pages() took a reference to the page, in
	 * get_pfn()?  We have to put it back now. */
	if (pte.flags & _PAGE_PRESENT)
		put_page(pfn_to_page(pte.pfn));
	if (pte_flags(pte) & _PAGE_PRESENT)
		put_page(pfn_to_page(pte_pfn(pte)));
}
/*:*/

static void check_gpte(struct lguest *lg, gpte_t gpte)
static void check_gpte(struct lguest *lg, pte_t gpte)
{
	if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit)
	if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE))
	    || pte_pfn(gpte) >= lg->pfn_limit)
		kill_guest(lg, "bad page table entry");
}

static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
static void check_gpgd(struct lguest *lg, pgd_t gpgd)
{
	if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit)
	if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit)
		kill_guest(lg, "bad page directory entry");
}

@ -211,21 +202,21 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
 * true. */
int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
{
	gpgd_t gpgd;
	spgd_t *spgd;
	pgd_t gpgd;
	pgd_t *spgd;
	unsigned long gpte_ptr;
	gpte_t gpte;
	spte_t *spte;
	pte_t gpte;
	pte_t *spte;

	/* First step: get the top-level Guest page table entry. */
	gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
	gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(gpgd.flags & _PAGE_PRESENT))
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
		return 0;

	/* Now look at the matching shadow entry. */
	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
	if (!(spgd->flags & _PAGE_PRESENT)) {
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
		/* This is not really the Guest's fault, but killing it is
@ -238,34 +229,35 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
		check_gpgd(lg, gpgd);
		/* And we copy the flags to the shadow PGD entry.  The page
		 * number in the shadow PGD is the page we just allocated. */
		spgd->raw.val = (__pa(ptepage) | gpgd.flags);
		*spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
	}

	/* OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later. */
	gpte_ptr = gpte_addr(lg, gpgd, vaddr);
	gpte = mkgpte(lgread_u32(lg, gpte_ptr));
	gpte = lgread(lg, gpte_ptr, pte_t);

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(gpte.flags & _PAGE_PRESENT))
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		return 0;

	/* Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write). */
	if ((errcode & 2) && !(gpte.flags & _PAGE_RW))
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
		return 0;

	/* User access to a kernel page? (bit 3 == user access) */
	if ((errcode & 4) && !(gpte.flags & _PAGE_USER))
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
		return 0;

	/* Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary). */
	check_gpte(lg, gpte);
	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
	gpte.flags |= _PAGE_ACCESSED;
	gpte = pte_mkyoung(gpte);

	if (errcode & 2)
		gpte.flags |= _PAGE_DIRTY;
		gpte = pte_mkdirty(gpte);

	/* Get the pointer to the shadow PTE entry we're going to set. */
	spte = spte_addr(lg, *spgd, vaddr);
@ -275,21 +267,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)

	/* If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()). */
	if (gpte.flags & _PAGE_DIRTY)
	if (pte_dirty(gpte))
		*spte = gpte_to_spte(lg, gpte, 1);
	else {
	else
		/* If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we come back here when a write does actually occur, so we can
		 * update the Guest's _PAGE_DIRTY flag. */
		gpte_t ro_gpte = gpte;
		ro_gpte.flags &= ~_PAGE_RW;
		*spte = gpte_to_spte(lg, ro_gpte, 0);
	}
		*spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);

	/* Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
	lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
	lgwrite(lg, gpte_ptr, pte_t, gpte);

	/* We succeeded in mapping the page! */
	return 1;
@ -305,17 +294,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
 * mapped by the shadow page tables, and is it writable? */
static int page_writable(struct lguest *lg, unsigned long vaddr)
{
	spgd_t *spgd;
	pgd_t *spgd;
	unsigned long flags;

	/* Look at the top level entry: is it present? */
	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
	if (!(spgd->flags & _PAGE_PRESENT))
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
		return 0;

	/* Check the flags on the pte entry itself: it must be present and
	 * writable. */
	flags = spte_addr(lg, *spgd, vaddr)->flags;
	flags = pte_flags(*(spte_addr(lg, *spgd, vaddr)));

	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}

@ -329,22 +319,22 @@ void pin_page(struct lguest *lg, unsigned long vaddr)
}

/*H:450 If we chase down the release_pgd() code, it looks like this: */
static void release_pgd(struct lguest *lg, spgd_t *spgd)
static void release_pgd(struct lguest *lg, pgd_t *spgd)
{
	/* If the entry's not present, there's nothing to release. */
	if (spgd->flags & _PAGE_PRESENT) {
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		unsigned int i;
		/* Converting the pfn to find the actual PTE page is easy: turn
		 * the page number into a physical address, then convert to a
		 * virtual address (easy for kernel pages like this one). */
		spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
		/* For each entry in the page, we might need to release it. */
		for (i = 0; i < PTES_PER_PAGE; i++)
		for (i = 0; i < PTRS_PER_PTE; i++)
			release_pte(ptepage[i]);
		/* Now we can free the page of PTEs */
		free_page((long)ptepage);
		/* And zero out the PGD entry so we never release it twice. */
		spgd->raw.val = 0;
		*spgd = __pgd(0);
	}
}

@ -356,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx)
{
	unsigned int i;
	/* Release every pgd entry up to the kernel's address. */
	for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
	for (i = 0; i < pgd_index(lg->kernel_address); i++)
		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
}

@ -369,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg)
}
/*:*/

/* We walk down the guest page tables to get a guest-physical address */
unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
{
	pgd_t gpgd;
	pte_t gpte;

	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
		kill_guest(lg, "Bad address %#lx", vaddr);

	gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		kill_guest(lg, "Bad address %#lx", vaddr);

	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}

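Here is a toy userspace model of that same two-level walk. Table contents and addresses are invented, and unlike the real code it consults a single fake PTE page rather than following the address bits in the PGD entry:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PTRS		1024
#define PRESENT		0x1	/* stand-in for _PAGE_PRESENT */

static uint32_t pgd[PTRS];	/* fake top-level table */
static uint32_t pte_page[PTRS];	/* fake PTE page */

int main(void)
{
	unsigned long vaddr = 0x00403123UL;	/* invented guest address */
	uint32_t gpgd, gpte;

	pgd[vaddr >> 22] = PRESENT;		/* mark the toplevel present */
	pte_page[(vaddr >> PAGE_SHIFT) & 1023] = 0x5000 | PRESENT; /* pfn 5 */

	gpgd = pgd[vaddr >> 22];
	if (!(gpgd & PRESENT))
		return 1;	/* the real code would kill_guest() here */
	gpte = pte_page[(vaddr >> PAGE_SHIFT) & 1023];
	if (!(gpte & PRESENT))
		return 1;
	printf("guest-physical 0x%lx\n",
	       (unsigned long)((gpte & ~0xfffu) | (vaddr & 0xfff)));
	return 0;
}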
/* We keep several page tables.  This is a simple routine to find the page
 * table (if any) corresponding to this top-level address the Guest has given
 * us. */
@ -376,7 +385,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
	unsigned int i;
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].cr3 == pgtable)
		if (lg->pgdirs[i].gpgdir == pgtable)
			break;
	return i;
}
@ -385,7 +394,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
 * allocate a new one (and so the kernel parts are not there), we set
 * blank_pgdir. */
static unsigned int new_pgdir(struct lguest *lg,
			      unsigned long cr3,
			      unsigned long gpgdir,
			      int *blank_pgdir)
{
	unsigned int next;
@ -395,7 +404,7 @@ static unsigned int new_pgdir(struct lguest *lg,
	next = random32() % ARRAY_SIZE(lg->pgdirs);
	/* If it's never been allocated at all before, try now. */
	if (!lg->pgdirs[next].pgdir) {
		lg->pgdirs[next].pgdir = (spte_t *)get_zeroed_page(GFP_KERNEL);
		lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
		/* If the allocation fails, just keep using the one we have */
		if (!lg->pgdirs[next].pgdir)
			next = lg->pgdidx;
@ -405,7 +414,7 @@ static unsigned int new_pgdir(struct lguest *lg,
			*blank_pgdir = 1;
	}
	/* Record which Guest toplevel this shadows. */
	lg->pgdirs[next].cr3 = cr3;
	lg->pgdirs[next].gpgdir = gpgdir;
	/* Release all the non-kernel mappings. */
	flush_user_mappings(lg, next);

@ -472,26 +481,27 @@ void guest_pagetable_clear_all(struct lguest *lg)
 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
 */
static void do_set_pte(struct lguest *lg, int idx,
		       unsigned long vaddr, gpte_t gpte)
		       unsigned long vaddr, pte_t gpte)
{
	/* Look up the matching shadow page directory entry. */
	spgd_t *spgd = spgd_addr(lg, idx, vaddr);
	pgd_t *spgd = spgd_addr(lg, idx, vaddr);

	/* If the top level isn't present, there's no entry to update. */
	if (spgd->flags & _PAGE_PRESENT) {
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		/* Otherwise, we start by releasing the existing entry. */
		spte_t *spte = spte_addr(lg, *spgd, vaddr);
		pte_t *spte = spte_addr(lg, *spgd, vaddr);
		release_pte(*spte);

		/* If they're setting this entry as dirty or accessed, we might
		 * as well put that entry they've given us in now.  This shaves
		 * 10% off a copy-on-write micro-benchmark. */
		if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
		if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
			check_gpte(lg, gpte);
			*spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
			*spte = gpte_to_spte(lg, gpte,
					     pte_flags(gpte) & _PAGE_DIRTY);
		} else
			/* Otherwise we can demand_page() it in later. */
			spte->raw.val = 0;
			*spte = __pte(0);
	}
}

@ -506,18 +516,18 @@ static void do_set_pte(struct lguest *lg, int idx,
 * The benefit is that when we have to track a new page table, we can keep
 * all the kernel mappings.  This speeds up context switch immensely. */
void guest_set_pte(struct lguest *lg,
		   unsigned long cr3, unsigned long vaddr, gpte_t gpte)
		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
	/* Kernel mappings must be changed on all top levels.  Slow, but
	 * doesn't happen often. */
	if (vaddr >= lg->page_offset) {
	if (vaddr >= lg->kernel_address) {
		unsigned int i;
		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
			if (lg->pgdirs[i].pgdir)
				do_set_pte(lg, i, vaddr, gpte);
	} else {
		/* Is this page table one we have a shadow for? */
		int pgdir = find_pgdir(lg, cr3);
		int pgdir = find_pgdir(lg, gpgdir);
		if (pgdir != ARRAY_SIZE(lg->pgdirs))
			/* If so, do the update. */
			do_set_pte(lg, pgdir, vaddr, gpte);
@ -538,7 +548,7 @@ void guest_set_pte(struct lguest *lg,
 *
 * So with that in mind here's our code to update a (top-level) PGD entry:
 */
void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
	int pgdir;

@ -548,7 +558,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
		return;

	/* If they're talking about a page table we have a shadow for... */
	pgdir = find_pgdir(lg, cr3);
	pgdir = find_pgdir(lg, gpgdir);
	if (pgdir < ARRAY_SIZE(lg->pgdirs))
		/* ... throw it away. */
		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
@ -560,21 +570,34 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
 * its first page table is.  We set some things up here: */
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
{
	/* In flush_user_mappings() we loop from 0 to
	 * "vaddr_to_pgd_index(lg->page_offset)".  This assumes it won't hit
	 * the Switcher mappings, so check that now. */
	if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
		return -EINVAL;
	/* We start on the first shadow page table, and give it a blank PGD
	 * page. */
	lg->pgdidx = 0;
	lg->pgdirs[lg->pgdidx].cr3 = pgtable;
	lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL);
	lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
	lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL);
	if (!lg->pgdirs[lg->pgdidx].pgdir)
		return -ENOMEM;
	return 0;
}

/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lguest *lg)
{
	/* We get the kernel address: above this is all kernel memory. */
	if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
	    /* We tell the Guest that it can't use the top 4MB of virtual
	     * addresses used by the Switcher. */
	    || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
	    || put_user(lg->pgdirs[lg->pgdidx].gpgdir, &lg->lguest_data->pgdir))
		kill_guest(lg, "bad guest page %p", lg->lguest_data);

	/* In flush_user_mappings() we loop from 0 to
	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
	 * Switcher mappings, so check that now. */
	if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
		kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
}

/* When a Guest dies, our cleanup is fairly simple. */
void free_guest_pagetable(struct lguest *lg)
{
@ -594,14 +617,14 @@ void free_guest_pagetable(struct lguest *lg)
 * for each CPU already set up, we just need to hook them in. */
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
{
	spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
	spgd_t switcher_pgd;
	spte_t regs_pte;
	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
	pgd_t switcher_pgd;
	pte_t regs_pte;

	/* Make the last PGD entry for this Guest point to the Switcher's PTE
	 * page for this CPU (with appropriate flags). */
	switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
	switcher_pgd.flags = _PAGE_KERNEL;
	switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);

	lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

	/* We also change the Switcher PTE page.  When we're running the Guest,
@ -611,10 +634,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
	 * CPU's "struct lguest_pages": if we make sure the Guest's register
	 * page is already mapped there, we don't have to copy them out
	 * again. */
	regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
	regs_pte.flags = _PAGE_KERNEL;
	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
		= regs_pte;
	regs_pte = pfn_pte(__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL));
	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
}
/*:*/

@ -635,24 +656,25 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
					      unsigned int pages)
{
	unsigned int i;
	spte_t *pte = switcher_pte_page(cpu);
	pte_t *pte = switcher_pte_page(cpu);

	/* The first entries are easy: they map the Switcher code. */
	for (i = 0; i < pages; i++) {
		pte[i].pfn = page_to_pfn(switcher_page[i]);
		pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
		pte[i] = mk_pte(switcher_page[i],
				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
	}

	/* The only other thing we map is this CPU's pair of pages. */
	i = pages + cpu*2;

	/* First page (Guest registers) is writable from the Guest */
	pte[i].pfn = page_to_pfn(switcher_page[i]);
	pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW;
	pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));

	/* The second page contains the "struct lguest_ro_state", and is
	 * read-only. */
	pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
	pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
	pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
}

/*H:510 At boot or module load time, init_pagetables() allocates and populates
@ -662,7 +684,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages)
	unsigned int i;

	for_each_possible_cpu(i) {
		switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL);
		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!switcher_pte_page(i)) {
			free_switcher_pte_pages();
			return -ENOMEM;
@ -73,14 +73,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
		/* Segment descriptors contain a privilege level: the Guest is
		 * sometimes careless and leaves this as 0, even though it's
		 * running at privilege level 1.  If so, we fix it here. */
		if ((lg->gdt[i].b & 0x00006000) == 0)
			lg->gdt[i].b |= (GUEST_PL << 13);
		if ((lg->arch.gdt[i].b & 0x00006000) == 0)
			lg->arch.gdt[i].b |= (GUEST_PL << 13);

		/* Each descriptor has an "accessed" bit.  If we don't set it
		 * now, the CPU will try to set it when the Guest first loads
		 * that entry into a segment register.  But the GDT isn't
		 * writable by the Guest, so bad things can happen. */
		lg->gdt[i].b |= 0x00000100;
		lg->arch.gdt[i].b |= 0x00000100;
	}
}

@ -106,12 +106,12 @@ void setup_default_gdt_entries(struct lguest_ro_state *state)
void setup_guest_gdt(struct lguest *lg)
{
	/* Start with full 0-4G segments... */
	lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
	lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
	lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
	lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
	/* ...except the Guest is allowed to use them, so set the privilege
	 * level appropriately in the flags. */
	lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
	lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
	lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
	lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
}

/* Like the IDT, we never simply use the GDT the Guest gives us.  We set up the
@ -126,7 +126,7 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
	unsigned int i;

	for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
		gdt[i] = lg->gdt[i];
		gdt[i] = lg->arch.gdt[i];
}

/* This is the full version */
@ -138,7 +138,7 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
	 * replaced.  See ignored_gdt() above. */
	for (i = 0; i < GDT_ENTRIES; i++)
		if (!ignored_gdt(i))
			gdt[i] = lg->gdt[i];
			gdt[i] = lg->arch.gdt[i];
}

/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */
@ -146,12 +146,12 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
{
	/* We assume the Guest has the same number of GDT entries as the
	 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
	if (num > ARRAY_SIZE(lg->gdt))
	if (num > ARRAY_SIZE(lg->arch.gdt))
		kill_guest(lg, "too many gdt entries %i", num);

	/* We read the whole thing in, then fix it up. */
	lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
	fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt));
	__lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0]));
	fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt));
	/* Mark that the GDT changed so the core knows it has to copy it again,
	 * even if the Guest is run on the same CPU. */
	lg->changed |= CHANGED_GDT;
@ -159,9 +159,9 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)

void guest_load_tls(struct lguest *lg, unsigned long gtls)
{
	struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN];
	struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];

	lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
	__lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
	fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
	lg->changed |= CHANGED_GDT_TLS;
}
577	drivers/lguest/x86/core.c	Normal file
@ -0,0 +1,577 @@
|
||||
/*
|
||||
* Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
|
||||
* Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
||||
* NON INFRINGEMENT. See the GNU General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/start_kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/console.h>
|
||||
#include <linux/screen_info.h>
|
||||
#include <linux/irq.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/clocksource.h>
|
||||
#include <linux/clockchips.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/lguest.h>
|
||||
#include <linux/lguest_launcher.h>
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/param.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/setup.h>
|
||||
#include <asm/lguest.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/i387.h>
|
||||
#include "../lg.h"
|
||||
|
||||
static int cpu_had_pge;
|
||||
|
||||
static struct {
|
||||
unsigned long offset;
|
||||
unsigned short segment;
|
||||
} lguest_entry;
|
||||
|
||||
/* Offset from where switcher.S was compiled to where we've copied it */
|
||||
static unsigned long switcher_offset(void)
|
||||
{
|
||||
return SWITCHER_ADDR - (unsigned long)start_switcher_text;
|
||||
}
|
||||
|
||||
/* This cpu's struct lguest_pages. */
|
||||
static struct lguest_pages *lguest_pages(unsigned int cpu)
|
||||
{
|
||||
return &(((struct lguest_pages *)
|
||||
(SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct lguest *, last_guest);
|
||||
|
||||
/*S:010
|
||||
* We are getting close to the Switcher.
|
||||
*
|
||||
* Remember that each CPU has two pages which are visible to the Guest when it
|
||||
* runs on that CPU. This has to contain the state for that Guest: we copy the
|
||||
* state in just before we run the Guest.
|
||||
*
|
||||
* Each Guest has "changed" flags which indicate what has changed in the Guest
|
||||
* since it last ran. We saw this set in interrupts_and_traps.c and
|
||||
* segments.c.
|
||||
*/
|
||||
static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages)
|
||||
{
|
||||
/* Copying all this data can be quite expensive. We usually run the
|
||||
* same Guest we ran last time (and that Guest hasn't run anywhere else
|
||||
* meanwhile). If that's not the case, we pretend everything in the
|
||||
* Guest has changed. */
|
||||
if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) {
|
||||
__get_cpu_var(last_guest) = lg;
|
||||
lg->last_pages = pages;
|
||||
lg->changed = CHANGED_ALL;
|
||||
}
|
||||
|
||||
/* These copies are pretty cheap, so we do them unconditionally: */
|
||||
/* Save the current Host top-level page directory. */
|
||||
pages->state.host_cr3 = __pa(current->mm->pgd);
|
||||
/* Set up the Guest's page tables to see this CPU's pages (and no
|
||||
* other CPU's pages). */
|
||||
map_switcher_in_guest(lg, pages);
|
||||
/* Set up the two "TSS" members which tell the CPU what stack to use
|
||||
* for traps which do directly into the Guest (ie. traps at privilege
|
||||
* level 1). */
|
||||
pages->state.guest_tss.esp1 = lg->esp1;
|
||||
pages->state.guest_tss.ss1 = lg->ss1;
|
||||
|
||||
/* Copy direct-to-Guest trap entries. */
|
||||
if (lg->changed & CHANGED_IDT)
|
||||
copy_traps(lg, pages->state.guest_idt, default_idt_entries);
|
||||
|
||||
/* Copy all GDT entries which the Guest can change. */
|
||||
if (lg->changed & CHANGED_GDT)
|
||||
copy_gdt(lg, pages->state.guest_gdt);
|
||||
/* If only the TLS entries have changed, copy them. */
|
||||
else if (lg->changed & CHANGED_GDT_TLS)
|
||||
copy_gdt_tls(lg, pages->state.guest_gdt);
|
||||
|
||||
/* Mark the Guest as unchanged for next time. */
|
||||
lg->changed = 0;
|
||||
}
|
||||
|
||||
/* Finally: the code to actually call into the Switcher to run the Guest. */
|
||||
static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
|
||||
{
|
||||
/* This is a dummy value we need for GCC's sake. */
|
||||
unsigned int clobber;
|
||||
|
||||
/* Copy the guest-specific information into this CPU's "struct
|
||||
* lguest_pages". */
|
||||
copy_in_guest_info(lg, pages);
|
||||
|
||||
/* Set the trap number to 256 (impossible value). If we fault while
|
||||
* switching to the Guest (bad segment registers or bug), this will
|
||||
* cause us to abort the Guest. */
|
||||
lg->regs->trapnum = 256;
|
||||
|
||||
/* Now: we push the "eflags" register on the stack, then do an "lcall".
|
||||
* This is how we change from using the kernel code segment to using
|
||||
* the dedicated lguest code segment, as well as jumping into the
|
||||
* Switcher.
|
||||
*
|
||||
* The lcall also pushes the old code segment (KERNEL_CS) onto the
|
||||
* stack, then the address of this call. This stack layout happens to
|
||||
* exactly match the stack of an interrupt... */
|
||||
asm volatile("pushf; lcall *lguest_entry"
|
||||
/* This is how we tell GCC that %eax ("a") and %ebx ("b")
|
||||
* are changed by this routine. The "=" means output. */
|
||||
: "=a"(clobber), "=b"(clobber)
|
||||
/* %eax contains the pages pointer. ("0" refers to the
|
||||
* 0-th argument above, ie "a"). %ebx contains the
|
||||
* physical address of the Guest's top-level page
|
||||
* directory. */
|
||||
: "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir))
|
||||
/* We tell gcc that all these registers could change,
|
||||
* which means we don't have to save and restore them in
|
||||
* the Switcher. */
|
||||
: "memory", "%edx", "%ecx", "%edi", "%esi");
|
||||
}
|
||||
/*:*/

/*H:040 This is the i386-specific code to set up and run the Guest.
 * Interrupts are disabled: we own the CPU. */
void lguest_arch_run_guest(struct lguest *lg)
{
	/* Remember the awfully-named TS bit?  If the Guest has asked to set
	 * it we set it now, so we can trap and pass that trap to the Guest
	 * if it uses the FPU. */
	if (lg->ts)
		lguest_set_ts();

	/* SYSENTER is an optimized way of doing system calls.  We can't
	 * allow it because it always jumps to privilege level 0.  A normal
	 * Guest won't try it because we don't advertise it in CPUID, but a
	 * malicious Guest (or malicious Guest userspace program) could, so
	 * we tell the CPU to disable it before running the Guest. */
	if (boot_cpu_has(X86_FEATURE_SEP))
		wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);

	/* Now we actually run the Guest.  It will pop back out when
	 * something interesting happens, and we can examine its registers to
	 * see what it was doing. */
	run_guest_once(lg, lguest_pages(raw_smp_processor_id()));

	/* The "regs" pointer contains two extra entries which are not really
	 * registers: a trap number which says what interrupt or trap made
	 * the switcher code come back, and an error code which some traps
	 * set. */

	/* If the Guest page faulted, then the cr2 register will tell us the
	 * bad virtual address.  We have to grab this now, because once we
	 * re-enable interrupts an interrupt could fault and thus overwrite
	 * cr2, or we could even move off to a different CPU. */
	if (lg->regs->trapnum == 14)
		lg->arch.last_pagefault = read_cr2();
	/* Similarly, if we took a trap because the Guest used the FPU, we
	 * have to restore the FPU it expects to see. */
	else if (lg->regs->trapnum == 7)
		math_state_restore();

	/* Restore SYSENTER if it's supposed to be on. */
	if (boot_cpu_has(X86_FEATURE_SEP))
		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
}

/*H:130 Our Guest is usually so well behaved; it never tries to do things it
 * isn't allowed to.  Unfortunately, Linux's paravirtual infrastructure isn't
 * quite complete, because it doesn't contain replacements for the Intel I/O
 * instructions.  As a result, the Guest sometimes fumbles across one during
 * the boot process as it probes for various things which are usually
 * attached to a PC.
 *
 * When the Guest uses one of these instructions, we get trap #13 (General
 * Protection Fault) and come here.  We see if it's one of those troublesome
 * instructions and skip over it.  We return true if we did. */
static int emulate_insn(struct lguest *lg)
{
	u8 insn;
	unsigned int insnlen = 0, in = 0, shift = 0;
	/* The eip contains the *virtual* address of the Guest's instruction:
	 * guest_pa just subtracts the Guest's page_offset. */
	unsigned long physaddr = guest_pa(lg, lg->regs->eip);

	/* This must be the Guest kernel trying to do something, not
	 * userspace!  The bottom two bits of the CS segment register are the
	 * privilege level. */
	if ((lg->regs->cs & 3) != GUEST_PL)
		return 0;

	/* Decoding x86 instructions is icky. */
	insn = lgread(lg, physaddr, u8);

	/* 0x66 is an "operand prefix".  It means it's using the upper 16
	 * bits of the eax register. */
	if (insn == 0x66) {
		shift = 16;
		/* The instruction is 1 byte so far, read the next byte. */
		insnlen = 1;
		insn = lgread(lg, physaddr + insnlen, u8);
	}

	/* We can ignore the lower bit for the moment and decode the 4
	 * opcodes we need to emulate. */
	switch (insn & 0xFE) {
	case 0xE4: /* in     <next byte>,%al */
		insnlen += 2;
		in = 1;
		break;
	case 0xEC: /* in     (%dx),%al */
		insnlen += 1;
		in = 1;
		break;
	case 0xE6: /* out    %al,<next byte> */
		insnlen += 2;
		break;
	case 0xEE: /* out    %al,(%dx) */
		insnlen += 1;
		break;
	default:
		/* OK, we don't know what this is, can't emulate. */
		return 0;
	}

	/* If it was an "IN" instruction, they expect the result to be read
	 * into %eax, so we change %eax.  We always return all-ones, which
	 * traditionally means "there's nothing there". */
	if (in) {
		/* The lower bit tells us whether it's a 16 or 32 bit
		 * access. */
		if (insn & 0x1)
			lg->regs->eax = 0xFFFFFFFF;
		else
			lg->regs->eax |= (0xFFFF << shift);
	}
	/* Finally, we've "done" the instruction, so move past it. */
	lg->regs->eip += insnlen;
	/* Success! */
	return 1;
}

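A quick aside on that "insn & 0xFE" mask: the 8-bit and 16/32-bit variants of each IN/OUT opcode differ only in bit 0, which is why one case label covers both.  This throwaway check (an illustration, not kernel code) demonstrates the pairing:

#include <assert.h>

int main(void)
{
	/* 0xE4 is "in imm8,%al"; 0xE5 is "in imm8,%eax": same high 7 bits. */
	assert((0xE4 & 0xFE) == (0xE5 & 0xFE));
	/* The (%dx) and "out" forms pair up the same way. */
	assert((0xEC & 0xFE) == (0xED & 0xFE));
	assert((0xE6 & 0xFE) == (0xE7 & 0xFE));
	assert((0xEE & 0xFE) == (0xEF & 0xFE));
	return 0;
}
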
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
void lguest_arch_handle_trap(struct lguest *lg)
{
	switch (lg->regs->trapnum) {
	case 13: /* We've intercepted a GPF. */
		/* Check if this was one of those annoying IN or OUT
		 * instructions which we need to emulate.  If so, we just go
		 * back into the Guest after we've done it. */
		if (lg->regs->errcode == 0) {
			if (emulate_insn(lg))
				return;
		}
		break;
	case 14: /* We've intercepted a page fault. */
		/* The Guest accessed a virtual address that wasn't mapped.
		 * This happens a lot: we don't actually set up most of the
		 * page tables for the Guest at all when we start: as it runs
		 * it asks for more and more, and we set them up as required.
		 * In this case, we don't even tell the Guest that the fault
		 * happened.
		 *
		 * The errcode tells whether this was a read or a write, and
		 * whether kernel or userspace code. */
		if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
			return;

		/* OK, it's really not there (or not OK): the Guest needs to
		 * know.  We write out the cr2 value so it knows where the
		 * fault occurred.
		 *
		 * Note that if the Guest were really messed up, this could
		 * happen before it's done the INITIALIZE hypercall, so
		 * lg->lguest_data will be NULL. */
		if (lg->lguest_data &&
		    put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
			kill_guest(lg, "Writing cr2");
		break;
	case 7: /* We've intercepted a Device Not Available fault. */
		/* If the Guest doesn't want to know, we already restored the
		 * Floating Point Unit, so we just continue without telling
		 * it. */
		if (!lg->ts)
			return;
		break;
	case 32 ... 255:
		/* These values mean a real interrupt occurred, in which case
		 * the Host handler has already been run.  We just do a
		 * friendly check if another process should now be run, then
		 * return to run the Guest again. */
		cond_resched();
		return;
	case LGUEST_TRAP_ENTRY:
		/* Our 'struct hcall_args' maps directly over our regs: we
		 * set up the pointer now to indicate a hypercall is
		 * pending. */
		lg->hcall = (struct hcall_args *)lg->regs;
		return;
	}

	/* We didn't handle the trap, so it needs to go to the Guest. */
	if (!deliver_trap(lg, lg->regs->trapnum))
		/* If the Guest doesn't have a handler (either it hasn't
		 * registered any yet, or it's one of the faults we don't let
		 * it handle), it dies with a cryptic error message. */
		kill_guest(lg, "unhandled trap %li at %#lx (%#lx)",
			   lg->regs->trapnum, lg->regs->eip,
			   lg->regs->trapnum == 14 ? lg->arch.last_pagefault
						   : lg->regs->errcode);
}

/* Now we can look at each of the routines this calls, in increasing order of
 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
 * deliver_trap() and demand_page().  After all those, we'll be ready to
 * examine the Switcher, and our philosophical understanding of the
 * Host/Guest duality will be complete. :*/
static void adjust_pge(void *on)
{
	if (on)
		write_cr4(read_cr4() | X86_CR4_PGE);
	else
		write_cr4(read_cr4() & ~X86_CR4_PGE);
}

/*H:020 Now the Switcher is mapped and everything else is ready, we need to
 * do some more i386-specific initialization. */
void __init lguest_arch_host_init(void)
{
	int i;

	/* Most of the i386/switcher.S doesn't care that it's been moved; on
	 * Intel, jumps are relative, and it doesn't access any references to
	 * external code or data.
	 *
	 * The only exception is the interrupt handlers in switcher.S: their
	 * addresses are placed in a table (default_idt_entries), so we need
	 * to update the table with the new addresses.  switcher_offset() is
	 * a convenience function which returns the distance between the
	 * builtin switcher code and the high-mapped copy we just made. */
	for (i = 0; i < IDT_ENTRIES; i++)
		default_idt_entries[i] += switcher_offset();

	/*
	 * Set up the Switcher's per-cpu areas.
	 *
	 * Each CPU gets two pages of its own within the high-mapped region
	 * (aka. "struct lguest_pages").  Much of this can be initialized
	 * now, but some depends on what Guest we are running (which is set
	 * up in copy_in_guest_info()).
	 */
	for_each_possible_cpu(i) {
		/* lguest_pages() returns this CPU's two pages. */
		struct lguest_pages *pages = lguest_pages(i);
		/* This is a convenience pointer to make the code fit one
		 * statement to a line. */
		struct lguest_ro_state *state = &pages->state;

		/* The Global Descriptor Table: the Host has a different one
		 * for each CPU.  We keep a descriptor for the GDT which says
		 * where it is and how big it is (the size is actually the
		 * last byte, not the size, hence the "-1"). */
		state->host_gdt_desc.size = GDT_SIZE-1;
		state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);

		/* All CPUs on the Host use the same Interrupt Descriptor
		 * Table, so we just use store_idt(), which gets this CPU's
		 * IDT descriptor. */
		store_idt(&state->host_idt_desc);

		/* The descriptors for the Guest's GDT and IDT can be filled
		 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
		 * ->guest_idt before actually running the Guest. */
		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
		state->guest_idt_desc.address = (long)&state->guest_idt;
		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
		state->guest_gdt_desc.address = (long)&state->guest_gdt;

		/* We know where we want the stack to be when the Guest
		 * enters the switcher: in pages->regs.  The stack grows
		 * upwards, so we start it at the end of that structure. */
		state->guest_tss.esp0 = (long)(&pages->regs + 1);
		/* And this is the GDT entry to use for the stack: we keep a
		 * couple of special LGUEST entries. */
		state->guest_tss.ss0 = LGUEST_DS;

		/* x86 can have a fine-grained bitmap which indicates what
		 * I/O ports the process can use.  We set it to the end of
		 * our structure, meaning "none". */
		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);

		/* Some GDT entries are the same across all Guests, so we can
		 * set them up now. */
		setup_default_gdt_entries(state);
		/* Most IDT entries are the same for all Guests, too. */
		setup_default_idt_entries(state, default_idt_entries);

		/* The Host needs to be able to use the LGUEST segments on
		 * this CPU, too, so put them in the Host GDT. */
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
	}

	/* In the Switcher, we want the %cs segment register to use the
	 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
	 * it will be undisturbed when we switch.  To change %cs and jump we
	 * need this structure to feed to Intel's "lcall" instruction. */
	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
	lguest_entry.segment = LGUEST_CS;

	/* Finally, we need to turn off "Page Global Enable".  PGE is an
	 * optimization where page table entries are specially marked to show
	 * they never change.  The Host kernel marks all the kernel pages
	 * this way because it's always present, even when userspace is
	 * running.
	 *
	 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
	 * switch to the Guest kernel.  If you don't disable this on all
	 * CPUs, you'll get really weird bugs that you'll chase for two
	 * days.
	 *
	 * I used to turn PGE off every time we switched to the Guest and
	 * back on when we return, but that slowed the Switcher down
	 * noticeably. */

	/* We don't need the complexity of CPUs coming and going while we're
	 * doing this. */
	lock_cpu_hotplug();
	if (cpu_has_pge) { /* We have a broader idea of "global". */
		/* Remember that this was originally set (for cleanup). */
		cpu_had_pge = 1;
		/* adjust_pge is a helper function which sets or unsets the
		 * PGE bit on its CPU, depending on the argument (0 ==
		 * unset). */
		on_each_cpu(adjust_pge, (void *)0, 0, 1);
		/* Turn off the feature in the global feature set. */
		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
	}
	unlock_cpu_hotplug();
}
/*:*/

void __exit lguest_arch_host_fini(void)
{
	/* If we had PGE before we started, turn it back on now. */
	lock_cpu_hotplug();
	if (cpu_had_pge) {
		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
		/* adjust_pge's argument "1" means set PGE. */
		on_each_cpu(adjust_pge, (void *)1, 0, 1);
	}
	unlock_cpu_hotplug();
}


/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args)
{
	switch (args->arg0) {
	case LHCALL_LOAD_GDT:
		load_guest_gdt(lg, args->arg1, args->arg2);
		break;
	case LHCALL_LOAD_IDT_ENTRY:
		load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3);
		break;
	case LHCALL_LOAD_TLS:
		guest_load_tls(lg, args->arg1);
		break;
	default:
		/* Bad Guest.  Bad! */
		return -EIO;
	}
	return 0;
}

/*H:126 i386-specific hypercall initialization: */
int lguest_arch_init_hypercalls(struct lguest *lg)
{
	u32 tsc_speed;

	/* The pointer to the Guest's "struct lguest_data" is the only
	 * argument.  We check that address now. */
	if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data)))
		return -EFAULT;

	/* Having checked it, we simply set lg->lguest_data to point straight
	 * into the Launcher's memory at the right place and then use
	 * copy_to_user/from_user from now on, instead of lgread/write.  I
	 * put this in to show that I'm not immune to writing stupid
	 * optimizations. */
	lg->lguest_data = lg->mem_base + lg->hcall->arg1;

	/* We insist that the Time Stamp Counter exists and doesn't change
	 * with cpu frequency.  Some devious chip manufacturers decided that
	 * TSC changes could be handled in software.  I decided that time
	 * going backwards might be good for benchmarks, but it's bad for
	 * users.
	 *
	 * We also insist that the TSC be stable: the kernel detects
	 * unreliable TSCs for its own purposes, and we use that here. */
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
		tsc_speed = tsc_khz;
	else
		tsc_speed = 0;
	if (put_user(tsc_speed, &lg->lguest_data->tsc_khz))
		return -EFAULT;

	/* The interrupt code might not like the system call vector. */
	if (!check_syscall_vector(lg))
		kill_guest(lg, "bad syscall vector");

	return 0;
}
/* Now we've examined the hypercall code; our Guest can make requests.  There
 * is one other way we can do things for the Guest, as we see in
 * emulate_insn(). :*/

/*L:030 lguest_arch_setup_regs()
 *
 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
 * allocate the structure, so they will be 0. */
void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
{
	struct lguest_regs *regs = lg->regs;

	/* There are four "segment" registers which the Guest needs to boot:
	 * The "code segment" register (cs) refers to the kernel code segment
	 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
	 * refer to the kernel data segment __KERNEL_DS.
	 *
	 * The privilege level is packed into the lower bits.  The Guest runs
	 * at privilege level 1 (GUEST_PL). */
	regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
	regs->cs = __KERNEL_CS|GUEST_PL;

	/* The "eflags" register contains miscellaneous flags.  Bit 1 (0x002)
	 * is supposed to always be "1".  Bit 9 (0x200) controls whether
	 * interrupts are enabled.  We always leave interrupts enabled while
	 * running the Guest. */
	regs->eflags = 0x202;

	/* The "Extended Instruction Pointer" register says where the Guest
	 * is running. */
	regs->eip = start;

	/* %esi points to our boot information, at physical address 0, so
	 * don't touch it. */

	/* There are a couple of GDT entries the Guest expects when first
	 * booting. */
	setup_guest_gdt(lg);
}
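The 0x202 above is just those two eflags bits ORed together.  A trivial standalone check (an illustration, not part of the patch) makes the arithmetic explicit:

#include <assert.h>

int main(void)
{
	unsigned long reserved_bit1  = 1UL << 1;  /* eflags bit 1: always 1 */
	unsigned long interrupt_flag = 1UL << 9;  /* eflags bit 9: IF */

	assert((reserved_bit1 | interrupt_flag) == 0x202);
	return 0;
}
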
@ -48,7 +48,8 @@
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/page.h>
#include "lg.h"
#include <asm/segment.h>
#include <asm/lguest.h>

// We mark the start of the code to copy
// It's placed in .text tho it's never run here
@ -132,6 +133,7 @@ ENTRY(switch_to_guest)
	// The Guest's register page has been mapped
	// Writable onto our %esp (stack) --
	// We can simply pop off all Guest regs.
	popl	%eax
	popl	%ebx
	popl	%ecx
	popl	%edx
@ -139,7 +141,6 @@ ENTRY(switch_to_guest)
	popl	%edi
	popl	%ebp
	popl	%gs
	popl	%eax
	popl	%fs
	popl	%ds
	popl	%es
@ -167,7 +168,6 @@ ENTRY(switch_to_guest)
	pushl	%es;		\
	pushl	%ds;		\
	pushl	%fs;		\
	pushl	%eax;		\
	pushl	%gs;		\
	pushl	%ebp;		\
	pushl	%edi;		\
@ -175,6 +175,7 @@ ENTRY(switch_to_guest)
	pushl	%edx;		\
	pushl	%ecx;		\
	pushl	%ebx;		\
	pushl	%eax;		\
	/* Our stack and our code are using segments	\
	 * Set in the TSS and IDT			\
	 * Yet if we were to touch data we'd use	\
@ -3100,4 +3100,10 @@ config NETPOLL_TRAP
config NET_POLL_CONTROLLER
	def_bool NETPOLL

config VIRTIO_NET
	tristate "Virtio network driver (EXPERIMENTAL)"
	depends on EXPERIMENTAL && VIRTIO
	---help---
	  This is the virtual network driver for lguest.  Say Y or M.

endif # NETDEVICES

@ -183,7 +183,6 @@ obj-$(CONFIG_ZORRO8390) += zorro8390.o
obj-$(CONFIG_HPLANCE) += hplance.o 7990.o
obj-$(CONFIG_MVME147_NET) += mvme147.o 7990.o
obj-$(CONFIG_EQUALIZER) += eql.o
obj-$(CONFIG_LGUEST_NET) += lguest_net.o
obj-$(CONFIG_MIPS_JAZZ_SONIC) += jazzsonic.o
obj-$(CONFIG_MIPS_AU1X00_ENET) += au1000_eth.o
obj-$(CONFIG_MIPS_SIM_NET) += mipsnet.o
@ -243,3 +242,4 @@ obj-$(CONFIG_FS_ENET) += fs_enet/

obj-$(CONFIG_NETXEN_NIC) += netxen/
obj-$(CONFIG_NIU) += niu.o
obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
@ -1,555 +0,0 @@
/*D:500
 * The Guest network driver.
 *
 * This is a very simple virtual network driver, and our last Guest driver.
 * The only trick is that it can talk directly to multiple other recipients
 * (ie. other Guests on the same network).  It can also be used with only the
 * Host on the network.
 :*/

/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/module.h>
#include <linux/mm_types.h>
#include <linux/io.h>
#include <linux/lguest_bus.h>

#define SHARED_SIZE PAGE_SIZE
#define MAX_LANS 4
#define NUM_SKBS 8

/*M:011 Network code master Jeff Garzik points out numerous shortcomings in
 * this driver if it aspires to greatness.
 *
 * Firstly, it doesn't use "NAPI": the networking's New API, and is poorer
 * for it.  As he says "NAPI means system-wide load leveling, across multiple
 * network interfaces.  Lack of NAPI can mean competition at higher loads."
 *
 * He also points out that we don't implement set_mac_address, so users
 * cannot change the device's hardware address.  When I asked why one would
 * want to: "Bonding, and situations where you /do/ want the MAC address to
 * "leak" out of the host onto the wider net."
 *
 * Finally, he would like module unloading: "It is not unrealistic to think
 * of [un|re|]loading the net support module in an lguest guest.  And, adding
 * module support makes the programmer more responsible, because they now
 * have to learn to clean up after themselves.  Any driver that cannot clean
 * up after itself is an incomplete driver in my book."
 :*/

/*D:530 The "struct lguestnet_info" contains all the information we need to
 * know about the network device. */
struct lguestnet_info
{
	/* The mapped device page(s) (an array of "struct lguest_net"). */
	struct lguest_net *peer;
	/* The physical address of the device page(s) */
	unsigned long peer_phys;
	/* The size of the device page(s). */
	unsigned long mapsize;

	/* The lguest_device I come from */
	struct lguest_device *lgdev;

	/* My peerid (ie. my slot in the array). */
	unsigned int me;

	/* Receive queue: the network packets waiting to be filled. */
	struct sk_buff *skb[NUM_SKBS];
	struct lguest_dma dma[NUM_SKBS];
};
/*:*/

/* How many bytes left in this page. */
static unsigned int rest_of_page(void *data)
{
	return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
}

/*D:570 Each peer (ie. Guest or Host) on the network binds their receive
 * buffers to a different key: we simply use the physical address of the
 * device's memory page plus the peer number.  The Host insists that all keys
 * be a multiple of 4, so we multiply the peer number by 4. */
static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum)
{
	return info->peer_phys + 4 * peernum;
}

/* This is the routine which sets up a "struct lguest_dma" to point to a
 * network packet, similar to req_to_dma() in lguest_blk.c.  The structure of
 * a "struct sk_buff" has grown complex over the years: it consists of a
 * "head" linear section pointed to by "skb->data", and possibly an array of
 * "fragments" in the case of a non-linear packet.
 *
 * Our receive buffers don't use fragments at all but outgoing skbs might, so
 * we handle it. */
static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen,
		       struct lguest_dma *dma)
{
	unsigned int i, seg;

	/* First, we put the linear region into the "struct lguest_dma".
	 * Each entry can't go over a page boundary, so even though all our
	 * packets are 1514 bytes or less, we might need to use two entries
	 * here: */
	for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) {
		dma->addr[seg] = virt_to_phys(skb->data + i);
		dma->len[seg] = min((unsigned)(headlen - i),
				    rest_of_page(skb->data + i));
	}

	/* Now we handle the fragments: at least they're guaranteed not to go
	 * over a page.  skb_shinfo(skb) returns a pointer to the structure
	 * which tells us about the number of fragments and the fragment
	 * array. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) {
		const skb_frag_t *f = &skb_shinfo(skb)->frags[i];
		/* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */
		if (seg == LGUEST_MAX_DMA_SECTIONS) {
			/* We will end up sending a truncated packet should
			 * this ever happen.  Plus, a cool log message! */
			printk("Woah dude!  Megapacket!\n");
			break;
		}
		dma->addr[seg] = page_to_phys(f->page) + f->page_offset;
		dma->len[seg] = f->size;
	}

	/* If after all that we didn't use the entire "struct lguest_dma"
	 * array, we terminate it with a 0 length. */
	if (seg < LGUEST_MAX_DMA_SECTIONS)
		dma->len[seg] = 0;
}

/*
 * Packet transmission.
 *
 * Our packet transmission is a little unusual.  A real network card would
 * just send out the packet and leave the receivers to decide if they're
 * interested.  Instead, we look through the network device memory page and
 * see if any of the ethernet addresses match the packet destination, and if
 * so we send it to that Guest.
 *
 * This is made a little more complicated in two cases.  The first case is
 * broadcast packets: for that we send the packet to all Guests on the
 * network, one at a time.  The second case is "promiscuous" mode, where a
 * Guest wants to see all the packets on the network.  We need a way for the
 * Guest to tell us it wants to see all packets, so it sets the "multicast"
 * bit on its published MAC address, which is never valid in a real ethernet
 * address.
 */
#define PROMISC_BIT 0x01

/* This is the callback which is summoned whenever the network device's
 * multicast or promiscuous state changes.  If the card is in promiscuous
 * mode, we advertise that in our ethernet address in the device's memory.
 * We do the same if Linux wants any or all multicast traffic. */
static void lguestnet_set_multicast(struct net_device *dev)
{
	struct lguestnet_info *info = netdev_priv(dev);

	if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count)
		info->peer[info->me].mac[0] |= PROMISC_BIT;
	else
		info->peer[info->me].mac[0] &= ~PROMISC_BIT;
}

/* A simple test function to see if a peer wants to see all packets. */
static int promisc(struct lguestnet_info *info, unsigned int peer)
{
	return info->peer[peer].mac[0] & PROMISC_BIT;
}

/* Another simple function to see if a peer's advertised ethernet address
 * matches a packet's destination ethernet address. */
static int mac_eq(const unsigned char mac[ETH_ALEN],
		  struct lguestnet_info *info, unsigned int peer)
{
	/* Ignore multicast bit, which peer turns on to mean promisc. */
	if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0])
		return 0;
	return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0;
}

/* This is the function which actually sends a packet once we've decided a
 * peer wants it: */
static void transfer_packet(struct net_device *dev,
			    struct sk_buff *skb,
			    unsigned int peernum)
{
	struct lguestnet_info *info = netdev_priv(dev);
	struct lguest_dma dma;

	/* We use our handy "struct lguest_dma" packing function to prepare
	 * the skb for sending. */
	skb_to_dma(skb, skb_headlen(skb), &dma);
	pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len);

	/* This is the actual send call which copies the packet. */
	lguest_send_dma(peer_key(info, peernum), &dma);

	/* Check that the entire packet was transmitted.  If not, it could
	 * mean that the other Guest registered a short receive buffer, but
	 * this driver should never do that.  More likely, the peer is
	 * dead. */
	if (dma.used_len != skb->len) {
		dev->stats.tx_carrier_errors++;
		pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n",
			 peernum, dma.used_len, skb->len,
			 (void *)dma.addr[0], dma.len[0]);
	} else {
		/* On success we update the stats. */
		dev->stats.tx_bytes += skb->len;
		dev->stats.tx_packets++;
	}
}

/* Another helper function to tell us if a slot in the device memory is
 * unused.  Since we always set the Local Assignment bit in the ethernet
 * address, the first byte can never be 0. */
static int unused_peer(const struct lguest_net peer[], unsigned int num)
{
	return peer[num].mac[0] == 0;
}

/* Finally, here is the routine which handles an outgoing packet.  It's
 * called "start_xmit" for traditional reasons. */
static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	unsigned int i;
	int broadcast;
	struct lguestnet_info *info = netdev_priv(dev);
	/* Extract the destination ethernet address from the packet. */
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	DECLARE_MAC_BUF(mac);

	pr_debug("%s: xmit %s\n", dev->name, print_mac(mac, dest));

	/* If it's a multicast packet, we broadcast to everyone.  That's not
	 * very efficient, but there are very few applications which actually
	 * use multicast, which is a shame really.
	 *
	 * As etherdevice.h points out: "By definition the broadcast address
	 * is also a multicast address."  So we don't have to test for
	 * broadcast packets separately. */
	broadcast = is_multicast_ether_addr(dest);

	/* Look through all the published ethernet addresses to see if we
	 * should send this packet. */
	for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) {
		/* We don't send to ourselves (we actually can't SEND_DMA to
		 * ourselves anyway), and don't send to unused slots. */
		if (i == info->me || unused_peer(info->peer, i))
			continue;

		/* If it's broadcast we send it.  If they want every packet
		 * we send it.  If the destination matches their address we
		 * send it.  Otherwise we go to the next peer. */
		if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i))
			continue;

		pr_debug("lguestnet %s: sending from %i to %i\n",
			 dev->name, info->me, i);
		/* Our routine which actually does the transfer. */
		transfer_packet(dev, skb, i);
	}

	/* An xmit routine is expected to dispose of the packet, so we do. */
	dev_kfree_skb(skb);

	/* As per kernel convention, 0 means success.  This is why I love
	 * networking: even if we never sent to anyone, that's still
	 * success! */
	return 0;
}

/*D:560
 * Packet receiving.
 *
 * First, here's a helper routine which fills one of our array of receive
 * buffers: */
static int fill_slot(struct net_device *dev, unsigned int slot)
{
	struct lguestnet_info *info = netdev_priv(dev);

	/* We can receive ETH_DATA_LEN (1500) byte packets, plus a standard
	 * ethernet header of ETH_HLEN (14) bytes. */
	info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN);
	if (!info->skb[slot]) {
		printk("%s: could not fill slot %i\n", dev->name, slot);
		return -ENOMEM;
	}

	/* skb_to_dma() is a helper which sets up the "struct lguest_dma" to
	 * point to the data in the skb: we also use it for sending out a
	 * packet. */
	skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]);

	/* This is a Write Memory Barrier: it ensures that the entry in the
	 * receive buffer array is written *before* we set the "used_len"
	 * entry to 0.  If the Host were looking at the receive buffer array
	 * from a different CPU, it could potentially see "used_len = 0" and
	 * not see the updated receive buffer information.  This would be a
	 * horribly nasty bug, so make sure the compiler and CPU know this
	 * has to happen first. */
	wmb();
	/* Writing 0 to "used_len" tells the Host it can use this receive
	 * buffer now. */
	info->dma[slot].used_len = 0;
	return 0;
}

/* This is the actual receive routine.  When we receive an interrupt from
 * the Host to tell us a packet has been delivered, we arrive here: */
static irqreturn_t lguestnet_rcv(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	struct lguestnet_info *info = netdev_priv(dev);
	unsigned int i, done = 0;

	/* Look through our entire receive array for an entry which has data
	 * in it. */
	for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
		unsigned int length;
		struct sk_buff *skb;

		length = info->dma[i].used_len;
		if (length == 0)
			continue;

		/* We've found one!  Remember the skb (we grabbed the length
		 * above), and immediately refill the slot we've taken it
		 * from. */
		done++;
		skb = info->skb[i];
		fill_slot(dev, i);

		/* This shouldn't happen: micropackets could be sent by a
		 * badly-behaved Guest on the network, but the Host will
		 * never stuff more data in the buffer than the buffer
		 * length. */
		if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) {
			pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n",
				 dev->name, length);
			dev_kfree_skb(skb);
			continue;
		}

		/* skb_put(), what a great function!  I've ranted about this
		 * function before (http://lkml.org/lkml/1999/9/26/24).  You
		 * call it after you've added data to the end of an skb (in
		 * this case, it was the Host which wrote the data). */
		skb_put(skb, length);

		/* The ethernet header contains a protocol field: we use the
		 * standard helper to extract it, and place the result in
		 * skb->protocol.  The helper also sets up skb->pkt_type and
		 * eats up the ethernet header from the front of the
		 * packet. */
		skb->protocol = eth_type_trans(skb, dev);

		/* If this device doesn't need checksums for sending, we also
		 * don't need to check the packets when they come in. */
		if (dev->features & NETIF_F_NO_CSUM)
			skb->ip_summed = CHECKSUM_UNNECESSARY;

		/* As a last resort for debugging the driver or the lguest
		 * I/O subsystem, you can uncomment the "#define DEBUG" at
		 * the top of this file, which turns all the pr_debug() into
		 * printk() and floods the logs. */
		pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
			 ntohs(skb->protocol), skb->len, skb->pkt_type);

		/* Update the packet and byte counts (visible from ifconfig,
		 * and good for debugging). */
		dev->stats.rx_bytes += skb->len;
		dev->stats.rx_packets++;

		/* Hand our fresh network packet into the stack's "network
		 * interface receive" routine.  That will free the packet
		 * itself when it's finished. */
		netif_rx(skb);
	}

	/* If we found any packets, we assume the interrupt was for us. */
	return done ? IRQ_HANDLED : IRQ_NONE;
}

/*D:550 This is where we start: when the device is brought up by dhcpd or
 * ifconfig.  At this point we advertise our MAC address to the rest of the
 * network, and register receive buffers ready for incoming packets. */
static int lguestnet_open(struct net_device *dev)
{
	int i;
	struct lguestnet_info *info = netdev_priv(dev);

	/* Copy our MAC address into the device page, so others on the
	 * network can find us. */
	memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN);

	/* We might already be in promisc mode (dev->flags & IFF_PROMISC).
	 * Our set_multicast callback handles this already, so we call it
	 * now. */
	lguestnet_set_multicast(dev);

	/* Allocate packets and put them into our "struct lguest_dma" array.
	 * If we fail to allocate all the packets we could still limp along,
	 * but it's a sign of real stress so we should probably give up
	 * now. */
	for (i = 0; i < ARRAY_SIZE(info->dma); i++) {
		if (fill_slot(dev, i) != 0)
			goto cleanup;
	}

	/* Finally we tell the Host where our array of "struct lguest_dma"
	 * receive buffers is, binding it to the key corresponding to the
	 * device's physical memory plus our peerid. */
	if (lguest_bind_dma(peer_key(info,info->me), info->dma,
			    NUM_SKBS, lgdev_irq(info->lgdev)) != 0)
		goto cleanup;
	return 0;

cleanup:
	while (--i >= 0)
		dev_kfree_skb(info->skb[i]);
	return -ENOMEM;
}
/*:*/

/* The close routine is called when the device is no longer in use: we clean
 * up elegantly. */
static int lguestnet_close(struct net_device *dev)
{
	unsigned int i;
	struct lguestnet_info *info = netdev_priv(dev);

	/* Clear all trace of our existence out of the device memory by
	 * setting the slot which held our MAC address to 0 (unused). */
	memset(&info->peer[info->me], 0, sizeof(info->peer[info->me]));

	/* Unregister our array of receive buffers */
	lguest_unbind_dma(peer_key(info, info->me), info->dma);
	for (i = 0; i < ARRAY_SIZE(info->dma); i++)
		dev_kfree_skb(info->skb[i]);
	return 0;
}

/*D:510 The network device probe function is basically a standard ethernet
 * device setup.  It reads the "struct lguest_device_desc" and sets the
 * "struct net_device".  Oh, the line-by-line excitement!  Let's skip over
 * it. :*/
static int lguestnet_probe(struct lguest_device *lgdev)
{
	int err, irqf = IRQF_SHARED;
	struct net_device *dev;
	struct lguestnet_info *info;
	struct lguest_device_desc *desc = &lguest_devices[lgdev->index];

	pr_debug("lguest_net: probing for device %i\n", lgdev->index);

	dev = alloc_etherdev(sizeof(struct lguestnet_info));
	if (!dev)
		return -ENOMEM;

	/* Ethernet defaults with some changes */
	ether_setup(dev);
	dev->set_mac_address = NULL;

	dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */
	dev->dev_addr[1] = 0x00;
	memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2);
	dev->dev_addr[4] = 0x00;
	dev->dev_addr[5] = 0x00;

	dev->open = lguestnet_open;
	dev->stop = lguestnet_close;
	dev->hard_start_xmit = lguestnet_start_xmit;

	/* We don't actually support multicast yet, but turning on/off
	 * promisc also calls dev->set_multicast_list. */
	dev->set_multicast_list = lguestnet_set_multicast;
	SET_NETDEV_DEV(dev, &lgdev->dev);

	/* The network code complains if you have "scatter-gather"
	 * capability if you don't also handle checksums (it seems that
	 * would be "illogical").  So we use a lie of omission and don't
	 * tell it that we can handle scattered packets unless we also don't
	 * want checksums, even though to us they're completely
	 * independent. */
	if (desc->features & LGUEST_NET_F_NOCSUM)
		dev->features = NETIF_F_SG|NETIF_F_NO_CSUM;

	info = netdev_priv(dev);
	info->mapsize = PAGE_SIZE * desc->num_pages;
	info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT);
	info->lgdev = lgdev;
	info->peer = lguest_map(info->peer_phys, desc->num_pages);
	if (!info->peer) {
		err = -ENOMEM;
		goto free;
	}

	/* This stores our peerid (upper bits reserved for future). */
	info->me = (desc->features & (info->mapsize-1));

	err = register_netdev(dev);
	if (err) {
		pr_debug("lguestnet: registering device failed\n");
		goto unmap;
	}

	if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
		irqf |= IRQF_SAMPLE_RANDOM;
	if (request_irq(lgdev_irq(lgdev), lguestnet_rcv, irqf, "lguestnet",
			dev) != 0) {
		pr_debug("lguestnet: cannot get irq %i\n", lgdev_irq(lgdev));
		goto unregister;
	}

	pr_debug("lguestnet: registered device %s\n", dev->name);
	/* Finally, we put the "struct net_device" in the generic "struct
	 * lguest_device"s private pointer.  Again, it's not necessary, but
	 * makes sure the cool kernel kids don't tease us. */
	lgdev->private = dev;
	return 0;

unregister:
	unregister_netdev(dev);
unmap:
	lguest_unmap(info->peer);
free:
	free_netdev(dev);
	return err;
}

static struct lguest_driver lguestnet_drv = {
	.name = "lguestnet",
	.owner = THIS_MODULE,
	.device_type = LGUEST_DEVICE_T_NET,
	.probe = lguestnet_probe,
};

static __init int lguestnet_init(void)
{
	return register_lguest_driver(&lguestnet_drv);
}
module_init(lguestnet_init);

MODULE_DESCRIPTION("Lguest network driver");
MODULE_LICENSE("GPL");

/*D:580
 * This is the last of the Drivers, and with this we have covered the many
 * and wondrous and fine (and boring) details of the Guest.
 *
 * "make Launcher" beckons, where we answer questions like "Where do Guests
 * come from?", and "What do you do when someone asks for optimization?"
 */
435
drivers/net/virtio_net.c
Normal file
@ -0,0 +1,435 @@
/* A simple network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/scatterlist.h>

/* FIXME: MTU in config. */
#define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN)

struct virtnet_info
{
	struct virtio_device *vdev;
	struct virtqueue *rvq, *svq;
	struct net_device *dev;
	struct napi_struct napi;

	/* Number of input buffers, and max we've ever had. */
	unsigned int num, max;

	/* Receive & send queues. */
	struct sk_buff_head recv;
	struct sk_buff_head send;
};

static inline struct virtio_net_hdr *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr *)skb->cb;
}

static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb)
{
	sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr));
}
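
skb_vnet_hdr() stashes the virtio header in the skb's control block, which only works while the header fits in cb's 48 bytes.  A compile-time guard along these lines (a sketch, not part of the original patch) would make that assumption explicit:

/* Sketch: fail the build if struct virtio_net_hdr ever outgrows skb->cb. */
static inline void check_vnet_hdr_fits(void)
{
	BUILD_BUG_ON(sizeof(struct virtio_net_hdr) >
		     sizeof(((struct sk_buff *)0)->cb));
}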

static bool skb_xmit_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;

	/* In case we were waiting for output buffers. */
	netif_wake_queue(vi->dev);
	return true;
}

static void receive_skb(struct net_device *dev, struct sk_buff *skb,
			unsigned len)
{
	struct virtio_net_hdr *hdr = skb_vnet_hdr(skb);

	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		goto drop;
	}
	len -= sizeof(struct virtio_net_hdr);
	BUG_ON(len > MAX_PACKET_LEN);

	skb_trim(skb, len);
	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);
	dev->stats.rx_bytes += skb->len;
	dev->stats.rx_packets++;

	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		pr_debug("Needs csum!\n");
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum_start = hdr->csum_start;
		skb->csum_offset = hdr->csum_offset;
		if (skb->csum_start > skb->len - 2
		    || skb->csum_offset > skb->len - 2) {
			if (net_ratelimit())
				printk(KERN_WARNING "%s: csum=%u/%u len=%u\n",
				       dev->name, skb->csum_start,
				       skb->csum_offset, skb->len);
			goto frame_err;
		}
	}

	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		pr_debug("GSO!\n");
		switch (hdr->gso_type) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV4_ECN:
			skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN;
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
			break;
		default:
			if (net_ratelimit())
				printk(KERN_WARNING "%s: bad gso type %u.\n",
				       dev->name, hdr->gso_type);
			goto frame_err;
		}

		skb_shinfo(skb)->gso_size = hdr->gso_size;
		if (skb_shinfo(skb)->gso_size == 0) {
			if (net_ratelimit())
				printk(KERN_WARNING "%s: zero gso size.\n",
				       dev->name);
			goto frame_err;
		}

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;
	}

	netif_receive_skb(skb);
	return;

frame_err:
	dev->stats.rx_frame_errors++;
drop:
	dev_kfree_skb(skb);
}

static void try_fill_recv(struct virtnet_info *vi)
{
	struct sk_buff *skb;
	struct scatterlist sg[1+MAX_SKB_FRAGS];
	int num, err;

	for (;;) {
		skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN);
		if (unlikely(!skb))
			break;

		skb_put(skb, MAX_PACKET_LEN);
		vnet_hdr_to_sg(sg, skb);
		num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
		skb_queue_head(&vi->recv, skb);

		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
		if (err) {
			skb_unlink(skb, &vi->recv);
			kfree_skb(skb);
			break;
		}
		vi->num++;
	}
	if (unlikely(vi->num > vi->max))
		vi->max = vi->num;
	vi->rvq->vq_ops->kick(vi->rvq);
}

static bool skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	netif_rx_schedule(vi->dev, &vi->napi);
	/* Suppress further interrupts. */
	return false;
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
	struct sk_buff *skb = NULL;
	unsigned int len, received = 0;

again:
	while (received < budget &&
	       (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
		__skb_unlink(skb, &vi->recv);
		receive_skb(vi->dev, skb, len);
		vi->num--;
		received++;
	}

	/* FIXME: If we oom and completely run out of inbufs, we need
	 * to start a timer trying to fill more. */
	if (vi->num < vi->max / 2)
		try_fill_recv(vi);

	/* All done? */
	if (!skb) {
		netif_rx_complete(vi->dev, napi);
		if (unlikely(!vi->rvq->vq_ops->restart(vi->rvq))
		    && netif_rx_reschedule(vi->dev, napi))
			goto again;
	}

	return received;
}

static void free_old_xmit_skbs(struct virtnet_info *vi)
{
	struct sk_buff *skb;
	unsigned int len;

	while ((skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);
		__skb_unlink(skb, &vi->send);
		vi->dev->stats.tx_bytes += len;
		vi->dev->stats.tx_packets++;
		kfree_skb(skb);
	}
}

static int start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int num, err;
	struct scatterlist sg[1+MAX_SKB_FRAGS];
	struct virtio_net_hdr *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	DECLARE_MAC_BUF(mac);

	pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest));

	free_old_xmit_skbs(vi);

	/* Encode metadata header at front. */
	hdr = skb_vnet_hdr(skb);
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		hdr->csum_start = skb->csum_start - skb_headroom(skb);
		hdr->csum_offset = skb->csum_offset;
	} else {
		hdr->flags = 0;
		hdr->csum_offset = hdr->csum_start = 0;
	}

	if (skb_is_gso(skb)) {
		hdr->gso_size = skb_shinfo(skb)->gso_size;
		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
			hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN;
		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
			hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
			hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
			hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
		else
			BUG();
	} else {
		hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
		hdr->gso_size = 0;
	}

	vnet_hdr_to_sg(sg, skb);
	num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
	__skb_queue_head(&vi->send, skb);
	err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb);
	if (err) {
		pr_debug("%s: virtio not prepared to send\n", dev->name);
		skb_unlink(skb, &vi->send);
		netif_stop_queue(dev);
		return NETDEV_TX_BUSY;
	}
	vi->svq->vq_ops->kick(vi->svq);

	return 0;
}

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	try_fill_recv(vi);

	/* If we didn't even get one input buffer, we're useless. */
	if (vi->num == 0)
		return -ENOMEM;

	napi_enable(&vi->napi);
	return 0;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct sk_buff *skb;

	napi_disable(&vi->napi);

	/* networking core has neutered skb_xmit_done/skb_recv_done, so don't
	 * worry about races vs. get(). */
	vi->rvq->vq_ops->shutdown(vi->rvq);
	while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
		kfree_skb(skb);
		vi->num--;
	}
	vi->svq->vq_ops->shutdown(vi->svq);
	while ((skb = __skb_dequeue(&vi->send)) != NULL)
		kfree_skb(skb);

	BUG_ON(vi->num != 0);
	return 0;
}

static int virtnet_probe(struct virtio_device *vdev)
{
	int err;
	unsigned int len;
	struct net_device *dev;
	struct virtnet_info *vi;
	void *token;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev(sizeof(struct virtnet_info));
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	ether_setup(dev);
	dev->open = virtnet_open;
	dev->stop = virtnet_close;
	dev->hard_start_xmit = start_xmit;
	dev->features = NETIF_F_HIGHDMA;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_F, &len);
	if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_NO_CSUM)) {
		/* This opens up the world of extra features. */
		dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
		if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4))
			dev->features |= NETIF_F_TSO;
		if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_UFO))
			dev->features |= NETIF_F_UFO;
		if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4_ECN))
			dev->features |= NETIF_F_TSO_ECN;
		if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO6))
			dev->features |= NETIF_F_TSO6;
	}

	/* Configuration may specify what MAC to use.  Otherwise random. */
	token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_MAC_F, &len);
	if (token) {
		dev->addr_len = len;
		vdev->config->get(vdev, token, dev->dev_addr, len);
	} else
		random_ether_addr(dev->dev_addr);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	netif_napi_add(dev, &vi->napi, virtnet_poll, 16);
	vi->dev = dev;
	vi->vdev = vdev;

	/* We expect two virtqueues, receive then send. */
	vi->rvq = vdev->config->find_vq(vdev, skb_recv_done);
	if (IS_ERR(vi->rvq)) {
		err = PTR_ERR(vi->rvq);
		goto free;
	}

	vi->svq = vdev->config->find_vq(vdev, skb_xmit_done);
	if (IS_ERR(vi->svq)) {
		err = PTR_ERR(vi->svq);
		goto free_recv;
	}

	/* Initialize our empty receive and send queues. */
	skb_queue_head_init(&vi->recv);
	skb_queue_head_init(&vi->send);

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_send;
	}
	pr_debug("virtnet: registered device %s\n", dev->name);
	vdev->priv = vi;
	return 0;

free_send:
	vdev->config->del_vq(vi->svq);
free_recv:
	vdev->config->del_vq(vi->rvq);
free:
	free_netdev(dev);
	return err;
}

static void virtnet_remove(struct virtio_device *vdev)
{
	/* vdev->priv is the virtnet_info, not the net_device itself. */
	struct virtnet_info *vi = vdev->priv;

	unregister_netdev(vi->dev);
	free_netdev(vi->dev);
}

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static struct virtio_driver virtio_net = {
	.driver.name =	KBUILD_MODNAME,
	.driver.owner =	THIS_MODULE,
	.id_table =	id_table,
	.probe =	virtnet_probe,
	.remove =	__devexit_p(virtnet_remove),
};

static int __init init(void)
{
	return register_virtio_driver(&virtio_net);
}

static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_net);
}
module_init(init);
module_exit(fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");
8	drivers/virtio/Kconfig	Normal file
@ -0,0 +1,8 @@
# Virtio always gets selected by whoever wants it.
config VIRTIO
	bool

# Similarly the virtio ring implementation.
config VIRTIO_RING
	bool
	depends on VIRTIO
2	drivers/virtio/Makefile	Normal file
@ -0,0 +1,2 @@
obj-$(CONFIG_VIRTIO) += virtio.o
obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o
13	drivers/virtio/config.c	Normal file
@ -0,0 +1,13 @@
/* Configuration space parsing helpers for virtio.
 *
 * The configuration is [type][len][... len bytes ...] fields.
 *
 * Copyright 2007 Rusty Russell, IBM Corporation.
 * GPL v2 or later.
 */
#include <linux/err.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/bug.h>
#include <asm/system.h>

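A minimal userspace sketch (not part of the commit) of walking the [type][len][... len bytes ...] layout the header comment above describes. The buffer contents and field type 0x41 are illustrative assumptions only.

	#include <stdio.h>
	#include <stddef.h>

	static const unsigned char *find_field(const unsigned char *buf, size_t size,
					       unsigned char type, unsigned int *len)
	{
		size_t i = 0;

		while (i + 2 <= size) {
			unsigned char t = buf[i], l = buf[i + 1];

			if (i + 2 + l > size)
				break;			/* truncated field: stop */
			if (t == type) {
				*len = l;
				return buf + i + 2;	/* payload follows the header */
			}
			i += 2 + l;			/* skip to the next field */
		}
		return NULL;
	}

	int main(void)
	{
		/* One hypothetical field: type 0x41, 4 payload bytes. */
		const unsigned char cfg[] = { 0x41, 0x04, 0xde, 0xad, 0xbe, 0xef };
		unsigned int len;

		if (find_field(cfg, sizeof(cfg), 0x41, &len))
			printf("found type 0x41, len %u\n", len);
		return 0;
	}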
189	drivers/virtio/virtio.c	Normal file
@ -0,0 +1,189 @@
#include <linux/virtio.h>
#include <linux/spinlock.h>
#include <linux/virtio_config.h>

static ssize_t device_show(struct device *_d,
			   struct device_attribute *attr, char *buf)
{
	struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
	return sprintf(buf, "%hu", dev->id.device);
}
static ssize_t vendor_show(struct device *_d,
			   struct device_attribute *attr, char *buf)
{
	struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
	return sprintf(buf, "%hu", dev->id.vendor);
}
static ssize_t status_show(struct device *_d,
			   struct device_attribute *attr, char *buf)
{
	struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
	return sprintf(buf, "0x%08x", dev->config->get_status(dev));
}
static ssize_t modalias_show(struct device *_d,
			     struct device_attribute *attr, char *buf)
{
	struct virtio_device *dev = container_of(_d,struct virtio_device,dev);

	return sprintf(buf, "virtio:d%08Xv%08X\n",
		       dev->id.device, dev->id.vendor);
}
static struct device_attribute virtio_dev_attrs[] = {
	__ATTR_RO(device),
	__ATTR_RO(vendor),
	__ATTR_RO(status),
	__ATTR_RO(modalias),
	__ATTR_NULL
};

static inline int virtio_id_match(const struct virtio_device *dev,
				  const struct virtio_device_id *id)
{
	if (id->device != dev->id.device)
		return 0;

	return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor != dev->id.vendor;
}

/* This looks through all the IDs a driver claims to support.  If any of them
 * match, we return 1 and the kernel will call virtio_dev_probe(). */
static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
{
	unsigned int i;
	struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
	const struct virtio_device_id *ids;

	ids = container_of(_dr, struct virtio_driver, driver)->id_table;
	for (i = 0; ids[i].device; i++)
		if (virtio_id_match(dev, &ids[i]))
			return 1;
	return 0;
}

static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env)
{
	struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);

	return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X",
			      dev->id.device, dev->id.vendor);
}

static struct bus_type virtio_bus = {
	.name  = "virtio",
	.match = virtio_dev_match,
	.dev_attrs = virtio_dev_attrs,
	.uevent = virtio_uevent,
};

static void add_status(struct virtio_device *dev, unsigned status)
{
	dev->config->set_status(dev, dev->config->get_status(dev) | status);
}

static int virtio_dev_probe(struct device *_d)
{
	int err;
	struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
	struct virtio_driver *drv = container_of(dev->dev.driver,
						 struct virtio_driver, driver);

	add_status(dev, VIRTIO_CONFIG_S_DRIVER);
	err = drv->probe(dev);
	if (err)
		add_status(dev, VIRTIO_CONFIG_S_FAILED);
	else
		add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
	return err;
}

int register_virtio_driver(struct virtio_driver *driver)
{
	driver->driver.bus = &virtio_bus;
	driver->driver.probe = virtio_dev_probe;
	return driver_register(&driver->driver);
}
EXPORT_SYMBOL_GPL(register_virtio_driver);

void unregister_virtio_driver(struct virtio_driver *driver)
{
	driver_unregister(&driver->driver);
}
EXPORT_SYMBOL_GPL(unregister_virtio_driver);

int register_virtio_device(struct virtio_device *dev)
{
	int err;

	dev->dev.bus = &virtio_bus;
	sprintf(dev->dev.bus_id, "%u", dev->index);

	/* Acknowledge that we've seen the device. */
	add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);

	/* device_register() causes the bus infrastructure to look for a
	 * matching driver. */
	err = device_register(&dev->dev);
	if (err)
		add_status(dev, VIRTIO_CONFIG_S_FAILED);
	return err;
}
EXPORT_SYMBOL_GPL(register_virtio_device);

void unregister_virtio_device(struct virtio_device *dev)
{
	device_unregister(&dev->dev);
}
EXPORT_SYMBOL_GPL(unregister_virtio_device);

int __virtio_config_val(struct virtio_device *vdev,
			u8 type, void *val, size_t size)
{
	void *token;
	unsigned int len;

	token = vdev->config->find(vdev, type, &len);
	if (!token)
		return -ENOENT;

	if (len != size)
		return -EIO;

	vdev->config->get(vdev, token, val, size);
	return 0;
}
EXPORT_SYMBOL_GPL(__virtio_config_val);

int virtio_use_bit(struct virtio_device *vdev,
		   void *token, unsigned int len, unsigned int bitnum)
{
	unsigned long bits[16];

	/* This makes it convenient to pass-through find() results. */
	if (!token)
		return 0;

	/* bit not in range of this bitfield? */
	if (bitnum * 8 >= len / 2)
		return 0;

	/* Giant feature bitfields are silly. */
	BUG_ON(len > sizeof(bits));
	vdev->config->get(vdev, token, bits, len);

	if (!test_bit(bitnum, bits))
		return 0;

	/* Set acknowledge bit, and write it back. */
	set_bit(bitnum + len * 8 / 2, bits);
	vdev->config->set(vdev, token, bits, len);
	return 1;
}
EXPORT_SYMBOL_GPL(virtio_use_bit);

static int virtio_init(void)
{
	if (bus_register(&virtio_bus) != 0)
		panic("virtio bus registration failed");
	return 0;
}
core_initcall(virtio_init);
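A small userspace sketch (not part of the commit) of the bit layout virtio_use_bit() assumes above: a feature field of len bytes whose first half holds feature bits and whose second half holds the matching acknowledgement bits, mirrored len * 8 / 2 positions higher. The two-byte field and feature bit 3 are illustrative assumptions.

	#include <stdio.h>

	int main(void)
	{
		unsigned char field[2] = { 0x08, 0x00 };	/* feature bit 3 offered */
		unsigned int len = sizeof(field), bitnum = 3;

		if (field[bitnum / 8] & (1 << (bitnum % 8))) {
			unsigned int ack = bitnum + len * 8 / 2;	/* bit 11 */

			field[ack / 8] |= 1 << (ack % 8);		/* claim it */
			printf("acked feature %u at bit %u: %02x %02x\n",
			       bitnum, ack, field[0], field[1]);
		}
		return 0;
	}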
313	drivers/virtio/virtio_ring.c	Normal file
@ -0,0 +1,313 @@
/* Virtio ring implementation.
 *
 *  Copyright 2007 Rusty Russell IBM Corporation
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
#include <linux/device.h>

#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(vq, fmt...) \
	do { dev_err(&vq->vq.vdev->dev, fmt); BUG(); } while(0)
#define START_USE(vq) \
	do { if ((vq)->in_use) panic("in_use = %i\n", (vq)->in_use); (vq)->in_use = __LINE__; mb(); } while(0)
#define END_USE(vq) \
	do { BUG_ON(!(vq)->in_use); (vq)->in_use = 0; mb(); } while(0)
#else
#define BAD_RING(vq, fmt...) \
	do { dev_err(&vq->vq.vdev->dev, fmt); (vq)->broken = true; } while(0)
#define START_USE(vq)
#define END_USE(vq)
#endif

struct vring_virtqueue
{
	struct virtqueue vq;

	/* Actual memory layout for this queue */
	struct vring vring;

	/* Other side has made a mess, don't try any more. */
	bool broken;

	/* Number of free buffers */
	unsigned int num_free;
	/* Head of free buffer list. */
	unsigned int free_head;
	/* Number we've added since last sync. */
	unsigned int num_added;

	/* Last used index we've seen. */
	unsigned int last_used_idx;

	/* How to notify other side. FIXME: commonalize hcalls! */
	void (*notify)(struct virtqueue *vq);

#ifdef DEBUG
	/* They're supposed to lock for us. */
	unsigned int in_use;
#endif

	/* Tokens for callbacks. */
	void *data[];
};

#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)

static int vring_add_buf(struct virtqueue *_vq,
			 struct scatterlist sg[],
			 unsigned int out,
			 unsigned int in,
			 void *data)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	unsigned int i, avail, head, uninitialized_var(prev);

	BUG_ON(data == NULL);
	BUG_ON(out + in > vq->vring.num);
	BUG_ON(out + in == 0);

	START_USE(vq);

	if (vq->num_free < out + in) {
		pr_debug("Can't add buf len %i - avail = %i\n",
			 out + in, vq->num_free);
		END_USE(vq);
		return -ENOSPC;
	}

	/* We're about to use some buffers from the free list. */
	vq->num_free -= out + in;

	head = vq->free_head;
	for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
		vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
		vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
			+ sg->offset;
		vq->vring.desc[i].len = sg->length;
		prev = i;
		sg++;
	}
	for (; in; i = vq->vring.desc[i].next, in--) {
		vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
		vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT)
			+ sg->offset;
		vq->vring.desc[i].len = sg->length;
		prev = i;
		sg++;
	}
	/* Last one doesn't continue. */
	vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;

	/* Update free pointer */
	vq->free_head = i;

	/* Set token. */
	vq->data[head] = data;

	/* Put entry in available array (but don't update avail->idx until they
	 * do sync).  FIXME: avoid modulus here? */
	avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num;
	vq->vring.avail->ring[avail] = head;

	pr_debug("Added buffer head %i to %p\n", head, vq);
	END_USE(vq);
	return 0;
}

static void vring_kick(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	START_USE(vq);
	/* Descriptors and available array need to be set before we expose the
	 * new available array entries. */
	wmb();

	vq->vring.avail->idx += vq->num_added;
	vq->num_added = 0;

	/* Need to update avail index before checking if we should notify */
	mb();

	if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY))
		/* Prod other side to tell it about changes. */
		vq->notify(&vq->vq);

	END_USE(vq);
}

static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
{
	unsigned int i;

	/* Clear data ptr. */
	vq->data[head] = NULL;

	/* Put back on free list: find end */
	i = head;
	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
		i = vq->vring.desc[i].next;
		vq->num_free++;
	}

	vq->vring.desc[i].next = vq->free_head;
	vq->free_head = head;
	/* Plus final descriptor */
	vq->num_free++;
}

/* FIXME: We need to tell other side about removal, to synchronize. */
static void vring_shutdown(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	unsigned int i;

	for (i = 0; i < vq->vring.num; i++)
		detach_buf(vq, i);
}

static inline bool more_used(const struct vring_virtqueue *vq)
{
	return vq->last_used_idx != vq->vring.used->idx;
}

static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
{
	struct vring_virtqueue *vq = to_vvq(_vq);
	void *ret;
	unsigned int i;

	START_USE(vq);

	if (!more_used(vq)) {
		pr_debug("No more buffers in queue\n");
		END_USE(vq);
		return NULL;
	}

	i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
	*len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;

	if (unlikely(i >= vq->vring.num)) {
		BAD_RING(vq, "id %u out of range\n", i);
		return NULL;
	}
	if (unlikely(!vq->data[i])) {
		BAD_RING(vq, "id %u is not a head!\n", i);
		return NULL;
	}

	/* detach_buf clears data, so grab it now. */
	ret = vq->data[i];
	detach_buf(vq, i);
	vq->last_used_idx++;
	END_USE(vq);
	return ret;
}

static bool vring_restart(struct virtqueue *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	START_USE(vq);
	BUG_ON(!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT));

	/* We optimistically turn back on interrupts, then check if there was
	 * more to do. */
	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
	mb();
	if (unlikely(more_used(vq))) {
		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
		END_USE(vq);
		return false;
	}

	END_USE(vq);
	return true;
}

irqreturn_t vring_interrupt(int irq, void *_vq)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	if (!more_used(vq)) {
		pr_debug("virtqueue interrupt with no work for %p\n", vq);
		return IRQ_NONE;
	}

	if (unlikely(vq->broken))
		return IRQ_HANDLED;

	pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
	if (vq->vq.callback && !vq->vq.callback(&vq->vq))
		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;

	return IRQ_HANDLED;
}

static struct virtqueue_ops vring_vq_ops = {
	.add_buf = vring_add_buf,
	.get_buf = vring_get_buf,
	.kick = vring_kick,
	.restart = vring_restart,
	.shutdown = vring_shutdown,
};

struct virtqueue *vring_new_virtqueue(unsigned int num,
				      struct virtio_device *vdev,
				      void *pages,
				      void (*notify)(struct virtqueue *),
				      bool (*callback)(struct virtqueue *))
{
	struct vring_virtqueue *vq;
	unsigned int i;

	vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL);
	if (!vq)
		return NULL;

	vring_init(&vq->vring, num, pages);
	vq->vq.callback = callback;
	vq->vq.vdev = vdev;
	vq->vq.vq_ops = &vring_vq_ops;
	vq->notify = notify;
	vq->broken = false;
	vq->last_used_idx = 0;
	vq->num_added = 0;
#ifdef DEBUG
	vq->in_use = false;
#endif

	/* No callback?  Tell other side not to bother us. */
	if (!callback)
		vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;

	/* Put everything in free lists. */
	vq->num_free = num;
	vq->free_head = 0;
	for (i = 0; i < num-1; i++)
		vq->vring.desc[i].next = i+1;

	return &vq->vq;
}

void vring_del_virtqueue(struct virtqueue *vq)
{
	kfree(to_vvq(vq));
}
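A transport-side sketch (not part of the commit) of how a bus backend might wrap vring_new_virtqueue() above. The queue size, the two-page allocation and my_notify() are illustrative assumptions; the real layout requirements for "pages" live in the vring headers.

	static void my_notify(struct virtqueue *vq)
	{
		/* Tell the other side about vq, e.g. by hypercall or doorbell. */
	}

	static struct virtqueue *my_find_vq(struct virtio_device *vdev,
					    bool (*cb)(struct virtqueue *))
	{
		/* Two zeroed pages: room for descriptors plus avail/used rings. */
		void *pages = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);

		if (!pages)
			return NULL;
		return vring_new_virtqueue(128, vdev, pages, my_notify, cb);
	}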
@ -1,6 +1,7 @@
include include/asm-generic/Kbuild.asm

header-y += boot.h
header-y += bootparam.h
header-y += debugreg.h
header-y += ldt.h
header-y += msr-index.h
@ -14,8 +15,10 @@ unifdef-y += a.out_32.h
unifdef-y += a.out_64.h
unifdef-y += byteorder_32.h
unifdef-y += byteorder_64.h
unifdef-y += e820.h
unifdef-y += elf_32.h
unifdef-y += elf_64.h
unifdef-y += ist.h
unifdef-y += mce.h
unifdef-y += msgbuf_32.h
unifdef-y += msgbuf_64.h
@ -10,85 +10,85 @@
#include <video/edid.h>

struct setup_header {
	u8  setup_sects;
	u16 root_flags;
	u32 syssize;
	u16 ram_size;
	__u8  setup_sects;
	__u16 root_flags;
	__u32 syssize;
	__u16 ram_size;
#define RAMDISK_IMAGE_START_MASK	0x07FF
#define RAMDISK_PROMPT_FLAG		0x8000
#define RAMDISK_LOAD_FLAG		0x4000
	u16 vid_mode;
	u16 root_dev;
	u16 boot_flag;
	u16 jump;
	u32 header;
	u16 version;
	u32 realmode_swtch;
	u16 start_sys;
	u16 kernel_version;
	u8  type_of_loader;
	u8  loadflags;
	__u16 vid_mode;
	__u16 root_dev;
	__u16 boot_flag;
	__u16 jump;
	__u32 header;
	__u16 version;
	__u32 realmode_swtch;
	__u16 start_sys;
	__u16 kernel_version;
	__u8  type_of_loader;
	__u8  loadflags;
#define LOADED_HIGH	(1<<0)
#define KEEP_SEGMENTS	(1<<6)
#define CAN_USE_HEAP	(1<<7)
	u16 setup_move_size;
	u32 code32_start;
	u32 ramdisk_image;
	u32 ramdisk_size;
	u32 bootsect_kludge;
	u16 heap_end_ptr;
	u16 _pad1;
	u32 cmd_line_ptr;
	u32 initrd_addr_max;
	u32 kernel_alignment;
	u8  relocatable_kernel;
	u8  _pad2[3];
	u32 cmdline_size;
	u32 hardware_subarch;
	u64 hardware_subarch_data;
	__u16 setup_move_size;
	__u32 code32_start;
	__u32 ramdisk_image;
	__u32 ramdisk_size;
	__u32 bootsect_kludge;
	__u16 heap_end_ptr;
	__u16 _pad1;
	__u32 cmd_line_ptr;
	__u32 initrd_addr_max;
	__u32 kernel_alignment;
	__u8  relocatable_kernel;
	__u8  _pad2[3];
	__u32 cmdline_size;
	__u32 hardware_subarch;
	__u64 hardware_subarch_data;
} __attribute__((packed));

struct sys_desc_table {
	u16 length;
	u8  table[14];
	__u16 length;
	__u8  table[14];
};

struct efi_info {
	u32 _pad1;
	u32 efi_systab;
	u32 efi_memdesc_size;
	u32 efi_memdesc_version;
	u32 efi_memmap;
	u32 efi_memmap_size;
	u32 _pad2[2];
	__u32 _pad1;
	__u32 efi_systab;
	__u32 efi_memdesc_size;
	__u32 efi_memdesc_version;
	__u32 efi_memmap;
	__u32 efi_memmap_size;
	__u32 _pad2[2];
};

/* The so-called "zeropage" */
struct boot_params {
	struct screen_info screen_info;			/* 0x000 */
	struct apm_bios_info apm_bios_info;		/* 0x040 */
	u8  _pad2[12];					/* 0x054 */
	__u8  _pad2[12];				/* 0x054 */
	struct ist_info ist_info;			/* 0x060 */
	u8  _pad3[16];					/* 0x070 */
	u8  hd0_info[16];	/* obsolete! */		/* 0x080 */
	u8  hd1_info[16];	/* obsolete! */		/* 0x090 */
	__u8  _pad3[16];				/* 0x070 */
	__u8  hd0_info[16];	/* obsolete! */		/* 0x080 */
	__u8  hd1_info[16];	/* obsolete! */		/* 0x090 */
	struct sys_desc_table sys_desc_table;		/* 0x0a0 */
	u8  _pad4[144];					/* 0x0b0 */
	__u8  _pad4[144];				/* 0x0b0 */
	struct edid_info edid_info;			/* 0x140 */
	struct efi_info efi_info;			/* 0x1c0 */
	u32 alt_mem_k;					/* 0x1e0 */
	u32 scratch;		/* Scratch field! */	/* 0x1e4 */
	u8  e820_entries;				/* 0x1e8 */
	u8  eddbuf_entries;				/* 0x1e9 */
	u8  edd_mbr_sig_buf_entries;			/* 0x1ea */
	u8  _pad6[6];					/* 0x1eb */
	__u32 alt_mem_k;				/* 0x1e0 */
	__u32 scratch;		/* Scratch field! */	/* 0x1e4 */
	__u8  e820_entries;				/* 0x1e8 */
	__u8  eddbuf_entries;				/* 0x1e9 */
	__u8  edd_mbr_sig_buf_entries;			/* 0x1ea */
	__u8  _pad6[6];					/* 0x1eb */
	struct setup_header hdr;    /* setup header */	/* 0x1f1 */
	u8  _pad7[0x290-0x1f1-sizeof(struct setup_header)];
	u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX];	/* 0x290 */
	__u8  _pad7[0x290-0x1f1-sizeof(struct setup_header)];
	__u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX];	/* 0x290 */
	struct e820entry e820_map[E820MAX];		/* 0x2d0 */
	u8  _pad8[48];					/* 0xcd0 */
	__u8  _pad8[48];				/* 0xcd0 */
	struct edd_info eddbuf[EDDMAXNR];		/* 0xd00 */
	u8  _pad9[276];					/* 0xeec */
	__u8  _pad9[276];				/* 0xeec */
} __attribute__((packed));

#endif /* _ASM_BOOTPARAM_H */
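A hedged launcher-side sketch (not part of the commit): checking that a loaded image really carries the boot protocol header laid out above. The "HdrS" magic (0x53726448 when read little-endian from hdr.header) is the standard x86 boot protocol value.

	static int has_boot_header(const struct boot_params *bp)
	{
		return bp->hdr.header == 0x53726448;	/* "HdrS" */
	}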
@ -1,5 +1,33 @@
#ifndef __ASM_E820_H
#define __ASM_E820_H
#define E820MAP	0x2d0		/* our map */
#define E820MAX	128		/* number of entries in E820MAP */
#define E820NR	0x1e8		/* # entries in E820MAP */

#define E820_RAM	1
#define E820_RESERVED	2
#define E820_ACPI	3
#define E820_NVS	4

#ifndef __ASSEMBLY__
struct e820entry {
	__u64 addr;	/* start of memory segment */
	__u64 size;	/* size of memory segment */
	__u32 type;	/* type of memory segment */
} __attribute__((packed));

struct e820map {
	__u32 nr_map;
	struct e820entry map[E820MAX];
};
#endif /* __ASSEMBLY__ */

#ifdef __KERNEL__
#ifdef CONFIG_X86_32
# include "e820_32.h"
#else
# include "e820_64.h"
#endif
#endif /* __KERNEL__ */

#endif /* __ASM_E820_H */
@ -12,30 +12,10 @@
#ifndef __E820_HEADER
#define __E820_HEADER

#define E820MAP	0x2d0		/* our map */
#define E820MAX	128		/* number of entries in E820MAP */
#define E820NR	0x1e8		/* # entries in E820MAP */

#define E820_RAM	1
#define E820_RESERVED	2
#define E820_ACPI	3
#define E820_NVS	4

#define HIGH_MEMORY	(1024*1024)

#ifndef __ASSEMBLY__

struct e820entry {
	u64 addr;	/* start of memory segment */
	u64 size;	/* size of memory segment */
	u32 type;	/* type of memory segment */
} __attribute__((packed));

struct e820map {
	u32 nr_map;
	struct e820entry map[E820MAX];
};

extern struct e820map e820;

extern int e820_all_mapped(unsigned long start, unsigned long end,
@ -56,5 +36,4 @@ static inline void e820_mark_nosave_regions(void)
#endif

#endif/*!__ASSEMBLY__*/

#endif/*__E820_HEADER*/
@ -11,27 +11,7 @@
#ifndef __E820_HEADER
#define __E820_HEADER

#define E820MAP	0x2d0		/* our map */
#define E820MAX	128		/* number of entries in E820MAP */
#define E820NR	0x1e8		/* # entries in E820MAP */

#define E820_RAM	1
#define E820_RESERVED	2
#define E820_ACPI	3
#define E820_NVS	4

#ifndef __ASSEMBLY__
struct e820entry {
	u64 addr;	/* start of memory segment */
	u64 size;	/* size of memory segment */
	u32 type;	/* type of memory segment */
} __attribute__((packed));

struct e820map {
	u32 nr_map;
	struct e820entry map[E820MAX];
};

extern unsigned long find_e820_area(unsigned long start, unsigned long end,
				    unsigned size);
extern void add_memory_region(unsigned long start, unsigned long size,
@ -17,17 +17,17 @@
 */


#ifdef __KERNEL__

#include <linux/types.h>

struct ist_info {
	u32 signature;
	u32 command;
	u32 event;
	u32 perf_level;
	__u32 signature;
	__u32 command;
	__u32 event;
	__u32 perf_level;
};

#ifdef __KERNEL__

extern struct ist_info ist_info;

#endif	/* __KERNEL__ */
86	include/asm-x86/lguest.h	Normal file
@ -0,0 +1,86 @@
#ifndef _X86_LGUEST_H
#define _X86_LGUEST_H

#define GDT_ENTRY_LGUEST_CS	10
#define GDT_ENTRY_LGUEST_DS	11
#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)

#ifndef __ASSEMBLY__
#include <asm/desc.h>

#define GUEST_PL 1

/* Every guest maps the core switcher code. */
#define SHARED_SWITCHER_PAGES \
	DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
/* Pages for switcher itself, then two pages per cpu */
#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)

/* We map at -4M for ease of mapping into the guest (one PTE page). */
#define SWITCHER_ADDR 0xFFC00000

/* Found in switcher.S */
extern unsigned long default_idt_entries[];

struct lguest_regs
{
	/* Manually saved part. */
	unsigned long eax, ebx, ecx, edx;
	unsigned long esi, edi, ebp;
	unsigned long gs;
	unsigned long fs, ds, es;
	unsigned long trapnum, errcode;
	/* Trap pushed part */
	unsigned long eip;
	unsigned long cs;
	unsigned long eflags;
	unsigned long esp;
	unsigned long ss;
};

/* This is a guest-specific page (mapped ro) into the guest. */
struct lguest_ro_state
{
	/* Host information we need to restore when we switch back. */
	u32 host_cr3;
	struct Xgt_desc_struct host_idt_desc;
	struct Xgt_desc_struct host_gdt_desc;
	u32 host_sp;

	/* Fields which are used when guest is running. */
	struct Xgt_desc_struct guest_idt_desc;
	struct Xgt_desc_struct guest_gdt_desc;
	struct i386_hw_tss guest_tss;
	struct desc_struct guest_idt[IDT_ENTRIES];
	struct desc_struct guest_gdt[GDT_ENTRIES];
};

struct lguest_arch
{
	/* The GDT entries copied into lguest_ro_state when running. */
	struct desc_struct gdt[GDT_ENTRIES];

	/* The IDT entries: some copied into lguest_ro_state when running. */
	struct desc_struct idt[IDT_ENTRIES];

	/* The address of the last guest-visible pagefault (ie. cr2). */
	unsigned long last_pagefault;
};

static inline void lguest_set_ts(void)
{
	u32 cr0;

	cr0 = read_cr0();
	if (!(cr0 & 8))
		write_cr0(cr0|8);
}

/* Full 4G segment descriptors, suitable for CS and DS. */
#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})

#endif /* __ASSEMBLY__ */

#endif
71	include/asm-x86/lguest_hcall.h	Normal file
@ -0,0 +1,71 @@
/* Architecture specific portion of the lguest hypercalls */
#ifndef _X86_LGUEST_HCALL_H
#define _X86_LGUEST_HCALL_H

#define LHCALL_FLUSH_ASYNC	0
#define LHCALL_LGUEST_INIT	1
#define LHCALL_CRASH		2
#define LHCALL_LOAD_GDT		3
#define LHCALL_NEW_PGTABLE	4
#define LHCALL_FLUSH_TLB	5
#define LHCALL_LOAD_IDT_ENTRY	6
#define LHCALL_SET_STACK	7
#define LHCALL_TS		8
#define LHCALL_SET_CLOCKEVENT	9
#define LHCALL_HALT		10
#define LHCALL_SET_PTE		14
#define LHCALL_SET_PMD		15
#define LHCALL_LOAD_TLS		16
#define LHCALL_NOTIFY		17

/*G:031 First, how does our Guest contact the Host to ask for privileged
 * operations?  There are two ways: the direct way is to make a "hypercall",
 * to make requests of the Host Itself.
 *
 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
 * above are used by real hardware interrupts).  Seventeen hypercalls are
 * available: the hypercall number is put in the %eax register, and the
 * arguments (when required) are placed in %edx, %ebx and %ecx.  If a return
 * value makes sense, it's returned in %eax.
 *
 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
 * Host, rather than returning failure.  This reflects Winston Churchill's
 * definition of a gentleman: "someone who is only rude intentionally". */
#define LGUEST_TRAP_ENTRY 0x1F

#ifndef __ASSEMBLY__
#include <asm/hw_irq.h>

static inline unsigned long
hcall(unsigned long call,
      unsigned long arg1, unsigned long arg2, unsigned long arg3)
{
	/* "int" is the Intel instruction to trigger a trap. */
	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
		     /* The call is in %eax (aka "a"), and can be replaced */
		     : "=a"(call)
		     /* The other arguments are in %eax, %edx, %ebx & %ecx */
		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
		     /* "memory" means this might write somewhere in memory.
		      * This isn't true for all calls, but it's safe to tell
		      * gcc that it might happen so it doesn't get clever. */
		     : "memory");
	return call;
}
/*:*/

void async_hcall(unsigned long call,
		 unsigned long arg1, unsigned long arg2, unsigned long arg3);

/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)

#define LHCALL_RING_SIZE 64
struct hcall_args
{
	/* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
	unsigned long arg0, arg2, arg3, arg1;
};

#endif /* !__ASSEMBLY__ */
#endif /* _I386_LGUEST_HCALL_H */
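A usage sketch (not part of the commit) of issuing a hypercall with the inline above. That LHCALL_TS takes the new task-switched flag value as its first argument is an assumption drawn from the calling convention described in the G:031 comment; unused arguments are simply zero.

	static void example_set_ts(void)
	{
		hcall(LHCALL_TS, 1, 0, 0);	/* ask the Host to set CR0.TS */
	}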
@ -186,6 +186,7 @@ unifdef-y += cyclades.h
unifdef-y += dccp.h
unifdef-y += dirent.h
unifdef-y += dlm.h
unifdef-y += edd.h
unifdef-y += elfcore.h
unifdef-y += errno.h
unifdef-y += errqueue.h
@ -306,6 +307,7 @@ unifdef-y += rtc.h
unifdef-y += rtnetlink.h
unifdef-y += scc.h
unifdef-y += sched.h
unifdef-y += screen_info.h
unifdef-y += sdla.h
unifdef-y += selinux_netlink.h
unifdef-y += sem.h
@ -341,6 +343,9 @@ unifdef-y += user.h
unifdef-y += utsname.h
unifdef-y += videodev2.h
unifdef-y += videodev.h
unifdef-y += virtio_config.h
unifdef-y += virtio_blk.h
unifdef-y += virtio_net.h
unifdef-y += wait.h
unifdef-y += wanrouter.h
unifdef-y += watchdog.h
@ -16,29 +16,29 @@
 * General Public License for more details.
 */

typedef unsigned short	apm_event_t;
typedef unsigned short	apm_eventinfo_t;
#include <linux/types.h>

struct apm_bios_info {
	__u16	version;
	__u16	cseg;
	__u32	offset;
	__u16	cseg_16;
	__u16	dseg;
	__u16	flags;
	__u16	cseg_len;
	__u16	cseg_16_len;
	__u16	dseg_len;
};

#ifdef __KERNEL__

#include <linux/types.h>
typedef unsigned short	apm_event_t;
typedef unsigned short	apm_eventinfo_t;

#define APM_CS		(GDT_ENTRY_APMBIOS_BASE * 8)
#define APM_CS_16	(APM_CS + 8)
#define APM_DS		(APM_CS_16 + 8)

struct apm_bios_info {
	u16	version;
	u16	cseg;
	u32	offset;
	u16	cseg_16;
	u16	dseg;
	u16	flags;
	u16	cseg_len;
	u16	cseg_16_len;
	u16	dseg_len;
};

/* Results of APM Installation Check */
#define APM_16_BIT_SUPPORT	0x0001
#define APM_32_BIT_SUPPORT	0x0002
@ -67,113 +67,113 @@
#define EDD_INFO_USE_INT13_FN50	(1 << 7)

struct edd_device_params {
	u16 length;
	u16 info_flags;
	u32 num_default_cylinders;
	u32 num_default_heads;
	u32 sectors_per_track;
	u64 number_of_sectors;
	u16 bytes_per_sector;
	u32 dpte_ptr;			/* 0xFFFFFFFF for our purposes */
	u16 key;			/* = 0xBEDD */
	u8 device_path_info_length;	/* = 44 */
	u8 reserved2;
	u16 reserved3;
	u8 host_bus_type[4];
	u8 interface_type[8];
	__u16 length;
	__u16 info_flags;
	__u32 num_default_cylinders;
	__u32 num_default_heads;
	__u32 sectors_per_track;
	__u64 number_of_sectors;
	__u16 bytes_per_sector;
	__u32 dpte_ptr;			/* 0xFFFFFFFF for our purposes */
	__u16 key;			/* = 0xBEDD */
	__u8 device_path_info_length;	/* = 44 */
	__u8 reserved2;
	__u16 reserved3;
	__u8 host_bus_type[4];
	__u8 interface_type[8];
	union {
		struct {
			u16 base_address;
			u16 reserved1;
			u32 reserved2;
			__u16 base_address;
			__u16 reserved1;
			__u32 reserved2;
		} __attribute__ ((packed)) isa;
		struct {
			u8 bus;
			u8 slot;
			u8 function;
			u8 channel;
			u32 reserved;
			__u8 bus;
			__u8 slot;
			__u8 function;
			__u8 channel;
			__u32 reserved;
		} __attribute__ ((packed)) pci;
		/* pcix is same as pci */
		struct {
			u64 reserved;
			__u64 reserved;
		} __attribute__ ((packed)) ibnd;
		struct {
			u64 reserved;
			__u64 reserved;
		} __attribute__ ((packed)) xprs;
		struct {
			u64 reserved;
			__u64 reserved;
		} __attribute__ ((packed)) htpt;
		struct {
			u64 reserved;
			__u64 reserved;
		} __attribute__ ((packed)) unknown;
	} interface_path;
	union {
		struct {
			u8 device;
			u8 reserved1;
			u16 reserved2;
			u32 reserved3;
			u64 reserved4;
			__u8 device;
			__u8 reserved1;
			__u16 reserved2;
			__u32 reserved3;
			__u64 reserved4;
		} __attribute__ ((packed)) ata;
		struct {
			u8 device;
			u8 lun;
			u8 reserved1;
			u8 reserved2;
			u32 reserved3;
			u64 reserved4;
			__u8 device;
			__u8 lun;
			__u8 reserved1;
			__u8 reserved2;
			__u32 reserved3;
			__u64 reserved4;
		} __attribute__ ((packed)) atapi;
		struct {
			u16 id;
			u64 lun;
			u16 reserved1;
			u32 reserved2;
			__u16 id;
			__u64 lun;
			__u16 reserved1;
			__u32 reserved2;
		} __attribute__ ((packed)) scsi;
		struct {
			u64 serial_number;
			u64 reserved;
			__u64 serial_number;
			__u64 reserved;
		} __attribute__ ((packed)) usb;
		struct {
			u64 eui;
			u64 reserved;
			__u64 eui;
			__u64 reserved;
		} __attribute__ ((packed)) i1394;
		struct {
			u64 wwid;
			u64 lun;
			__u64 wwid;
			__u64 lun;
		} __attribute__ ((packed)) fibre;
		struct {
			u64 identity_tag;
			u64 reserved;
			__u64 identity_tag;
			__u64 reserved;
		} __attribute__ ((packed)) i2o;
		struct {
			u32 array_number;
			u32 reserved1;
			u64 reserved2;
			__u32 array_number;
			__u32 reserved1;
			__u64 reserved2;
		} __attribute__ ((packed)) raid;
		struct {
			u8 device;
			u8 reserved1;
			u16 reserved2;
			u32 reserved3;
			u64 reserved4;
			__u8 device;
			__u8 reserved1;
			__u16 reserved2;
			__u32 reserved3;
			__u64 reserved4;
		} __attribute__ ((packed)) sata;
		struct {
			u64 reserved1;
			u64 reserved2;
			__u64 reserved1;
			__u64 reserved2;
		} __attribute__ ((packed)) unknown;
	} device_path;
	u8 reserved4;
	u8 checksum;
	__u8 reserved4;
	__u8 checksum;
} __attribute__ ((packed));

struct edd_info {
	u8 device;
	u8 version;
	u16 interface_support;
	u16 legacy_max_cylinder;
	u8 legacy_max_head;
	u8 legacy_sectors_per_track;
	__u8 device;
	__u8 version;
	__u16 interface_support;
	__u16 legacy_max_cylinder;
	__u8 legacy_max_head;
	__u8 legacy_sectors_per_track;
	struct edd_device_params params;
} __attribute__ ((packed));

@ -184,8 +184,9 @@ struct edd {
	unsigned char edd_info_nr;
};

#ifdef __KERNEL__
extern struct edd edd;

#endif /* __KERNEL__ */
#endif			/*!__ASSEMBLY__ */

#endif /* _LINUX_EDD_H */
@ -1,76 +1,16 @@
/* Things the lguest guest needs to know.  Note: like all lguest interfaces,
 * this is subject to wild and random change between versions. */
#ifndef _ASM_LGUEST_H
#define _ASM_LGUEST_H
#ifndef _LINUX_LGUEST_H
#define _LINUX_LGUEST_H

#ifndef __ASSEMBLY__
#include <linux/time.h>
#include <asm/irq.h>

#define LHCALL_FLUSH_ASYNC	0
#define LHCALL_LGUEST_INIT	1
#define LHCALL_CRASH		2
#define LHCALL_LOAD_GDT		3
#define LHCALL_NEW_PGTABLE	4
#define LHCALL_FLUSH_TLB	5
#define LHCALL_LOAD_IDT_ENTRY	6
#define LHCALL_SET_STACK	7
#define LHCALL_TS		8
#define LHCALL_SET_CLOCKEVENT	9
#define LHCALL_HALT		10
#define LHCALL_BIND_DMA		12
#define LHCALL_SEND_DMA		13
#define LHCALL_SET_PTE		14
#define LHCALL_SET_PMD		15
#define LHCALL_LOAD_TLS		16
#include <asm/lguest_hcall.h>

#define LG_CLOCK_MIN_DELTA	100UL
#define LG_CLOCK_MAX_DELTA	ULONG_MAX

/*G:031 First, how does our Guest contact the Host to ask for privileged
 * operations?  There are two ways: the direct way is to make a "hypercall",
 * to make requests of the Host Itself.
 *
 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
 * above are used by real hardware interrupts).  Seventeen hypercalls are
 * available: the hypercall number is put in the %eax register, and the
 * arguments (when required) are placed in %edx, %ebx and %ecx.  If a return
 * value makes sense, it's returned in %eax.
 *
 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
 * Host, rather than returning failure.  This reflects Winston Churchill's
 * definition of a gentleman: "someone who is only rude intentionally". */
#define LGUEST_TRAP_ENTRY 0x1F

static inline unsigned long
hcall(unsigned long call,
      unsigned long arg1, unsigned long arg2, unsigned long arg3)
{
	/* "int" is the Intel instruction to trigger a trap. */
	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
		     /* The call is in %eax (aka "a"), and can be replaced */
		     : "=a"(call)
		     /* The other arguments are in %eax, %edx, %ebx & %ecx */
		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
		     /* "memory" means this might write somewhere in memory.
		      * This isn't true for all calls, but it's safe to tell
		      * gcc that it might happen so it doesn't get clever. */
		     : "memory");
	return call;
}
/*:*/

void async_hcall(unsigned long call,
		 unsigned long arg1, unsigned long arg2, unsigned long arg3);

/* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)

#define LHCALL_RING_SIZE 64
struct hcall_ring
{
	u32 eax, edx, ebx, ecx;
};

/*G:032 The second method of communicating with the Host is to via "struct
 * lguest_data".  The Guest's very first hypercall is to tell the Host where
 * this is, and then the Guest and Host both publish information in it. :*/
@ -97,20 +37,24 @@ struct lguest_data
	/* 0xFF == done (set by Host), 0 == pending (set by Guest). */
	u8 hcall_status[LHCALL_RING_SIZE];
	/* The actual registers for the hypercalls. */
	struct hcall_ring hcalls[LHCALL_RING_SIZE];
	struct hcall_args hcalls[LHCALL_RING_SIZE];

	/* Fields initialized by the Host at boot: */
	/* Memory not to try to access */
	unsigned long reserve_mem;
	/* ID of this Guest (used by network driver to set ethernet address) */
	u16 guestid;
	/* KHz for the TSC clock. */
	u32 tsc_khz;
	/* Page where the top-level pagetable is */
	unsigned long pgdir;

	/* Fields initialized by the Guest at boot: */
	/* Instruction range to suppress interrupts even if enabled */
	unsigned long noirq_start, noirq_end;
	/* Address above which page tables are all identical. */
	unsigned long kernel_address;
	/* The vector to try to use for system calls (0x40 or 0x80). */
	unsigned int syscall_vec;
};
extern struct lguest_data lguest_data;
#endif /* __ASSEMBLY__ */
#endif /* _ASM_LGUEST_H */
#endif /* _LINUX_LGUEST_H */
@ -1,51 +0,0 @@
#ifndef _ASM_LGUEST_DEVICE_H
#define _ASM_LGUEST_DEVICE_H
/* Everything you need to know about lguest devices. */
#include <linux/device.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>

struct lguest_device {
	/* Unique busid, and index into lguest_page->devices[] */
	unsigned int index;

	struct device dev;

	/* Driver can hang data off here. */
	void *private;
};

/*D:380 Since interrupt numbers are arbitrary, we use a convention: each device
 * can use the interrupt number corresponding to its index.  The +1 is because
 * interrupt 0 is not usable (it's actually the timer interrupt). */
static inline int lgdev_irq(const struct lguest_device *dev)
{
	return dev->index + 1;
}
/*:*/

/* dma args must not be vmalloced! */
void lguest_send_dma(unsigned long key, struct lguest_dma *dma);
int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
		    unsigned int num, u8 irq);
void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas);

/* Map the virtual device space */
void *lguest_map(unsigned long phys_addr, unsigned long pages);
void lguest_unmap(void *);

struct lguest_driver {
	const char *name;
	struct module *owner;
	u16 device_type;
	int (*probe)(struct lguest_device *dev);
	void (*remove)(struct lguest_device *dev);

	struct device_driver drv;
};

extern int register_lguest_driver(struct lguest_driver *drv);
extern void unregister_lguest_driver(struct lguest_driver *drv);

extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */
#endif /* _ASM_LGUEST_DEVICE_H */
@ -1,6 +1,7 @@
#ifndef _ASM_LGUEST_USER
#define _ASM_LGUEST_USER
/* Everything the "lguest" userspace program needs to know. */
#include <linux/types.h>
/* They can register up to 32 arrays of lguest_dma. */
#define LGUEST_MAX_DMA		32
/* At most we can dma 16 lguest_dma in one op. */
@ -9,66 +10,6 @@
/* How many devices?  Assume each one wants up to two dma arrays per device. */
#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)

/*D:200
 * Lguest I/O
 *
 * The lguest I/O mechanism is the only way Guests can talk to devices.  There
 * are two hypercalls involved: SEND_DMA for output and BIND_DMA for input.  In
 * each case, "struct lguest_dma" describes the buffer: this contains 16
 * addr/len pairs, and if there are fewer buffer elements the len array is
 * terminated with a 0.
 *
 * I/O is organized by keys: BIND_DMA attaches buffers to a particular key, and
 * SEND_DMA transfers to buffers bound to particular key.  By convention, keys
 * correspond to a physical address within the device's page.  This means that
 * devices will never accidentally end up with the same keys, and allows the
 * Host use The Futex Trick (as we'll see later in our journey).
 *
 * SEND_DMA simply indicates a key to send to, and the physical address of the
 * "struct lguest_dma" to send.  The Host will write the number of bytes
 * transferred into the "struct lguest_dma"'s used_len member.
 *
 * BIND_DMA indicates a key to bind to, a pointer to an array of "struct
 * lguest_dma"s ready for receiving, the size of that array, and an interrupt
 * to trigger when data is received.  The Host will only allow transfers into
 * buffers with a used_len of zero: it then sets used_len to the number of
 * bytes transferred and triggers the interrupt for the Guest to process the
 * new input. */
struct lguest_dma
{
	/* 0 if free to be used, filled by the Host. */
	u32 used_len;
	unsigned long addr[LGUEST_MAX_DMA_SECTIONS];
	u16 len[LGUEST_MAX_DMA_SECTIONS];
};
/*:*/

/*D:460 This is the layout of a block device memory page.  The Launcher sets up
 * the num_sectors initially to tell the Guest the size of the disk.  The Guest
 * puts the type, sector and length of the request in the first three fields,
 * then DMAs to the Host.  The Host processes the request, sets up the result,
 * then DMAs back to the Guest. */
struct lguest_block_page
{
	/* 0 is a read, 1 is a write. */
	int type;
	u32 sector; 	/* Offset in device = sector * 512. */
	u32 bytes;	/* Length expected to be read/written in bytes */
	/* 0 = pending, 1 = done, 2 = done, error */
	int result;
	u32 num_sectors; /* Disk length = num_sectors * 512 */
};

/*D:520 The network device is basically a memory page where all the Guests on
 * the network publish their MAC (ethernet) addresses: it's an array of "struct
 * lguest_net": */
struct lguest_net
{
	/* Simply the mac address (with multicast bit meaning promisc). */
	unsigned char mac[6];
};
/*:*/

/* Where the Host expects the Guest to SEND_DMA console output to. */
#define LGUEST_CONSOLE_DMA_KEY 0

@ -81,38 +22,29 @@ struct lguest_net
 * complex burden for the Host and suboptimal for the Guest, so we have our own
 * "lguest" bus and simple drivers.
 *
 * Devices are described by an array of LGUEST_MAX_DEVICES of these structs,
 * placed by the Launcher just above the top of physical memory:
 * Devices are described by a simplified ID, a status byte, and some "config"
 * bytes which describe this device's configuration.  This is placed by the
 * Launcher just above the top of physical memory:
 */
struct lguest_device_desc {
	/* The device type: console, network, disk etc. */
	u16 type;
#define LGUEST_DEVICE_T_CONSOLE	1
#define LGUEST_DEVICE_T_NET	2
#define LGUEST_DEVICE_T_BLOCK	3
	/* The device type: console, network, disk etc.  Type 0 terminates. */
	__u8 type;
	/* The number of bytes of the config array. */
	__u8 config_len;
	/* A status byte, written by the Guest. */
	__u8 status;
	__u8 config[0];
};

	/* The specific features of this device: these depends on device type
	 * except for LGUEST_DEVICE_F_RANDOMNESS. */
	u16 features;
#define LGUEST_NET_F_NOCSUM		0x4000 /* Don't bother checksumming */
#define LGUEST_DEVICE_F_RANDOMNESS	0x8000 /* IRQ is fairly random */

	/* This is how the Guest reports status of the device: the Host can set
	 * LGUEST_DEVICE_S_REMOVED to indicate removal, but the rest are only
	 * ever manipulated by the Guest, and only ever set. */
	u16 status;
/* 256 and above are device specific. */
#define LGUEST_DEVICE_S_ACKNOWLEDGE	1 /* We have seen device. */
#define LGUEST_DEVICE_S_DRIVER		2 /* We have found a driver */
#define LGUEST_DEVICE_S_DRIVER_OK	4 /* Driver says OK! */
#define LGUEST_DEVICE_S_REMOVED		8 /* Device has gone away. */
#define LGUEST_DEVICE_S_REMOVED_ACK	16 /* Driver has been told. */
#define LGUEST_DEVICE_S_FAILED		128 /* Something actually failed */

	/* Each device exists somewhere in Guest physical memory, over some
	 * number of pages. */
	u16 num_pages;
	u32 pfn;
/*D:135 This is how we expect the device configuration field for a virtqueue
 * (type VIRTIO_CONFIG_F_VIRTQUEUE) to be laid out: */
struct lguest_vqconfig {
	/* The number of entries in the virtio_ring */
	__u16 num;
	/* The interrupt we get when something happens. */
	__u16 irq;
	/* The page number of the virtio ring for this device. */
	__u32 pfn;
};
/*:*/

@ -120,7 +52,7 @@ struct lguest_device_desc {
enum lguest_req
{
	LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
	LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */
	LHREQ_GETDMA, /* No longer used */
	LHREQ_IRQ, /* + irq */
	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
};
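A hedged sketch (not part of the commit) of walking the device table the new lguest_device_desc describes above: each entry is a type byte, a config length, a status byte, then config_len bytes of config. Treating type 0 as the terminator is drawn from the comment on the "type" field.

	static void walk_device_page(struct lguest_device_desc *d)
	{
		while (d->type != 0) {
			/* d->config[0 .. d->config_len-1] is this device's config */
			d = (struct lguest_device_desc *)(d->config + d->config_len);
		}
	}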
@ -361,4 +361,10 @@ struct ssb_device_id {
#define SSB_ANY_ID			0xFFFF
#define SSB_ANY_REV			0xFF

struct virtio_device_id {
	__u32 device;
	__u32 vendor;
};
#define VIRTIO_DEV_ANY_ID	0xffffffff

#endif /* LINUX_MOD_DEVICETABLE_H */
@ -8,45 +8,43 @@
 */

struct screen_info {
	u8  orig_x;		/* 0x00 */
	u8  orig_y;		/* 0x01 */
	u16 ext_mem_k;		/* 0x02 */
	u16 orig_video_page;	/* 0x04 */
	u8  orig_video_mode;	/* 0x06 */
	u8  orig_video_cols;	/* 0x07 */
	u16 unused2;		/* 0x08 */
	u16 orig_video_ega_bx;	/* 0x0a */
	u16 unused3;		/* 0x0c */
	u8  orig_video_lines;	/* 0x0e */
	u8  orig_video_isVGA;	/* 0x0f */
	u16 orig_video_points;	/* 0x10 */
	__u8  orig_x;		/* 0x00 */
	__u8  orig_y;		/* 0x01 */
	__u16 ext_mem_k;	/* 0x02 */
	__u16 orig_video_page;	/* 0x04 */
	__u8  orig_video_mode;	/* 0x06 */
	__u8  orig_video_cols;	/* 0x07 */
	__u16 unused2;		/* 0x08 */
	__u16 orig_video_ega_bx;/* 0x0a */
	__u16 unused3;		/* 0x0c */
	__u8  orig_video_lines;	/* 0x0e */
	__u8  orig_video_isVGA;	/* 0x0f */
	__u16 orig_video_points;/* 0x10 */

	/* VESA graphic mode -- linear frame buffer */
	u16 lfb_width;		/* 0x12 */
	u16 lfb_height;		/* 0x14 */
	u16 lfb_depth;		/* 0x16 */
	u32 lfb_base;		/* 0x18 */
	u32 lfb_size;		/* 0x1c */
	u16 cl_magic, cl_offset; /* 0x20 */
	u16 lfb_linelength;	/* 0x24 */
	u8  red_size;		/* 0x26 */
	u8  red_pos;		/* 0x27 */
	u8  green_size;		/* 0x28 */
	u8  green_pos;		/* 0x29 */
	u8  blue_size;		/* 0x2a */
	u8  blue_pos;		/* 0x2b */
	u8  rsvd_size;		/* 0x2c */
	u8  rsvd_pos;		/* 0x2d */
	u16 vesapm_seg;		/* 0x2e */
	u16 vesapm_off;		/* 0x30 */
	u16 pages;		/* 0x32 */
	u16 vesa_attributes;	/* 0x34 */
	u32 capabilities;	/* 0x36 */
	u8  _reserved[6];	/* 0x3a */
	__u16 lfb_width;	/* 0x12 */
	__u16 lfb_height;	/* 0x14 */
	__u16 lfb_depth;	/* 0x16 */
	__u32 lfb_base;		/* 0x18 */
	__u32 lfb_size;		/* 0x1c */
	__u16 cl_magic, cl_offset; /* 0x20 */
	__u16 lfb_linelength;	/* 0x24 */
	__u8  red_size;		/* 0x26 */
	__u8  red_pos;		/* 0x27 */
	__u8  green_size;	/* 0x28 */
	__u8  green_pos;	/* 0x29 */
	__u8  blue_size;	/* 0x2a */
	__u8  blue_pos;		/* 0x2b */
	__u8  rsvd_size;	/* 0x2c */
	__u8  rsvd_pos;		/* 0x2d */
	__u16 vesapm_seg;	/* 0x2e */
	__u16 vesapm_off;	/* 0x30 */
	__u16 pages;		/* 0x32 */
	__u16 vesa_attributes;	/* 0x34 */
	__u32 capabilities;	/* 0x36 */
	__u8  _reserved[6];	/* 0x3a */
} __attribute__((packed));

extern struct screen_info screen_info;

#define VIDEO_TYPE_MDA		0x10	/* Monochrome Text Display	*/
#define VIDEO_TYPE_CGA		0x11	/* CGA Display			*/
#define VIDEO_TYPE_EGAM		0x20	/* EGA/VGA in Monochrome Mode	*/
@ -65,4 +63,17 @@ extern struct screen_info screen_info;

#define VIDEO_TYPE_PMAC		0x60	/* PowerMacintosh frame buffer. */

#ifdef __KERNEL__
extern struct screen_info screen_info;

#define ORIG_X			(screen_info.orig_x)
#define ORIG_Y			(screen_info.orig_y)
#define ORIG_VIDEO_MODE		(screen_info.orig_video_mode)
#define ORIG_VIDEO_COLS		(screen_info.orig_video_cols)
#define ORIG_VIDEO_EGA_BX	(screen_info.orig_video_ega_bx)
#define ORIG_VIDEO_LINES	(screen_info.orig_video_lines)
#define ORIG_VIDEO_ISVGA	(screen_info.orig_video_isVGA)
#define ORIG_VIDEO_POINTS	(screen_info.orig_video_points)
#endif /* __KERNEL__ */

#endif /* _SCREEN_INFO_H */
110	include/linux/virtio.h	Normal file
@ -0,0 +1,110 @@
#ifndef _LINUX_VIRTIO_H
#define _LINUX_VIRTIO_H
/* Everything a virtio driver needs to work with any particular virtio
 * implementation. */
#include <linux/types.h>
#include <linux/scatterlist.h>
#include <linux/spinlock.h>
#include <linux/device.h>
#include <linux/mod_devicetable.h>

/**
 * virtqueue - a queue to register buffers for sending or receiving.
 * @callback: the function to call when buffers are consumed (can be NULL).
 *	If this returns false, callbacks are suppressed until vq_ops->restart
 *	is called.
 * @vdev: the virtio device this queue was created for.
 * @vq_ops: the operations for this virtqueue (see below).
 * @priv: a pointer for the virtqueue implementation to use.
 */
struct virtqueue
{
	bool (*callback)(struct virtqueue *vq);
	struct virtio_device *vdev;
	struct virtqueue_ops *vq_ops;
	void *priv;
};

/**
 * virtqueue_ops - operations for virtqueue abstraction layer
 * @add_buf: expose buffer to other end
 *	vq: the struct virtqueue we're talking about.
 *	sg: the description of the buffer(s).
 *	out_num: the number of sg readable by other side
 *	in_num: the number of sg which are writable (after readable ones)
 *	data: the token identifying the buffer.
 *	Returns 0 or an error.
 * @kick: update after add_buf
 *	vq: the struct virtqueue
 *	After one or more add_buf calls, invoke this to kick the other side.
 * @get_buf: get the next used buffer
 *	vq: the struct virtqueue we're talking about.
 *	len: the length written into the buffer
 *	Returns NULL or the "data" token handed to add_buf.
 * @restart: restart callbacks after callback returned false.
 *	vq: the struct virtqueue we're talking about.
 *	This returns "false" (and doesn't re-enable) if there are pending
 *	buffers in the queue, to avoid a race.
 * @shutdown: "unadd" all buffers.
 *	vq: the struct virtqueue we're talking about.
 *	Remove everything from the queue.
 *
 * Locking rules are straightforward: the driver is responsible for
 * locking.  No two operations may be invoked simultaneously.
 *
 * All operations can be called in any context.
 */
struct virtqueue_ops {
	int (*add_buf)(struct virtqueue *vq,
		       struct scatterlist sg[],
		       unsigned int out_num,
		       unsigned int in_num,
		       void *data);

	void (*kick)(struct virtqueue *vq);

	void *(*get_buf)(struct virtqueue *vq, unsigned int *len);

	bool (*restart)(struct virtqueue *vq);

	void (*shutdown)(struct virtqueue *vq);
};
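Taken together, these ops describe a simple produce/kick/consume cycle. A minimal sketch of that cycle, assuming sg_init_one() and the semantics documented above (the function and buffer names are hypothetical, not part of this commit):

/* Hypothetical: queue one readable buffer and reap completions. */
static int example_xmit(struct virtqueue *vq, void *buf, unsigned int size)
{
	struct scatterlist sg[1];
	unsigned int len;
	void *token;

	sg_init_one(sg, buf, size);	/* 1 readable segment, 0 writable */
	if (vq->vq_ops->add_buf(vq, sg, 1, 0, buf) != 0)
		return -ENOSPC;
	vq->vq_ops->kick(vq);		/* tell the other side to process it */

	/* Later, typically from the callback, reclaim used buffers: */
	while ((token = vq->vq_ops->get_buf(vq, &len)) != NULL)
		; /* token is the data pointer passed to add_buf */
	return 0;
}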
/**
 * virtio_device - representation of a device using virtio
 * @index: unique position on the virtio bus
 * @dev: underlying device.
 * @id: the device type identification (used to match it with a driver).
 * @config: the configuration ops for this device.
 * @priv: private pointer for the driver's use.
 */
struct virtio_device
{
	int index;
	struct device dev;
	struct virtio_device_id id;
	struct virtio_config_ops *config;
	void *priv;
};

int register_virtio_device(struct virtio_device *dev);
void unregister_virtio_device(struct virtio_device *dev);

/**
 * virtio_driver - operations for a virtio I/O driver
 * @driver: underlying device driver (populate name and owner).
 * @id_table: the ids serviced by this driver.
 * @probe: the function to call when a device is found.  Returns a token for
 *	remove, or PTR_ERR().
 * @remove: the function when a device is removed.
 */
struct virtio_driver {
	struct device_driver driver;
	const struct virtio_device_id *id_table;
	int (*probe)(struct virtio_device *dev);
	void (*remove)(struct virtio_device *dev);
};

int register_virtio_driver(struct virtio_driver *drv);
void unregister_virtio_driver(struct virtio_driver *drv);
#endif /* _LINUX_VIRTIO_H */
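Registration follows the usual Linux driver-core pattern. A hedged skeleton of a driver sitting on this bus (all names hypothetical; device type 1 is used purely as an example):

/* Hypothetical skeleton of a driver on the virtio bus. */
static struct virtio_device_id example_id_table[] = {
	{ 1, VIRTIO_DEV_ANY_ID },	/* example: device type 1, any vendor */
	{ 0 },
};

static int example_probe(struct virtio_device *vdev)
{
	/* Create virtqueues via vdev->config, stash state in vdev->priv. */
	return 0;
}

static void example_remove(struct virtio_device *vdev)
{
	/* Undo whatever example_probe() set up. */
}

static struct virtio_driver example_driver = {
	.driver.name	= "example-virtio",
	.driver.owner	= THIS_MODULE,
	.id_table	= example_id_table,
	.probe		= example_probe,
	.remove		= example_remove,
};
/* module_init() would then call register_virtio_driver(&example_driver). */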
51 include/linux/virtio_blk.h Normal file
@ -0,0 +1,51 @@
#ifndef _LINUX_VIRTIO_BLK_H
#define _LINUX_VIRTIO_BLK_H
#include <linux/virtio_config.h>

/* The ID for virtio_block */
#define VIRTIO_ID_BLOCK	2

/* Feature bits */
#define VIRTIO_CONFIG_BLK_F	0x40
#define VIRTIO_BLK_F_BARRIER	1	/* Does host support barriers? */

/* The capacity (in 512-byte sectors). */
#define VIRTIO_CONFIG_BLK_F_CAPACITY	0x41
/* The maximum segment size. */
#define VIRTIO_CONFIG_BLK_F_SIZE_MAX	0x42
/* The maximum number of segments. */
#define VIRTIO_CONFIG_BLK_F_SEG_MAX	0x43

/* These two define direction. */
#define VIRTIO_BLK_T_IN		0
#define VIRTIO_BLK_T_OUT	1

/* This bit says it's a scsi command, not an actual read or write. */
#define VIRTIO_BLK_T_SCSI_CMD	2

/* Barrier before this op. */
#define VIRTIO_BLK_T_BARRIER	0x80000000

/* This is the first element of the read scatter-gather list. */
struct virtio_blk_outhdr
{
	/* VIRTIO_BLK_T* */
	__u32 type;
	/* io priority. */
	__u32 ioprio;
	/* Sector (ie. 512 byte offset) */
	__u64 sector;
	/* Where to put reply. */
	__u64 id;
};

#define VIRTIO_BLK_S_OK		0
#define VIRTIO_BLK_S_IOERR	1
#define VIRTIO_BLK_S_UNSUPP	2

/* This is the first element of the write scatter-gather list */
struct virtio_blk_inhdr
{
	unsigned char status;
};
#endif /* _LINUX_VIRTIO_BLK_H */
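These headers map directly onto the virtqueue convention of readable segments first, writable segments after. A hedged sketch of a one-sector read, assuming a virtqueue obtained from find_vq() and the sg_set_buf() helper (names and values illustrative only):

/* Hypothetical: build and submit a single-sector read request. */
struct virtio_blk_outhdr out = {
	.type = VIRTIO_BLK_T_IN,	/* host writes data back to us */
	.ioprio = 0,
	.sector = 0,
	.id = 0,			/* tag echoed back with the reply */
};
struct virtio_blk_inhdr in;		/* host fills in .status */
char data[512];
struct scatterlist sg[3];

sg_set_buf(&sg[0], &out, sizeof(out));	/* readable: request header */
sg_set_buf(&sg[1], data, sizeof(data));	/* writable: sector data */
sg_set_buf(&sg[2], &in, sizeof(in));	/* writable: status byte */
vq->vq_ops->add_buf(vq, sg, 1, 2, &out);	/* 1 out, 2 in */
vq->vq_ops->kick(vq);
/* When get_buf() returns &out, check in.status against VIRTIO_BLK_S_*. */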
111 include/linux/virtio_config.h Normal file
@ -0,0 +1,111 @@
#ifndef _LINUX_VIRTIO_CONFIG_H
#define _LINUX_VIRTIO_CONFIG_H
/* Virtio devices use a standardized configuration space to define their
 * features and pass configuration information, but each implementation can
 * store and access that space differently. */
#include <linux/types.h>

/* Status byte for guest to report progress, and synchronize config. */
/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
#define VIRTIO_CONFIG_S_ACKNOWLEDGE	1
/* We have found a driver for the device. */
#define VIRTIO_CONFIG_S_DRIVER		2
/* Driver has used its parts of the config, and is happy */
#define VIRTIO_CONFIG_S_DRIVER_OK	4
/* We've given up on this device. */
#define VIRTIO_CONFIG_S_FAILED		0x80

/* Feature byte (actually 7 bits available): */
/* Requirements/features of the virtio implementation. */
#define VIRTIO_CONFIG_F_VIRTIO	1
/* Requirements/features of the virtqueue (may have more than one). */
#define VIRTIO_CONFIG_F_VIRTQUEUE	2

#ifdef __KERNEL__
struct virtio_device;

/**
 * virtio_config_ops - operations for configuring a virtio device
 * @find: search for the next configuration field of the given type.
 *	vdev: the virtio_device
 *	type: the feature type
 *	len: the (returned) length of the field if found.
 *	Returns a token if found, or NULL.  Never returns the same field twice
 *	(ie. it's used up).
 * @get: read the value of a configuration field after find().
 *	vdev: the virtio_device
 *	token: the token returned from find().
 *	buf: the buffer to write the field value into.
 *	len: the length of the buffer (given by find()).
 *	Note that contents are conventionally little-endian.
 * @set: write the value of a configuration field after find().
 *	vdev: the virtio_device
 *	token: the token returned from find().
 *	buf: the buffer to read the field value from.
 *	len: the length of the buffer (given by find()).
 *	Note that contents are conventionally little-endian.
 * @get_status: read the status byte
 *	vdev: the virtio_device
 *	Returns the status byte
 * @set_status: write the status byte
 *	vdev: the virtio_device
 *	status: the new status byte
 * @find_vq: find the first VIRTIO_CONFIG_F_VIRTQUEUE and create a virtqueue.
 *	vdev: the virtio_device
 *	callback: the virtqueue callback
 *	Returns the new virtqueue or ERR_PTR().
 * @del_vq: free a virtqueue found by find_vq().
 */
struct virtio_config_ops
{
	void *(*find)(struct virtio_device *vdev, u8 type, unsigned *len);
	void (*get)(struct virtio_device *vdev, void *token,
		    void *buf, unsigned len);
	void (*set)(struct virtio_device *vdev, void *token,
		    const void *buf, unsigned len);
	u8 (*get_status)(struct virtio_device *vdev);
	void (*set_status)(struct virtio_device *vdev, u8 status);
	struct virtqueue *(*find_vq)(struct virtio_device *vdev,
				     bool (*callback)(struct virtqueue *));
	void (*del_vq)(struct virtqueue *vq);
};
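In a probe routine this boils down to: advance the status byte, find a queue, and either declare readiness or failure. A hedged sketch under those assumptions (function name hypothetical):

/* Hypothetical probe-time use of the config ops. */
static int example_setup(struct virtio_device *vdev,
			 bool (*cb)(struct virtqueue *))
{
	struct virtqueue *vq;
	u8 status = vdev->config->get_status(vdev);

	vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER);
	vq = vdev->config->find_vq(vdev, cb);
	if (IS_ERR(vq)) {
		vdev->config->set_status(vdev, VIRTIO_CONFIG_S_FAILED);
		return PTR_ERR(vq);
	}
	status = vdev->config->get_status(vdev);
	vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER_OK);
	return 0;
}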
/**
 * virtio_config_val - get a single virtio config and mark it used.
 * @config: the virtio config space
 * @type: the type to search for.
 * @val: a pointer to the value to fill in.
 *
 * Once used, the config type is marked with VIRTIO_CONFIG_F_USED so it can't
 * be found again.  This version does endian conversion. */
#define virtio_config_val(vdev, type, v) ({ \
	int _err = __virtio_config_val((vdev),(type),(v),sizeof(*(v))); \
	\
	BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \
		     && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \
	if (!_err) { \
		switch (sizeof(*(v))) { \
		case 2: le16_to_cpus((__u16 *) v); break; \
		case 4: le32_to_cpus((__u32 *) v); break; \
		case 8: le64_to_cpus((__u64 *) v); break; \
		} \
	} \
	_err; \
})

int __virtio_config_val(struct virtio_device *dev,
			u8 type, void *val, size_t size);

/**
 * virtio_use_bit - helper to use a feature bit in a bitfield value.
 * @dev: the virtio device
 * @token: the token as returned from vdev->config->find().
 * @len: the length of the field.
 * @bitnum: the bit to test.
 *
 * If handed a NULL token, it returns false, otherwise returns bit status.
 * If it's one, it sets the mirroring acknowledgement bit. */
int virtio_use_bit(struct virtio_device *vdev,
		   void *token, unsigned int len, unsigned int bitnum);
#endif /* __KERNEL__ */
#endif /* _LINUX_VIRTIO_CONFIG_H */
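As a concrete (hedged) use of the macro: a block driver could read its 64-bit capacity field in one call. The field is located by type, copied out, endian-converted by size, and marked used so it cannot be found again:

/* Hypothetical: pull the capacity field out of a block device's config. */
__u64 capacity;
int err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &capacity);
if (err)
	; /* field missing or the wrong size */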
12 include/linux/virtio_console.h Normal file
@ -0,0 +1,12 @@
#ifndef _LINUX_VIRTIO_CONSOLE_H
#define _LINUX_VIRTIO_CONSOLE_H
#include <linux/virtio_config.h>

/* The ID for virtio console */
#define VIRTIO_ID_CONSOLE	3

#ifdef __KERNEL__
int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int));
#endif /* __KERNEL__ */

#endif /* _LINUX_VIRTIO_CONSOLE_H */
36 include/linux/virtio_net.h Normal file
@ -0,0 +1,36 @@
#ifndef _LINUX_VIRTIO_NET_H
#define _LINUX_VIRTIO_NET_H
#include <linux/virtio_config.h>

/* The ID for virtio_net */
#define VIRTIO_ID_NET	1

/* The bitmap of config for virtio net */
#define VIRTIO_CONFIG_NET_F	0x40
#define VIRTIO_NET_F_NO_CSUM	0
#define VIRTIO_NET_F_TSO4	1
#define VIRTIO_NET_F_UFO	2
#define VIRTIO_NET_F_TSO4_ECN	3
#define VIRTIO_NET_F_TSO6	4

/* The config defining mac address. */
#define VIRTIO_CONFIG_NET_MAC_F	0x41

/* This is the first element of the scatter-gather list.  If you don't
 * specify GSO or CSUM features, you can simply ignore the header. */
struct virtio_net_hdr
{
#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	// Use csum_start, csum_offset
	__u8 flags;
#define VIRTIO_NET_HDR_GSO_NONE		0	// Not a GSO frame
#define VIRTIO_NET_HDR_GSO_TCPV4	1	// GSO frame, IPv4 TCP (TSO)
/* FIXME: Do we need this?  If they said they can handle ECN, do they care? */
#define VIRTIO_NET_HDR_GSO_TCPV4_ECN	2	// GSO frame, IPv4 TCP w/ ECN
#define VIRTIO_NET_HDR_GSO_UDP		3	// GSO frame, IPv4 UDP (UFO)
#define VIRTIO_NET_HDR_GSO_TCPV6	4	// GSO frame, IPv6 TCP
	__u8 gso_type;
	__u16 gso_size;
	__u16 csum_start;
	__u16 csum_offset;
};
#endif /* _LINUX_VIRTIO_NET_H */
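To make the checksum fields concrete, here is a hedged example of the header a sender might prepend to a TCP/IPv4 frame whose checksum the receiver should finish. The offsets assume a 14-byte Ethernet header and a 20-byte IP header; they are illustrative, not part of this commit:

/* Hypothetical header for a partially-checksummed TCP/IPv4 frame. */
struct virtio_net_hdr hdr = {
	.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM,
	.gso_type = VIRTIO_NET_HDR_GSO_NONE,	/* not a GSO frame */
	.gso_size = 0,
	.csum_start = 14 + 20,	/* checksumming starts at the TCP header */
	.csum_offset = 16,	/* checksum field's offset within it */
};
/* The header goes in sg element 0; the frame data follows it. */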
119 include/linux/virtio_ring.h Normal file
@ -0,0 +1,119 @@
#ifndef _LINUX_VIRTIO_RING_H
#define _LINUX_VIRTIO_RING_H
/* An interface for efficient virtio implementation, currently for use by KVM
 * and lguest, but hopefully others soon.  Do NOT change this since it will
 * break existing servers and clients.
 *
 * This header is BSD licensed so anyone can use the definitions to implement
 * compatible drivers/servers.
 *
 * Copyright Rusty Russell IBM Corporation 2007. */
#include <linux/types.h>

/* This marks a buffer as continuing via the next field. */
#define VRING_DESC_F_NEXT	1
/* This marks a buffer as write-only (otherwise read-only). */
#define VRING_DESC_F_WRITE	2

/* This means don't notify other side when buffer added. */
#define VRING_USED_F_NO_NOTIFY	1
/* This means don't interrupt guest when buffer consumed. */
#define VRING_AVAIL_F_NO_INTERRUPT	1

/* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
struct vring_desc
{
	/* Address (guest-physical). */
	__u64 addr;
	/* Length. */
	__u32 len;
	/* The flags as indicated above. */
	__u16 flags;
	/* We chain unused descriptors via this, too */
	__u16 next;
};

struct vring_avail
{
	__u16 flags;
	__u16 idx;
	__u16 ring[];
};

/* u32 is used here for ids for padding reasons. */
struct vring_used_elem
{
	/* Index of start of used descriptor chain. */
	__u32 id;
	/* Total length of the descriptor chain which was used (written to) */
	__u32 len;
};

struct vring_used
{
	__u16 flags;
	__u16 idx;
	struct vring_used_elem ring[];
};

struct vring {
	unsigned int num;

	struct vring_desc *desc;

	struct vring_avail *avail;

	struct vring_used *used;
};
/* The standard layout for the ring is a contiguous chunk of memory which looks
 * like this.  The used fields will be aligned to a "num+1" boundary.
 *
 * struct vring
 * {
 *	// The actual descriptors (16 bytes each)
 *	struct vring_desc desc[num];
 *
 *	// A ring of available descriptor heads with free-running index.
 *	__u16 avail_flags;
 *	__u16 avail_idx;
 *	__u16 available[num];
 *
 *	// Padding so a correctly-chosen num value will cache-align used_idx.
 *	char pad[sizeof(struct vring_desc) - sizeof(avail_flags)];
 *
 *	// A ring of used descriptor heads with free-running index.
 *	__u16 used_flags;
 *	__u16 used_idx;
 *	struct vring_used_elem used[num];
 * };
 */
static inline void vring_init(struct vring *vr, unsigned int num, void *p)
{
	vr->num = num;
	vr->desc = p;
	/* avail starts right after desc[num]; used follows the padded
	 * (num+1)-sized descriptor/avail area, per the layout above. */
	vr->avail = p + num*sizeof(struct vring_desc);
	vr->used = p + (num+1)*(sizeof(struct vring_desc) + sizeof(__u16));
}

static inline unsigned vring_size(unsigned int num)
{
	return (num + 1) * (sizeof(struct vring_desc) + sizeof(__u16))
		+ sizeof(__u32) + num * sizeof(struct vring_used_elem);
}
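A quick worked instance of vring_size(): for num = 128, the descriptor/avail area (including the pad) is (128 + 1) * (16 + 2) = 2322 bytes, the __u32 covers the used flags/idx pair, and the used ring adds 128 * 8 = 1024, for 3350 bytes in total:

/* Illustrative only: memory needed for a 128-entry ring. */
unsigned int bytes = vring_size(128);	/* 2322 + 4 + 1024 = 3350 */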
#ifdef __KERNEL__
#include <linux/irqreturn.h>
struct virtio_device;
struct virtqueue;

struct virtqueue *vring_new_virtqueue(unsigned int num,
				      struct virtio_device *vdev,
				      void *pages,
				      void (*notify)(struct virtqueue *vq),
				      bool (*callback)(struct virtqueue *vq));
void vring_del_virtqueue(struct virtqueue *vq);

irqreturn_t vring_interrupt(int irq, void *_vq);
#endif /* __KERNEL__ */
#endif /* _LINUX_VIRTIO_RING_H */
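A hedged sketch of how a transport might wire one of these up, assuming pages points at vring_size(num) bytes of memory shared with the host (the notify hook and queue size are hypothetical):

/* Hypothetical transport glue around vring_new_virtqueue(). */
static void example_notify(struct virtqueue *vq)
{
	/* e.g. a hypercall or register write telling the host to look */
}

vq = vring_new_virtqueue(128, vdev, pages, example_notify, callback);
/* Route the queue's interrupt to vring_interrupt(irq, vq); tear down
 * with vring_del_virtqueue(vq) on the way out. */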
@ -1 +1,2 @@
unifdef-y += sisfb.h uvesafb.h
unifdef-y += edid.h
@ -1,17 +1,16 @@
#ifndef __linux_video_edid_h__
#define __linux_video_edid_h__

#ifdef __KERNEL__
#if !defined(__KERNEL__) || defined(CONFIG_X86)

#ifdef CONFIG_X86
struct edid_info {
	unsigned char dummy[128];
};

#ifdef __KERNEL__
extern struct edid_info edid_info;
#endif /* CONFIG_X86 */

#endif /* __KERNEL__ */

#endif

#endif /* __linux_video_edid_h__ */
@ -525,6 +525,20 @@ static int do_ssb_entry(const char *filename,
	return 1;
}

/* Looks like: virtio:dNvN */
static int do_virtio_entry(const char *filename, struct virtio_device_id *id,
			   char *alias)
{
	id->device = TO_NATIVE(id->device);
	id->vendor = TO_NATIVE(id->vendor);

	strcpy(alias, "virtio:");
	ADD(alias, "d", 1, id->device);
	ADD(alias, "v", id->vendor != VIRTIO_DEV_ANY_ID, id->vendor);

	return 1;
}

/* Ignore any prefix, eg. v850 prepends _ */
static inline int sym_is(const char *symbol, const char *name)
{
@ -651,6 +665,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info,
		do_table(symval, sym->st_size,
			 sizeof(struct ssb_device_id), "ssb",
			 do_ssb_entry, mod);
	else if (sym_is(symname, "__mod_virtio_device_table"))
		do_table(symval, sym->st_size,
			 sizeof(struct virtio_device_id), "virtio",
			 do_virtio_entry, mod);
	free(zeros);
}
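The net effect (hedged, but following the ADD() format above): a driver whose table matches device type 1 with any vendor is given the alias "virtio:d00000001v*", so a newly announced device of that type autoloads it. Illustratively:

/* Hypothetical: a driver's id table becomes a module alias. */
static struct virtio_device_id id_table[] = {
	{ 1, VIRTIO_DEV_ANY_ID },	/* -> "virtio:d00000001v*" */
	{ 0 },
};
MODULE_DEVICE_TABLE(virtio, id_table);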