From 18c137371b2ea86d263b75665a4904a0b8872990 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:09 +1030 Subject: lguest: add operations to get/set a register from the Launcher. We use the ptrace API struct, and we currently don't let them set anything but the normal registers (we'd have to filter the others). Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 8 ++++++++ drivers/lguest/lg.h | 3 +++ drivers/lguest/lguest_user.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ drivers/lguest/x86/core.c | 46 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+) (limited to 'drivers/lguest') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 6590558d1d31..cdb2f9aa5860 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -208,6 +208,14 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { + /* If the launcher asked for a register with LHREQ_GETREG */ + if (cpu->reg_read) { + if (put_user(*cpu->reg_read, user)) + return -EFAULT; + cpu->reg_read = NULL; + return sizeof(*cpu->reg_read); + } + /* We stop running once the Guest is dead. */ while (!cpu->lg->dead) { unsigned int irq; diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 2eef40be4c04..1c98bf74fd68 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -52,6 +52,8 @@ struct lg_cpu { unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ + unsigned long *reg_read; /* register from LHREQ_GETREG */ + /* At end of a page shared mapped over lguest_pages in guest. 
*/ unsigned long regs_page; struct lguest_regs *regs; @@ -210,6 +212,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu); int lguest_arch_init_hypercalls(struct lg_cpu *cpu); int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); +unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any); /* /switcher.S: */ extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 4263f4cc8c55..7f14c152dd23 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -173,6 +173,51 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) return err; } +/* The Launcher can get the registers, and also set some of them. */ +static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) +{ + unsigned long which; + + /* We re-use the ptrace structure to specify which register to read. */ + if (get_user(which, input) != 0) + return -EFAULT; + + /* + * We set up the cpu register pointer, and their next read will + * actually get the value (instead of running the guest). + * + * The last argument 'true' says we can access any register. + */ + cpu->reg_read = lguest_arch_regptr(cpu, which, true); + if (!cpu->reg_read) + return -ENOENT; + + /* And because this is a write() call, we return the length used. */ + return sizeof(unsigned long) * 2; +} + +static int setreg(struct lg_cpu *cpu, const unsigned long __user *input) +{ + unsigned long which, value, *reg; + + /* We re-use the ptrace structure to specify which register to read. */ + if (get_user(which, input) != 0) + return -EFAULT; + input++; + if (get_user(value, input) != 0) + return -EFAULT; + + /* The last argument 'false' means we can't access all registers. 
*/ + reg = lguest_arch_regptr(cpu, which, false); + if (!reg) + return -ENOENT; + + *reg = value; + + /* And because this is a write() call, we return the length used. */ + return sizeof(unsigned long) * 3; +} + /*L:050 * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt * number to /dev/lguest. @@ -434,6 +479,10 @@ static ssize_t write(struct file *file, const char __user *in, return user_send_irq(cpu, input); case LHREQ_EVENTFD: return attach_eventfd(lg, input); + case LHREQ_GETREG: + return getreg_setup(cpu, input); + case LHREQ_SETREG: + return setreg(cpu, input); default: return -EINVAL; } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 922a1acbf652..f7a16b4ea456 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -181,6 +181,52 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) } /*:*/ +unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any) +{ + switch (reg_off) { + case offsetof(struct pt_regs, bx): + return &cpu->regs->ebx; + case offsetof(struct pt_regs, cx): + return &cpu->regs->ecx; + case offsetof(struct pt_regs, dx): + return &cpu->regs->edx; + case offsetof(struct pt_regs, si): + return &cpu->regs->esi; + case offsetof(struct pt_regs, di): + return &cpu->regs->edi; + case offsetof(struct pt_regs, bp): + return &cpu->regs->ebp; + case offsetof(struct pt_regs, ax): + return &cpu->regs->eax; + case offsetof(struct pt_regs, ip): + return &cpu->regs->eip; + case offsetof(struct pt_regs, sp): + return &cpu->regs->esp; + } + + /* Launcher can read these, but we don't allow any setting. 
*/ + if (any) { + switch (reg_off) { + case offsetof(struct pt_regs, ds): + return &cpu->regs->ds; + case offsetof(struct pt_regs, es): + return &cpu->regs->es; + case offsetof(struct pt_regs, fs): + return &cpu->regs->fs; + case offsetof(struct pt_regs, gs): + return &cpu->regs->gs; + case offsetof(struct pt_regs, cs): + return &cpu->regs->cs; + case offsetof(struct pt_regs, flags): + return &cpu->regs->eflags; + case offsetof(struct pt_regs, ss): + return &cpu->regs->ss; + } + } + + return NULL; +} + /*M:002 * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us -- cgit v1.2.3-58-ga151 From 69a09dc1742ffbb3b02f3a1e03da4801e96452e9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:09 +1030 Subject: lguest: write more information to userspace about pending traps. This is preparation for userspace handling MMIO and ioport accesses. Signed-off-by: Rusty Russell --- drivers/lguest/core.c | 7 ++++--- drivers/lguest/hypercalls.c | 7 ++++--- drivers/lguest/lg.h | 3 ++- drivers/lguest/lguest_user.c | 14 +++++++++----- include/linux/lguest_launcher.h | 13 +++++++++++++ tools/lguest/lguest.c | 16 ++++++++++------ 6 files changed, 42 insertions(+), 18 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index cdb2f9aa5860..9159dbc583f6 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -229,16 +229,17 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) * It's possible the Guest did a NOTIFY hypercall to the * Launcher. */ - if (cpu->pending_notify) { + if (cpu->pending.trap) { /* * Does it just needs to write to a registered * eventfd (ie. the appropriate virtqueue thread)? */ if (!send_notify_to_eventfd(cpu)) { /* OK, we tell the main Launcher. 
*/ - if (put_user(cpu->pending_notify, user)) + if (copy_to_user(user, &cpu->pending, + sizeof(cpu->pending))) return -EFAULT; - return sizeof(cpu->pending_notify); + return sizeof(cpu->pending); } } diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 83511eb0923d..5dd1fb8a6610 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -118,7 +118,8 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) cpu->halted = 1; break; case LHCALL_NOTIFY: - cpu->pending_notify = args->arg1; + cpu->pending.trap = LGUEST_TRAP_ENTRY; + cpu->pending.addr = args->arg1; break; default: /* It should be an architecture-specific hypercall. */ @@ -189,7 +190,7 @@ static void do_async_hcalls(struct lg_cpu *cpu) * Stop doing hypercalls if they want to notify the Launcher: * it needs to service this first. */ - if (cpu->pending_notify) + if (cpu->pending.trap) break; } } @@ -280,7 +281,7 @@ void do_hypercalls(struct lg_cpu *cpu) * NOTIFY to the Launcher, we want to return now. Otherwise we do * the hypercall. */ - if (!cpu->pending_notify) { + if (!cpu->pending.trap) { do_hcall(cpu, cpu->hcall); /* * Tricky point: we reset the hcall pointer to mark the diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 1c98bf74fd68..020fec5bb072 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -50,7 +50,8 @@ struct lg_cpu { /* Bitmap of what has changed: see CHANGED_* above. */ int changed; - unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ + /* Pending operation. */ + struct lguest_pending pending; unsigned long *reg_read; /* register from LHREQ_GETREG */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 7f14c152dd23..dcf9efd94cf4 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -29,6 +29,10 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) unsigned int i; struct lg_eventfd_map *map; + /* We only connect LHCALL_NOTIFY to event fds, not other traps. 
*/ + if (cpu->pending.trap != LGUEST_TRAP_ENTRY) + return false; + /* * This "rcu_read_lock()" helps track when someone is still looking at * the (RCU-using) eventfds array. It's not actually a lock at all; @@ -52,9 +56,9 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) * we'll continue to use the old array and just won't see the new one. */ for (i = 0; i < map->num; i++) { - if (map->map[i].addr == cpu->pending_notify) { + if (map->map[i].addr == cpu->pending.addr) { eventfd_signal(map->map[i].event, 1); - cpu->pending_notify = 0; + cpu->pending.trap = 0; break; } } @@ -62,7 +66,7 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) rcu_read_unlock(); /* If we cleared the notification, it's because we found a match. */ - return cpu->pending_notify == 0; + return cpu->pending.trap == 0; } /*L:055 @@ -282,8 +286,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) * If we returned from read() last time because the Guest sent I/O, * clear the flag. */ - if (cpu->pending_notify) - cpu->pending_notify = 0; + if (cpu->pending.trap) + cpu->pending.trap = 0; /* Run the Guest until something interesting happens. */ return run_guest(cpu, (unsigned long __user *)user); diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index f27cae27b0c1..c4451ebece47 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -67,6 +67,19 @@ enum lguest_req LHREQ_SETREG, /* + offset within struct pt_regs, value. */ }; +/* + * This is what read() of the lguest fd populates. trap == + * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the + * argument), 14 for a page fault in the MMIO region (addr is + * the trap address, insn is the instruction), or 13 for a GPF + * (insn is the instruction). + */ +struct lguest_pending { + __u8 trap; + __u8 insn[7]; + __u32 addr; +}; + /* * The alignment to use between consumer and producer parts of vring. * x86 pagesize for historical reasons. 
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 3f7f2326cd9a..0e754d04876d 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -1820,17 +1820,21 @@ static void __attribute__((noreturn)) restart_guest(void) static void __attribute__((noreturn)) run_guest(void) { for (;;) { - unsigned long notify_addr; + struct lguest_pending notify; int readval; /* We read from the /dev/lguest device to run the Guest. */ - readval = pread(lguest_fd, &notify_addr, - sizeof(notify_addr), cpu_id); + readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id); /* One unsigned long means the Guest did HCALL_NOTIFY */ - if (readval == sizeof(notify_addr)) { - verbose("Notify on address %#lx\n", notify_addr); - handle_output(notify_addr); + if (readval == sizeof(notify)) { + if (notify.trap == 0x1F) { + verbose("Notify on address %#08x\n", + notify.addr); + handle_output(notify.addr); + } else + errx(1, "Unknown trap %i addr %#08x\n", + notify.trap, notify.addr); /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { char reason[1024] = { 0 }; -- cgit v1.2.3-58-ga151 From 8ed313001a892f240269dea05d4b925cbd150492 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:09 +1030 Subject: lguest: add infrastructure for userspace to deliver a trap to the guest. This is required for instruction emulation to move to userspace. Signed-off-by: Rusty Russell --- drivers/lguest/lguest_user.c | 19 +++++++++++++++++++ include/linux/lguest_launcher.h | 1 + 2 files changed, 20 insertions(+) (limited to 'drivers/lguest') diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index dcf9efd94cf4..be996d173615 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -243,6 +243,23 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return 0; } +/*L:053 + * Deliver a trap: this is used by the Launcher if it can't emulate + * an instruction. 
+ */ +static int trap(struct lg_cpu *cpu, const unsigned long __user *input) +{ + unsigned long trapnum; + + if (get_user(trapnum, input) != 0) + return -EFAULT; + + if (!deliver_trap(cpu, trapnum)) + return -EINVAL; + + return 0; +} + /*L:040 * Once our Guest is initialized, the Launcher makes it run by reading * from /dev/lguest. @@ -487,6 +504,8 @@ static ssize_t write(struct file *file, const char __user *in, return getreg_setup(cpu, input); case LHREQ_SETREG: return setreg(cpu, input); + case LHREQ_TRAP: + return trap(cpu, input); default: return -EINVAL; } diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index c4451ebece47..3c402b843e03 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -65,6 +65,7 @@ enum lguest_req LHREQ_EVENTFD, /* + address, fd. */ LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ LHREQ_SETREG, /* + offset within struct pt_regs, value. */ + LHREQ_TRAP, /* + trap number to deliver to guest. */ }; /* -- cgit v1.2.3-58-ga151 From c9e433e4b852b70ea267388cf9b5d8096b04c44c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:09 +1030 Subject: lguest: add infrastructure to check mappings. We normally abort the guest unconditionally when it gives us a bad address, but in the next patch we want to copy some bytes which may not be mapped. 
Signed-off-by: Rusty Russell --- drivers/lguest/lg.h | 1 + drivers/lguest/page_tables.c | 42 +++++++++++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 020fec5bb072..9da4f351e077 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -202,6 +202,7 @@ void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode); void pin_page(struct lg_cpu *cpu, unsigned long vaddr); +bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr); unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); void page_table_guest_data_init(struct lg_cpu *cpu); diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index e8b55c3a6170..69c35caa955a 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -647,7 +647,7 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu) /*:*/ /* We walk down the guest page tables to get a guest-physical address */ -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) +bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr) { pgd_t gpgd; pte_t gpte; @@ -656,31 +656,47 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) #endif /* Still not set up? Just map 1:1. */ - if (unlikely(cpu->linear_pages)) - return vaddr; + if (unlikely(cpu->linear_pages)) { + *paddr = vaddr; + return true; + } /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. 
*/ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) { - kill_guest(cpu, "Bad address %#lx", vaddr); - return -1UL; - } + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) + goto fail; #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) { - kill_guest(cpu, "Bad address %#lx", vaddr); - return -1UL; - } + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + goto fail; gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); #else gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); #endif if (!(pte_flags(gpte) & _PAGE_PRESENT)) - kill_guest(cpu, "Bad address %#lx", vaddr); + goto fail; + + *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); + return true; + +fail: + *paddr = -1UL; + return false; +} - return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); +/* + * This is the version we normally use: kills the Guest if it uses a + * bad address + */ +unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) +{ + unsigned long paddr; + + if (!__guest_pa(cpu, vaddr, &paddr)) + kill_guest(cpu, "Bad address %#lx", vaddr); + return paddr; } /* -- cgit v1.2.3-58-ga151 From c565650b1028bc551e5d16dd0ec8f7078da7cace Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:10 +1030 Subject: lguest: send trap 13 through to userspace. We copy 7 bytes at eip for userspace's instruction decode; we have to carefully handle the case where eip is at the end of a page. We can't leave this to userspace since kernel has all the page table decode logic. The decode logic moves to userspace, basically unchanged. 
Signed-off-by: Rusty Russell --- drivers/lguest/x86/core.c | 133 +++++++++++++---------------------------- tools/lguest/lguest.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 90 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index f7a16b4ea456..42e87bf14113 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -314,95 +314,52 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * usually attached to a PC. * * When the Guest uses one of these instructions, we get a trap (General - * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. + * Protection Fault) and come here. We queue this to be sent out to the + * Launcher to handle. */ -static int emulate_insn(struct lg_cpu *cpu) -{ - u8 insn; - unsigned int insnlen = 0, in = 0, small_operand = 0; - /* - * The eip contains the *virtual* address of the Guest's instruction: - * walk the Guest's page tables to find the "physical" address. - */ - unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); - - /* - * This must be the Guest kernel trying to do something, not userspace! - * The bottom two bits of the CS segment register are the privilege - * level. - */ - if ((cpu->regs->cs & 3) != GUEST_PL) - return 0; - /* Decoding x86 instructions is icky. */ - insn = lgread(cpu, physaddr, u8); - - /* - * Around 2.6.33, the kernel started using an emulation for the - * cmpxchg8b instruction in early boot on many configurations. This - * code isn't paravirtualized, and it tries to disable interrupts. - * Ignore it, which will Mostly Work. - */ - if (insn == 0xfa) { - /* "cli", or Clear Interrupt Enable instruction. Skip it. 
*/ - cpu->regs->eip++; - return 1; +/* + * The eip contains the *virtual* address of the Guest's instruction: + * we copy the instruction here so the Launcher doesn't have to walk + * the page tables to decode it. We handle the case (eg. in a kernel + * module) where the instruction is over two pages, and the pages are + * virtually but not physically contiguous. + * + * The longest possible x86 instruction is 15 bytes, but we don't handle + * anything that strange. + */ +static void copy_from_guest(struct lg_cpu *cpu, + void *dst, unsigned long vaddr, size_t len) +{ + size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE); + unsigned long paddr; + + BUG_ON(len > PAGE_SIZE); + + /* If it goes over a page, copy in two parts. */ + if (len > to_page_end) { + /* But make sure the next page is mapped! */ + if (__guest_pa(cpu, vaddr + to_page_end, &paddr)) + copy_from_guest(cpu, dst + to_page_end, + vaddr + to_page_end, + len - to_page_end); + else + /* Otherwise fill with zeroes. */ + memset(dst + to_page_end, 0, len - to_page_end); + len = to_page_end; } - /* - * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. - */ - if (insn == 0x66) { - small_operand = 1; - /* The instruction is 1 byte so far, read the next byte. */ - insnlen = 1; - insn = lgread(cpu, physaddr + insnlen, u8); - } + /* This will kill the guest if it isn't mapped, but that + * shouldn't happen. */ + __lgread(cpu, dst, guest_pa(cpu, vaddr), len); +} - /* - * We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. - */ - switch (insn & 0xFE) { - case 0xE4: /* in ,%al */ - insnlen += 2; - in = 1; - break; - case 0xEC: /* in (%dx),%al */ - insnlen += 1; - in = 1; - break; - case 0xE6: /* out %al, */ - insnlen += 2; - break; - case 0xEE: /* out %al,(%dx) */ - insnlen += 1; - break; - default: - /* OK, we don't know what this is, can't emulate. 
*/ - return 0; - } - /* - * If it was an "IN" instruction, they expect the result to be read - * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". - */ - if (in) { - /* Lower bit tells means it's a 32/16 bit access */ - if (insn & 0x1) { - if (small_operand) - cpu->regs->eax |= 0xFFFF; - else - cpu->regs->eax = 0xFFFFFFFF; - } else - cpu->regs->eax |= 0xFF; - } - /* Finally, we've "done" the instruction, so move past it. */ - cpu->regs->eip += insnlen; - /* Success! */ - return 1; +static void setup_emulate_insn(struct lg_cpu *cpu) +{ + cpu->pending.trap = 13; + copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, + sizeof(cpu->pending.insn)); } /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ @@ -410,14 +367,10 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) { switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ - /* - * Check if this was one of those annoying IN or OUT - * instructions which we need to emulate. If so, we just go - * back into the Guest after we've done it. - */ + /* Hand to Launcher to emulate those pesky IN and OUT insns */ if (cpu->regs->errcode == 0) { - if (emulate_insn(cpu)) - return; + setup_emulate_insn(cpu); + return; } break; case 14: /* We've intercepted a Page Fault. */ diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 0e754d04876d..b2217657f62c 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -41,6 +41,7 @@ #include #include #include +#include #ifndef VIRTIO_F_ANY_LAYOUT #define VIRTIO_F_ANY_LAYOUT 27 @@ -1143,6 +1144,150 @@ static void handle_output(unsigned long addr) strnlen(from_guest_phys(addr), guest_limit - addr)); } +/*L:216 + * This is where we emulate a handful of Guest instructions. It's ugly + * and we used to do it in the kernel but it grew over time. 
+ */ + +/* + * We use the ptrace syscall's pt_regs struct to talk about registers + * to lguest: these macros convert the names to the offsets. + */ +#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name)) +#define setreg(name, val) \ + setreg_off(offsetof(struct user_regs_struct, name), (val)) + +static u32 getreg_off(size_t offset) +{ + u32 r; + unsigned long args[] = { LHREQ_GETREG, offset }; + + if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) + err(1, "Getting register %u", offset); + if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r)) + err(1, "Reading register %u", offset); + + return r; +} + +static void setreg_off(size_t offset, u32 val) +{ + unsigned long args[] = { LHREQ_SETREG, offset, val }; + + if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) + err(1, "Setting register %u", offset); +} + +static void emulate_insn(const u8 insn[]) +{ + unsigned long args[] = { LHREQ_TRAP, 13 }; + unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access; + unsigned int eax, port, mask; + /* + * We always return all-ones on IO port reads, which traditionally + * means "there's nothing there". + */ + u32 val = 0xFFFFFFFF; + + /* + * This must be the Guest kernel trying to do something, not userspace! + * The bottom two bits of the CS segment register are the privilege + * level. + */ + if ((getreg(xcs) & 3) != 0x1) + goto no_emulate; + + /* Decoding x86 instructions is icky. */ + + /* + * Around 2.6.33, the kernel started using an emulation for the + * cmpxchg8b instruction in early boot on many configurations. This + * code isn't paravirtualized, and it tries to disable interrupts. + * Ignore it, which will Mostly Work. + */ + if (insn[insnlen] == 0xfa) { + /* "cli", or Clear Interrupt Enable instruction. Skip it. */ + insnlen = 1; + goto skip_insn; + } + + /* + * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. 
+ */ + if (insn[insnlen] == 0x66) { + small_operand = 1; + /* The instruction is 1 byte so far, read the next byte. */ + insnlen = 1; + } + + /* If the lower bit isn't set, it's a single byte access */ + byte_access = !(insn[insnlen] & 1); + + /* + * Now we can ignore the lower bit and decode the 4 opcodes + * we need to emulate. + */ + switch (insn[insnlen] & 0xFE) { + case 0xE4: /* in <next byte>,%al */ + port = insn[insnlen+1]; + insnlen += 2; + in = 1; + break; + case 0xEC: /* in (%dx),%al */ + port = getreg(edx) & 0xFFFF; + insnlen += 1; + in = 1; + break; + case 0xE6: /* out %al,<next byte> */ + port = insn[insnlen+1]; + insnlen += 2; + break; + case 0xEE: /* out %al,(%dx) */ + port = getreg(edx) & 0xFFFF; + insnlen += 1; + break; + default: + /* OK, we don't know what this is, can't emulate. */ + goto no_emulate; + } + + /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */ + if (byte_access) + mask = 0xFF; + else if (small_operand) + mask = 0xFFFF; + else + mask = 0xFFFFFFFF; + + /* + * If it was an "IN" instruction, they expect the result to be read + * into %eax, so we change %eax. + */ + eax = getreg(eax); + + if (in) { + /* Clear the bits we're about to read */ + eax &= ~mask; + /* Copy bits in from val. */ + eax |= val & mask; + /* Now update the register. */ + setreg(eax, eax); + } + + verbose("IO %s of %x to %u: %#08x\n", + in ? "IN" : "OUT", mask, port, eax); +skip_insn: + /* Finally, we've "done" the instruction, so move past it. */ + setreg(eip, getreg(eip) + insnlen); + return; + +no_emulate: + /* Inject trap into Guest. 
*/ + if (write(lguest_fd, args, sizeof(args)) < 0) + err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip)); +} + + /*L:190 * Device Setup * @@ -1832,6 +1977,10 @@ static void __attribute__((noreturn)) run_guest(void) verbose("Notify on address %#08x\n", notify.addr); handle_output(notify.addr); + } else if (notify.trap == 13) { + verbose("Emulating instruction at %#x\n", + getreg(eip)); + emulate_insn(notify.insn); } else errx(1, "Unknown trap %i addr %#08x\n", notify.trap, notify.addr); -- cgit v1.2.3-58-ga151 From 7313d5217e6b9817897172d6a6ff477bdc415ed6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:15:10 +1030 Subject: lguest: add iomem region, where guest page faults get sent to userspace. This lets us implement PCI. Signed-off-by: Rusty Russell --- drivers/lguest/lg.h | 7 ++++++- drivers/lguest/lguest_user.c | 3 ++- drivers/lguest/page_tables.c | 33 ++++++++++++++++++++++++++++++--- drivers/lguest/x86/core.c | 19 ++++++++++++++++++- tools/lguest/lguest.c | 3 ++- 5 files changed, 58 insertions(+), 7 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 9da4f351e077..eb81abc05995 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -97,8 +97,12 @@ struct lguest { struct lg_cpu cpus[NR_CPUS]; unsigned int nr_cpus; + /* Valid guest memory pages must be < this. */ u32 pfn_limit; + /* Device memory is >= pfn_limit and < device_limit. */ + u32 device_limit; + /* * This provides the offset to the base of guest-physical memory in the * Launcher. 
@@ -200,7 +204,8 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu); void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t val); void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); -bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode); +bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode, + unsigned long *iomem); void pin_page(struct lg_cpu *cpu, unsigned long vaddr); bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr); unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index be996d173615..c8b0e8575b44 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -385,7 +385,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* "struct lguest" contains all we (the Host) know about a Guest. */ struct lguest *lg; int err; - unsigned long args[3]; + unsigned long args[4]; /* * We grab the Big Lguest lock, which protects against multiple @@ -419,6 +419,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; + lg->device_limit = args[3]; /* This is the first cpu (cpu 0) and it will start booting at args[2] */ err = lg_cpu_start(&lg->cpus[0], 0, args[2]); diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 69c35caa955a..e3abebc912c0 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -250,6 +250,16 @@ static void release_pte(pte_t pte) } /*:*/ +static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte) +{ + /* We don't handle large pages. 
*/ + if (pte_flags(gpte) & _PAGE_PSE) + return false; + + return (pte_pfn(gpte) >= cpu->lg->pfn_limit + && pte_pfn(gpte) < cpu->lg->device_limit); +} + static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) { if ((pte_flags(gpte) & _PAGE_PSE) || @@ -374,8 +384,14 @@ static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, * * If we fixed up the fault (ie. we mapped the address), this routine returns * true. Otherwise, it was a real fault and we need to tell the Guest. + * + * There's a corner case: they're trying to access memory between + * pfn_limit and device_limit, which is I/O memory. In this case, we + * return false and set @iomem to the physical address, so the + * Launcher can handle the instruction manually. */ -bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode, + unsigned long *iomem) { unsigned long gpte_ptr; pte_t gpte; @@ -383,6 +399,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pmd_t gpmd; pgd_t gpgd; + *iomem = 0; + /* We never demand page the Switcher, so trying is a mistake. */ if (vaddr >= switcher_addr) return false; @@ -459,6 +477,12 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return false; + /* If they're accessing io memory, we expect a fault. */ + if (gpte_in_iomem(cpu, gpte)) { + *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); + return false; + } + /* * Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). 
@@ -553,7 +577,9 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) */ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) { - if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) + unsigned long iomem; + + if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem)) kill_guest(cpu, "bad stack page %#lx", vaddr); } /*:*/ @@ -928,7 +954,8 @@ static void __guest_set_pte(struct lg_cpu *cpu, int idx, * now. This shaves 10% off a copy-on-write * micro-benchmark. */ - if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) + && !gpte_in_iomem(cpu, gpte)) { if (!check_gpte(cpu, gpte)) return; set_pte(spte, diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 42e87bf14113..18d841e738bc 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -362,9 +362,19 @@ static void setup_emulate_insn(struct lg_cpu *cpu) sizeof(cpu->pending.insn)); } +static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr) +{ + cpu->pending.trap = 14; + cpu->pending.addr = iomem_addr; + copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, + sizeof(cpu->pending.insn)); +} + /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ void lguest_arch_handle_trap(struct lg_cpu *cpu) { + unsigned long iomem_addr; + switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ /* Hand to Launcher to emulate those pesky IN and OUT insns */ @@ -385,8 +395,15 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * whether kernel or userspace code. */ if (demand_page(cpu, cpu->arch.last_pagefault, - cpu->regs->errcode)) + cpu->regs->errcode, &iomem_addr)) + return; + + /* Was this an access to memory mapped IO? */ + if (iomem_addr) { + /* Tell Launcher, let it handle it. 
*/ + setup_iomem_insn(cpu, iomem_addr); return; + } /* * OK, it's really not there (or not OK): the Guest needs to diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 485fe13db12e..02f353989e6c 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -548,7 +548,8 @@ static void tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)guest_base, - guest_limit / getpagesize(), start }; + guest_limit / getpagesize(), start, + guest_limit / getpagesize() }; verbose("Guest: %p - %p (%#lx)\n", guest_base, guest_base + guest_limit, guest_limit); lguest_fd = open_or_die("/dev/lguest", O_RDWR); -- cgit v1.2.3-58-ga151 From e68ccd1f9d3d0fe8085b4e18c2cc2245f384c420 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Feb 2015 15:20:01 +1030 Subject: lguest: remove support for lguest bus. The demonstration launcher now uses PCI entirely. Signed-off-by: Rusty Russell --- drivers/lguest/Makefile | 3 - drivers/lguest/lguest_device.c | 540 ----------------------------------------- 2 files changed, 543 deletions(-) delete mode 100644 drivers/lguest/lguest_device.c (limited to 'drivers/lguest') diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile index c4197503900e..16f52ee73994 100644 --- a/drivers/lguest/Makefile +++ b/drivers/lguest/Makefile @@ -1,6 +1,3 @@ -# Guest requires the device configuration and probing code. -obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o - # Host requires the other files, which can be a module. obj-$(CONFIG_LGUEST) += lg.o lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c deleted file mode 100644 index 89088d6538fd..000000000000 --- a/drivers/lguest/lguest_device.c +++ /dev/null @@ -1,540 +0,0 @@ -/*P:050 - * Lguest guests use a very simple method to describe devices. It's a - * series of device descriptors contained just above the top of normal Guest - * memory. 
- * - * We use the standard "virtio" device infrastructure, which provides us with a - * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" or two to send and receive data. -:*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* The pointer to our (page) of device descriptions. */ -static void *lguest_devices; - -/* - * For Guests, device memory can be used as normal memory, so we cast away the - * __iomem to quieten sparse. - */ -static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) -{ - return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); -} - -static inline void lguest_unmap(void *addr) -{ - iounmap((__force void __iomem *)addr); -} - -/*D:100 - * Each lguest device is just a virtio device plus a pointer to its entry - * in the lguest_devices page. - */ -struct lguest_device { - struct virtio_device vdev; - - /* The entry in the lguest_devices page for this device. */ - struct lguest_device_desc *desc; -}; - -/* - * Since the virtio infrastructure hands us a pointer to the virtio_device all - * the time, it helps to have a curt macro to get a pointer to the struct - * lguest_device it's enclosed in. - */ -#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) - -/*D:130 - * Device configurations - * - * The configuration information for a device consists of one or more - * virtqueues, a feature bitmap, and some configuration bytes. The - * configuration bytes don't really matter to us: the Launcher sets them up, and - * the driver will look at them during setup. - * - * A convenient routine to return the device's virtqueue config array: - * immediately after the descriptor. - */ -static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) -{ - return (void *)(desc + 1); -} - -/* The features come immediately after the virtqueues. 
*/ -static u8 *lg_features(const struct lguest_device_desc *desc) -{ - return (void *)(lg_vq(desc) + desc->num_vq); -} - -/* The config space comes after the two feature bitmasks. */ -static u8 *lg_config(const struct lguest_device_desc *desc) -{ - return lg_features(desc) + desc->feature_len * 2; -} - -/* The total size of the config page used by this device (incl. desc) */ -static unsigned desc_size(const struct lguest_device_desc *desc) -{ - return sizeof(*desc) - + desc->num_vq * sizeof(struct lguest_vqconfig) - + desc->feature_len * 2 - + desc->config_len; -} - -/* This gets the device's feature bits. */ -static u64 lg_get_features(struct virtio_device *vdev) -{ - unsigned int i; - u32 features = 0; - struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - u8 *in_features = lg_features(desc); - - /* We do this the slow but generic way. */ - for (i = 0; i < min(desc->feature_len * 8, 32); i++) - if (in_features[i / 8] & (1 << (i % 8))) - features |= (1 << i); - - return features; -} - -/* - * To notify on reset or feature finalization, we (ab)use the NOTIFY - * hypercall, with the descriptor address of the device. - */ -static void status_notify(struct virtio_device *vdev) -{ - unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; - - hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); -} - -/* - * The virtio core takes the features the Host offers, and copies the ones - * supported by the driver into the vdev->features array. Once that's all - * sorted out, this routine is called so we can tell the Host which features we - * understand and accept. - */ -static int lg_finalize_features(struct virtio_device *vdev) -{ - unsigned int i, bits; - struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - /* Second half of bitmap is features we accept. */ - u8 *out_features = lg_features(desc) + desc->feature_len; - - /* Give virtio_ring a chance to accept features. 
*/ - vring_transport_features(vdev); - - /* Make sure we don't have any features > 32 bits! */ - BUG_ON((u32)vdev->features != vdev->features); - - /* - * Since lguest is currently x86-only, we're little-endian. That - * means we could just memcpy. But it's not time critical, and in - * case someone copies this code, we do it the slow, obvious way. - */ - memset(out_features, 0, desc->feature_len); - bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; - for (i = 0; i < bits; i++) { - if (__virtio_test_bit(vdev, i)) - out_features[i / 8] |= (1 << (i % 8)); - } - - /* Tell Host we've finished with this device's feature negotiation */ - status_notify(vdev); - - return 0; -} - -/* Once they've found a field, getting a copy of it is easy. */ -static void lg_get(struct virtio_device *vdev, unsigned int offset, - void *buf, unsigned len) -{ - struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - - /* Check they didn't ask for more than the length of the config! */ - BUG_ON(offset + len > desc->config_len); - memcpy(buf, lg_config(desc) + offset, len); -} - -/* Setting the contents is also trivial. */ -static void lg_set(struct virtio_device *vdev, unsigned int offset, - const void *buf, unsigned len) -{ - struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - - /* Check they didn't ask for more than the length of the config! */ - BUG_ON(offset + len > desc->config_len); - memcpy(lg_config(desc) + offset, buf, len); -} - -/* - * The operations to get and set the status word just access the status field - * of the device descriptor. - */ -static u8 lg_get_status(struct virtio_device *vdev) -{ - return to_lgdev(vdev)->desc->status; -} - -static void lg_set_status(struct virtio_device *vdev, u8 status) -{ - BUG_ON(!status); - to_lgdev(vdev)->desc->status = status; - - /* Tell Host immediately if we failed. 
*/ - if (status & VIRTIO_CONFIG_S_FAILED) - status_notify(vdev); -} - -static void lg_reset(struct virtio_device *vdev) -{ - /* 0 status means "reset" */ - to_lgdev(vdev)->desc->status = 0; - status_notify(vdev); -} - -/* - * Virtqueues - * - * The other piece of infrastructure virtio needs is a "virtqueue": a way of - * the Guest device registering buffers for the other side to read from or - * write into (ie. send and receive buffers). Each device can have multiple - * virtqueues: for example the console driver uses one queue for sending and - * another for receiving. - * - * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue - * already exists in virtio_ring.c. We just need to connect it up. - * - * We start with the information we need to keep about each virtqueue. - */ - -/*D:140 This is the information we remember about each virtqueue. */ -struct lguest_vq_info { - /* A copy of the information contained in the device config. */ - struct lguest_vqconfig config; - - /* The address where we mapped the virtio ring, so we can unmap it. */ - void *pages; -}; - -/* - * When the virtio_ring code wants to prod the Host, it calls us here and we - * make a hypercall. We hand the physical address of the virtqueue so the Host - * knows which virtqueue we're talking about. - */ -static bool lg_notify(struct virtqueue *vq) -{ - /* - * We store our virtqueue information in the "priv" pointer of the - * virtqueue structure. - */ - struct lguest_vq_info *lvq = vq->priv; - - hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0); - return true; -} - -/* An extern declaration inside a C file is bad form. Don't do it. */ -extern int lguest_setup_irq(unsigned int irq); - -/* - * This routine finds the Nth virtqueue described in the configuration of - * this device and sets it up. - * - * This is kind of an ugly duckling. 
It'd be nicer to have a standard - * representation of a virtqueue in the configuration space, but it seems that - * everyone wants to do it differently. The KVM coders want the Guest to - * allocate its own pages and tell the Host where they are, but for lguest it's - * simpler for the Host to simply tell us where the pages are. - */ -static struct virtqueue *lg_find_vq(struct virtio_device *vdev, - unsigned index, - void (*callback)(struct virtqueue *vq), - const char *name) -{ - struct lguest_device *ldev = to_lgdev(vdev); - struct lguest_vq_info *lvq; - struct virtqueue *vq; - int err; - - if (!name) - return NULL; - - /* We must have this many virtqueues. */ - if (index >= ldev->desc->num_vq) - return ERR_PTR(-ENOENT); - - lvq = kmalloc(sizeof(*lvq), GFP_KERNEL); - if (!lvq) - return ERR_PTR(-ENOMEM); - - /* - * Make a copy of the "struct lguest_vqconfig" entry, which sits after - * the descriptor. We need a copy because the config space might not - * be aligned correctly. - */ - memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); - - printk("Mapping virtqueue %i addr %lx\n", index, - (unsigned long)lvq->config.pfn << PAGE_SHIFT); - /* Figure out how many pages the ring will take, and map that memory */ - lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT, - DIV_ROUND_UP(vring_size(lvq->config.num, - LGUEST_VRING_ALIGN), - PAGE_SIZE)); - if (!lvq->pages) { - err = -ENOMEM; - goto free_lvq; - } - - /* - * OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. Note that we set weak_barriers - * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu - * barriers. - */ - vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev, - true, lvq->pages, lg_notify, callback, name); - if (!vq) { - err = -ENOMEM; - goto unmap; - } - - /* Make sure the interrupt is allocated. 
*/ - err = lguest_setup_irq(lvq->config.irq); - if (err) - goto destroy_vring; - - /* - * Tell the interrupt for this virtqueue to go to the virtio_ring - * interrupt handler. - * - * FIXME: We used to have a flag for the Host to tell us we could use - * the interrupt as a source of randomness: it'd be nice to have that - * back. - */ - err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, - dev_name(&vdev->dev), vq); - if (err) - goto free_desc; - - /* - * Last of all we hook up our 'struct lguest_vq_info" to the - * virtqueue's priv pointer. - */ - vq->priv = lvq; - return vq; - -free_desc: - irq_free_desc(lvq->config.irq); -destroy_vring: - vring_del_virtqueue(vq); -unmap: - lguest_unmap(lvq->pages); -free_lvq: - kfree(lvq); - return ERR_PTR(err); -} -/*:*/ - -/* Cleaning up a virtqueue is easy */ -static void lg_del_vq(struct virtqueue *vq) -{ - struct lguest_vq_info *lvq = vq->priv; - - /* Release the interrupt */ - free_irq(lvq->config.irq, vq); - /* Tell virtio_ring.c to free the virtqueue. */ - vring_del_virtqueue(vq); - /* Unmap the pages containing the ring. */ - lguest_unmap(lvq->pages); - /* Free our own queue information. */ - kfree(lvq); -} - -static void lg_del_vqs(struct virtio_device *vdev) -{ - struct virtqueue *vq, *n; - - list_for_each_entry_safe(vq, n, &vdev->vqs, list) - lg_del_vq(vq); -} - -static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]) -{ - struct lguest_device *ldev = to_lgdev(vdev); - int i; - - /* We must have this many virtqueues. */ - if (nvqs > ldev->desc->num_vq) - return -ENOENT; - - for (i = 0; i < nvqs; ++i) { - vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]); - if (IS_ERR(vqs[i])) - goto error; - } - return 0; - -error: - lg_del_vqs(vdev); - return PTR_ERR(vqs[i]); -} - -static const char *lg_bus_name(struct virtio_device *vdev) -{ - return ""; -} - -/* The ops structure which hooks everything together. 
*/ -static const struct virtio_config_ops lguest_config_ops = { - .get_features = lg_get_features, - .finalize_features = lg_finalize_features, - .get = lg_get, - .set = lg_set, - .get_status = lg_get_status, - .set_status = lg_set_status, - .reset = lg_reset, - .find_vqs = lg_find_vqs, - .del_vqs = lg_del_vqs, - .bus_name = lg_bus_name, -}; - -/* - * The root device for the lguest virtio devices. This makes them appear as - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. - */ -static struct device *lguest_root; - -/*D:120 - * This is the core of the lguest bus: actually adding a new device. - * It's a separate function because it's neater that way, and because an - * earlier version of the code supported hotplug and unplug. They were removed - * early on because they were never used. - * - * As Andrew Tridgell says, "Untested code is buggy code". - * - * It's worth reading this carefully: we start with a pointer to the new device - * descriptor in the "lguest_devices" page, and the offset into the device - * descriptor page so we can uniquely identify it if things go badly wrong. - */ -static void add_lguest_device(struct lguest_device_desc *d, - unsigned int offset) -{ - struct lguest_device *ldev; - - /* Start with zeroed memory; Linux's device layer counts on it. */ - ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); - if (!ldev) { - printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", - offset, d->type); - return; - } - - /* This devices' parent is the lguest/ dir. */ - ldev->vdev.dev.parent = lguest_root; - /* - * The device type comes straight from the descriptor. There's also a - * device vendor field in the virtio_device struct, which we leave as - * 0. - */ - ldev->vdev.id.device = d->type; - /* - * We have a simple set of routines for querying the device's - * configuration information and setting its status. - */ - ldev->vdev.config = &lguest_config_ops; - /* And we remember the device's descriptor for lguest_config_ops. 
*/ - ldev->desc = d; - - /* - * register_virtio_device() sets up the generic fields for the struct - * virtio_device and calls device_register(). This makes the bus - * infrastructure look for a matching driver. - */ - if (register_virtio_device(&ldev->vdev) != 0) { - printk(KERN_ERR "Failed to register lguest dev %u type %u\n", - offset, d->type); - kfree(ldev); - } -} - -/*D:110 - * scan_devices() simply iterates through the device page. The type 0 is - * reserved to mean "end of devices". - */ -static void scan_devices(void) -{ - unsigned int i; - struct lguest_device_desc *d; - - /* We start at the page beginning, and skip over each entry. */ - for (i = 0; i < PAGE_SIZE; i += desc_size(d)) { - d = lguest_devices + i; - - /* Once we hit a zero, stop. */ - if (d->type == 0) - break; - - printk("Device at %i has size %u\n", i, desc_size(d)); - add_lguest_device(d, i); - } -} - -/*D:105 - * Fairly early in boot, lguest_devices_init() is called to set up the - * lguest device infrastructure. We check that we are a Guest by checking - * pv_info.name: there are other ways of checking, but this seems most - * obvious to me. - * - * So we can access the "struct lguest_device_desc"s easily, we map that memory - * and store the pointer in the global "lguest_devices". Then we register a - * root device from which all our devices will hang (this seems to be the - * correct sysfs incantation). - * - * Finally we call scan_devices() which adds all the devices found in the - * lguest_devices page. - */ -static int __init lguest_devices_init(void) -{ - if (strcmp(pv_info.name, "lguest") != 0) - return 0; - - lguest_root = root_device_register("lguest"); - if (IS_ERR(lguest_root)) - panic("Could not register lguest root"); - - /* Devices are in a single page above top of "normal" mem */ - lguest_devices = lguest_map(max_pfn< Date: Wed, 11 Feb 2015 15:28:01 +1030 Subject: lguest: remove NOTIFY call and eventfd facility. 
Disappointing, as this was kind of neat (especially getting to use RCU to manage the address -> eventfd mapping). But now the devices are PCI handled in userspace, we get rid of both the NOTIFY hypercall and the interface to connect an eventfd. Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 1 - drivers/lguest/core.c | 20 +--- drivers/lguest/hypercalls.c | 4 - drivers/lguest/lg.h | 12 --- drivers/lguest/lguest_user.c | 186 +----------------------------------- include/linux/lguest_launcher.h | 2 +- 6 files changed, 10 insertions(+), 215 deletions(-) (limited to 'drivers/lguest') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 879fd7d33877..ef01fef3eebc 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -16,7 +16,6 @@ #define LHCALL_SET_PTE 14 #define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 -#define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 #define LHCALL_SEND_INTERRUPTS 19 diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 9159dbc583f6..7dc93aa004c8 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -225,22 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* - * It's possible the Guest did a NOTIFY hypercall to the - * Launcher. - */ + /* Do we have to tell the Launcher about a trap? */ if (cpu->pending.trap) { - /* - * Does it just needs to write to a registered - * eventfd (ie. the appropriate virtqueue thread)? - */ - if (!send_notify_to_eventfd(cpu)) { - /* OK, we tell the main Launcher. 
*/ - if (copy_to_user(user, &cpu->pending, - sizeof(cpu->pending))) - return -EFAULT; - return sizeof(cpu->pending); - } + if (copy_to_user(user, &cpu->pending, + sizeof(cpu->pending))) + return -EFAULT; + return sizeof(cpu->pending); } /* diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 5dd1fb8a6610..1219af493c0f 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -117,10 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* Similarly, this sets the halted flag for run_guest(). */ cpu->halted = 1; break; - case LHCALL_NOTIFY: - cpu->pending.trap = LGUEST_TRAP_ENTRY; - cpu->pending.addr = args->arg1; - break; default: /* It should be an architecture-specific hypercall. */ if (lguest_arch_do_hcall(cpu, args)) diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index eb81abc05995..307e8b39e7d1 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -81,16 +81,6 @@ struct lg_cpu { struct lg_cpu_arch arch; }; -struct lg_eventfd { - unsigned long addr; - struct eventfd_ctx *event; -}; - -struct lg_eventfd_map { - unsigned int num; - struct lg_eventfd map[]; -}; - /* The private info the thread maintains about the guest. */ struct lguest { struct lguest_data __user *lguest_data; @@ -117,8 +107,6 @@ struct lguest { unsigned int stack_pages; u32 tsc_khz; - struct lg_eventfd_map *eventfds; - /* Dead? */ const char *dead; }; diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index c8b0e8575b44..c4c6113eb9a6 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -2,182 +2,20 @@ * launcher controls and communicates with the Guest. For example, * the first write will tell us the Guest's memory layout and entry * point. A read will run the Guest until something happens, such as - * a signal or the Guest doing a NOTIFY out to the Launcher. 
There is - * also a way for the Launcher to attach eventfds to particular NOTIFY - * values instead of returning from the read() call. + * a signal or the Guest accessing a device. :*/ #include #include #include #include -#include #include #include #include #include "lg.h" -/*L:056 - * Before we move on, let's jump ahead and look at what the kernel does when - * it needs to look up the eventfds. That will complete our picture of how we - * use RCU. - * - * The notification value is in cpu->pending_notify: we return true if it went - * to an eventfd. - */ -bool send_notify_to_eventfd(struct lg_cpu *cpu) -{ - unsigned int i; - struct lg_eventfd_map *map; - - /* We only connect LHCALL_NOTIFY to event fds, not other traps. */ - if (cpu->pending.trap != LGUEST_TRAP_ENTRY) - return false; - - /* - * This "rcu_read_lock()" helps track when someone is still looking at - * the (RCU-using) eventfds array. It's not actually a lock at all; - * indeed it's a noop in many configurations. (You didn't expect me to - * explain all the RCU secrets here, did you?) - */ - rcu_read_lock(); - /* - * rcu_dereference is the counter-side of rcu_assign_pointer(); it - * makes sure we don't access the memory pointed to by - * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, - * but Alpha allows this! Paul McKenney points out that a really - * aggressive compiler could have the same effect: - * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html - * - * So play safe, use rcu_dereference to get the rcu-protected pointer: - */ - map = rcu_dereference(cpu->lg->eventfds); - /* - * Simple array search: even if they add an eventfd while we do this, - * we'll continue to use the old array and just won't see the new one. - */ - for (i = 0; i < map->num; i++) { - if (map->map[i].addr == cpu->pending.addr) { - eventfd_signal(map->map[i].event, 1); - cpu->pending.trap = 0; - break; - } - } - /* We're done with the rcu-protected variable cpu->lg->eventfds. 
*/ - rcu_read_unlock(); - - /* If we cleared the notification, it's because we found a match. */ - return cpu->pending.trap == 0; -} - -/*L:055 - * One of the more tricksy tricks in the Linux Kernel is a technique called - * Read Copy Update. Since one point of lguest is to teach lguest journeyers - * about kernel coding, I use it here. (In case you're curious, other purposes - * include learning about virtualization and instilling a deep appreciation for - * simplicity and puppies). - * - * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we - * add new eventfds without ever blocking readers from accessing the array. - * The current Launcher only does this during boot, so that never happens. But - * Read Copy Update is cool, and adding a lock risks damaging even more puppies - * than this code does. - * - * We allocate a brand new one-larger array, copy the old one and add our new - * element. Then we make the lg eventfd pointer point to the new array. - * That's the easy part: now we need to free the old one, but we need to make - * sure no slow CPU somewhere is still looking at it. That's what - * synchronize_rcu does for us: waits until every CPU has indicated that it has - * moved on to know it's no longer using the old one. - * - * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. - */ -static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) -{ - struct lg_eventfd_map *new, *old = lg->eventfds; - - /* - * We don't allow notifications on value 0 anyway (pending_notify of - * 0 means "nothing pending"). - */ - if (!addr) - return -EINVAL; - - /* - * Replace the old array with the new one, carefully: others can - * be accessing it at the same time. - */ - new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), - GFP_KERNEL); - if (!new) - return -ENOMEM; - - /* First make identical copy. 
*/ - memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); - new->num = old->num; - - /* Now append new entry. */ - new->map[new->num].addr = addr; - new->map[new->num].event = eventfd_ctx_fdget(fd); - if (IS_ERR(new->map[new->num].event)) { - int err = PTR_ERR(new->map[new->num].event); - kfree(new); - return err; - } - new->num++; - - /* - * Now put new one in place: rcu_assign_pointer() is a fancy way of - * doing "lg->eventfds = new", but it uses memory barriers to make - * absolutely sure that the contents of "new" written above is nailed - * down before we actually do the assignment. - * - * We have to think about these kinds of things when we're operating on - * live data without locks. - */ - rcu_assign_pointer(lg->eventfds, new); - - /* - * We're not in a big hurry. Wait until no one's looking at old - * version, then free it. - */ - synchronize_rcu(); - kfree(old); - - return 0; -} - /*L:052 - * Receiving notifications from the Guest is usually done by attaching a - * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will - * become readable when the Guest does an LHCALL_NOTIFY with that value. - * - * This is really convenient for processing each virtqueue in a separate - * thread. - */ -static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) -{ - unsigned long addr, fd; - int err; - - if (get_user(addr, input) != 0) - return -EFAULT; - input++; - if (get_user(fd, input) != 0) - return -EFAULT; - - /* - * Just make sure two callers don't add eventfds at once. We really - * only need to lock against callers adding to the same Guest, so using - * the Big Lguest Lock is overkill. But this is setup, not a fast path. - */ - mutex_lock(&lguest_lock); - err = add_eventfd(lg, addr, fd); - mutex_unlock(&lguest_lock); - - return err; -} - -/* The Launcher can get the registers, and also set some of them. */ + The Launcher can get the registers, and also set some of them. 
+*/ static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long which; @@ -409,13 +247,6 @@ static int initialize(struct file *file, const unsigned long __user *input) goto unlock; } - lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); - if (!lg->eventfds) { - err = -ENOMEM; - goto free_lg; - } - lg->eventfds->num = 0; - /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; @@ -424,7 +255,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* This is the first cpu (cpu 0) and it will start booting at args[2] */ err = lg_cpu_start(&lg->cpus[0], 0, args[2]); if (err) - goto free_eventfds; + goto free_lg; /* * Initialize the Guest's shadow page tables. This allocates @@ -445,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input) free_regs: /* FIXME: This should be in free_vcpu */ free_page(lg->cpus[0].regs_page); -free_eventfds: - kfree(lg->eventfds); free_lg: kfree(lg); unlock: @@ -499,8 +328,6 @@ static ssize_t write(struct file *file, const char __user *in, return initialize(file, input); case LHREQ_IRQ: return user_send_irq(cpu, input); - case LHREQ_EVENTFD: - return attach_eventfd(lg, input); case LHREQ_GETREG: return getreg_setup(cpu, input); case LHREQ_SETREG: @@ -551,11 +378,6 @@ static int close(struct inode *inode, struct file *file) mmput(lg->cpus[i].mm); } - /* Release any eventfds they registered. */ - for (i = 0; i < lg->eventfds->num; i++) - eventfd_ctx_put(lg->eventfds->map[i].event); - kfree(lg->eventfds); - /* * If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). 
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 677cde735d4b..acd5b12565cc 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -23,7 +23,7 @@ enum lguest_req LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* No longer used */ - LHREQ_EVENTFD, /* + address, fd. */ + LHREQ_EVENTFD, /* No longer used. */ LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ LHREQ_SETREG, /* + offset within struct pt_regs, value. */ LHREQ_TRAP, /* + trap number to deliver to guest. */ -- cgit v1.2.3-58-ga151