linux/drivers/lguest/lguest_user.c
<<
>>
Prefs
   1/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
   2 * controls and communicates with the Guest.  For example, the first write will
   3 * tell us the Guest's memory layout and entry point.  A read will run the
   4 * Guest until something happens, such as a signal or the Guest doing a NOTIFY
   5 * out to the Launcher.
   6:*/
   7#include <linux/uaccess.h>
   8#include <linux/miscdevice.h>
   9#include <linux/fs.h>
  10#include <linux/sched.h>
  11#include <linux/eventfd.h>
  12#include <linux/file.h>
  13#include "lg.h"
  14
  15/*L:056
  16 * Before we move on, let's jump ahead and look at what the kernel does when
  17 * it needs to look up the eventfds.  That will complete our picture of how we
  18 * use RCU.
  19 *
  20 * The notification value is in cpu->pending_notify: we return true if it went
  21 * to an eventfd.
  22 */
  23bool send_notify_to_eventfd(struct lg_cpu *cpu)
  24{
  25        unsigned int i;
  26        struct lg_eventfd_map *map;
  27
  28        /*
  29         * This "rcu_read_lock()" helps track when someone is still looking at
  30         * the (RCU-using) eventfds array.  It's not actually a lock at all;
  31         * indeed it's a noop in many configurations.  (You didn't expect me to
  32         * explain all the RCU secrets here, did you?)
  33         */
  34        rcu_read_lock();
  35        /*
  36         * rcu_dereference is the counter-side of rcu_assign_pointer(); it
  37         * makes sure we don't access the memory pointed to by
  38         * cpu->lg->eventfds before cpu->lg->eventfds is set.  Sounds crazy,
  39         * but Alpha allows this!  Paul McKenney points out that a really
  40         * aggressive compiler could have the same effect:
  41         *   http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
  42         *
  43         * So play safe, use rcu_dereference to get the rcu-protected pointer:
  44         */
  45        map = rcu_dereference(cpu->lg->eventfds);
  46        /*
  47         * Simple array search: even if they add an eventfd while we do this,
  48         * we'll continue to use the old array and just won't see the new one.
  49         */
  50        for (i = 0; i < map->num; i++) {
  51                if (map->map[i].addr == cpu->pending_notify) {
  52                        eventfd_signal(map->map[i].event, 1);
  53                        cpu->pending_notify = 0;
  54                        break;
  55                }
  56        }
  57        /* We're done with the rcu-protected variable cpu->lg->eventfds. */
  58        rcu_read_unlock();
  59
  60        /* If we cleared the notification, it's because we found a match. */
  61        return cpu->pending_notify == 0;
  62}
  63
  64/*L:055
  65 * One of the more tricksy tricks in the Linux Kernel is a technique called
  66 * Read Copy Update.  Since one point of lguest is to teach lguest journeyers
  67 * about kernel coding, I use it here.  (In case you're curious, other purposes
  68 * include learning about virtualization and instilling a deep appreciation for
  69 * simplicity and puppies).
  70 *
  71 * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
  72 * add new eventfds without ever blocking readers from accessing the array.
  73 * The current Launcher only does this during boot, so that never happens.  But
  74 * Read Copy Update is cool, and adding a lock risks damaging even more puppies
  75 * than this code does.
  76 *
  77 * We allocate a brand new one-larger array, copy the old one and add our new
  78 * element.  Then we make the lg eventfd pointer point to the new array.
  79 * That's the easy part: now we need to free the old one, but we need to make
  80 * sure no slow CPU somewhere is still looking at it.  That's what
  81 * synchronize_rcu does for us: waits until every CPU has indicated that it has
  82 * moved on to know it's no longer using the old one.
  83 *
  84 * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
  85 */
  86static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
  87{
  88        struct lg_eventfd_map *new, *old = lg->eventfds;
  89
  90        /*
  91         * We don't allow notifications on value 0 anyway (pending_notify of
  92         * 0 means "nothing pending").
  93         */
  94        if (!addr)
  95                return -EINVAL;
  96
  97        /*
  98         * Replace the old array with the new one, carefully: others can
  99         * be accessing it at the same time.
 100         */
 101        new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
 102                      GFP_KERNEL);
 103        if (!new)
 104                return -ENOMEM;
 105
 106        /* First make identical copy. */
 107        memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
 108        new->num = old->num;
 109
 110        /* Now append new entry. */
 111        new->map[new->num].addr = addr;
 112        new->map[new->num].event = eventfd_ctx_fdget(fd);
 113        if (IS_ERR(new->map[new->num].event)) {
 114                int err =  PTR_ERR(new->map[new->num].event);
 115                kfree(new);
 116                return err;
 117        }
 118        new->num++;
 119
 120        /*
 121         * Now put new one in place: rcu_assign_pointer() is a fancy way of
 122         * doing "lg->eventfds = new", but it uses memory barriers to make
 123         * absolutely sure that the contents of "new" written above is nailed
 124         * down before we actually do the assignment.
 125         *
 126         * We have to think about these kinds of things when we're operating on
 127         * live data without locks.
 128         */
 129        rcu_assign_pointer(lg->eventfds, new);
 130
 131        /*
 132         * We're not in a big hurry.  Wait until noone's looking at old
 133         * version, then free it.
 134         */
 135        synchronize_rcu();
 136        kfree(old);
 137
 138        return 0;
 139}
 140
 141/*L:052
 142 * Receiving notifications from the Guest is usually done by attaching a
 143 * particular LHCALL_NOTIFY value to an event filedescriptor.  The eventfd will
 144 * become readable when the Guest does an LHCALL_NOTIFY with that value.
 145 *
 146 * This is really convenient for processing each virtqueue in a separate
 147 * thread.
 148 */
 149static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
 150{
 151        unsigned long addr, fd;
 152        int err;
 153
 154        if (get_user(addr, input) != 0)
 155                return -EFAULT;
 156        input++;
 157        if (get_user(fd, input) != 0)
 158                return -EFAULT;
 159
 160        /*
 161         * Just make sure two callers don't add eventfds at once.  We really
 162         * only need to lock against callers adding to the same Guest, so using
 163         * the Big Lguest Lock is overkill.  But this is setup, not a fast path.
 164         */
 165        mutex_lock(&lguest_lock);
 166        err = add_eventfd(lg, addr, fd);
 167        mutex_unlock(&lguest_lock);
 168
 169        return err;
 170}
 171
 172/*L:050
 173 * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
 174 * number to /dev/lguest.
 175 */
 176static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 177{
 178        unsigned long irq;
 179
 180        if (get_user(irq, input) != 0)
 181                return -EFAULT;
 182        if (irq >= LGUEST_IRQS)
 183                return -EINVAL;
 184
 185        /*
 186         * Next time the Guest runs, the core code will see if it can deliver
 187         * this interrupt.
 188         */
 189        set_interrupt(cpu, irq);
 190        return 0;
 191}
 192
 193/*L:040
 194 * Once our Guest is initialized, the Launcher makes it run by reading
 195 * from /dev/lguest.
 196 */
 197static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
 198{
 199        struct lguest *lg = file->private_data;
 200        struct lg_cpu *cpu;
 201        unsigned int cpu_id = *o;
 202
 203        /* You must write LHREQ_INITIALIZE first! */
 204        if (!lg)
 205                return -EINVAL;
 206
 207        /* Watch out for arbitrary vcpu indexes! */
 208        if (cpu_id >= lg->nr_cpus)
 209                return -EINVAL;
 210
 211        cpu = &lg->cpus[cpu_id];
 212
 213        /* If you're not the task which owns the Guest, go away. */
 214        if (current != cpu->tsk)
 215                return -EPERM;
 216
 217        /* If the Guest is already dead, we indicate why */
 218        if (lg->dead) {
 219                size_t len;
 220
 221                /* lg->dead either contains an error code, or a string. */
 222                if (IS_ERR(lg->dead))
 223                        return PTR_ERR(lg->dead);
 224
 225                /* We can only return as much as the buffer they read with. */
 226                len = min(size, strlen(lg->dead)+1);
 227                if (copy_to_user(user, lg->dead, len) != 0)
 228                        return -EFAULT;
 229                return len;
 230        }
 231
 232        /*
 233         * If we returned from read() last time because the Guest sent I/O,
 234         * clear the flag.
 235         */
 236        if (cpu->pending_notify)
 237                cpu->pending_notify = 0;
 238
 239        /* Run the Guest until something interesting happens. */
 240        return run_guest(cpu, (unsigned long __user *)user);
 241}
 242
 243/*L:025
 244 * This actually initializes a CPU.  For the moment, a Guest is only
 245 * uniprocessor, so "id" is always 0.
 246 */
 247static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 248{
 249        /* We have a limited number the number of CPUs in the lguest struct. */
 250        if (id >= ARRAY_SIZE(cpu->lg->cpus))
 251                return -EINVAL;
 252
 253        /* Set up this CPU's id, and pointer back to the lguest struct. */
 254        cpu->id = id;
 255        cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
 256        cpu->lg->nr_cpus++;
 257
 258        /* Each CPU has a timer it can set. */
 259        init_clockdev(cpu);
 260
 261        /*
 262         * We need a complete page for the Guest registers: they are accessible
 263         * to the Guest and we can only grant it access to whole pages.
 264         */
 265        cpu->regs_page = get_zeroed_page(GFP_KERNEL);
 266        if (!cpu->regs_page)
 267                return -ENOMEM;
 268
 269        /* We actually put the registers at the bottom of the page. */
 270        cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
 271
 272        /*
 273         * Now we initialize the Guest's registers, handing it the start
 274         * address.
 275         */
 276        lguest_arch_setup_regs(cpu, start_ip);
 277
 278        /*
 279         * We keep a pointer to the Launcher task (ie. current task) for when
 280         * other Guests want to wake this one (eg. console input).
 281         */
 282        cpu->tsk = current;
 283
 284        /*
 285         * We need to keep a pointer to the Launcher's memory map, because if
 286         * the Launcher dies we need to clean it up.  If we don't keep a
 287         * reference, it is destroyed before close() is called.
 288         */
 289        cpu->mm = get_task_mm(cpu->tsk);
 290
 291        /*
 292         * We remember which CPU's pages this Guest used last, for optimization
 293         * when the same Guest runs on the same CPU twice.
 294         */
 295        cpu->last_pages = NULL;
 296
 297        /* No error == success. */
 298        return 0;
 299}
 300
 301/*L:020
 302 * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in
 303 * addition to the LHREQ_INITIALIZE value).  These are:
 304 *
 305 * base: The start of the Guest-physical memory inside the Launcher memory.
 306 *
 307 * pfnlimit: The highest (Guest-physical) page number the Guest should be
 308 * allowed to access.  The Guest memory lives inside the Launcher, so it sets
 309 * this to ensure the Guest can only reach its own memory.
 310 *
 311 * start: The first instruction to execute ("eip" in x86-speak).
 312 */
 313static int initialize(struct file *file, const unsigned long __user *input)
 314{
 315        /* "struct lguest" contains all we (the Host) know about a Guest. */
 316        struct lguest *lg;
 317        int err;
 318        unsigned long args[3];
 319
 320        /*
 321         * We grab the Big Lguest lock, which protects against multiple
 322         * simultaneous initializations.
 323         */
 324        mutex_lock(&lguest_lock);
 325        /* You can't initialize twice!  Close the device and start again... */
 326        if (file->private_data) {
 327                err = -EBUSY;
 328                goto unlock;
 329        }
 330
 331        if (copy_from_user(args, input, sizeof(args)) != 0) {
 332                err = -EFAULT;
 333                goto unlock;
 334        }
 335
 336        lg = kzalloc(sizeof(*lg), GFP_KERNEL);
 337        if (!lg) {
 338                err = -ENOMEM;
 339                goto unlock;
 340        }
 341
 342        lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
 343        if (!lg->eventfds) {
 344                err = -ENOMEM;
 345                goto free_lg;
 346        }
 347        lg->eventfds->num = 0;
 348
 349        /* Populate the easy fields of our "struct lguest" */
 350        lg->mem_base = (void __user *)args[0];
 351        lg->pfn_limit = args[1];
 352
 353        /* This is the first cpu (cpu 0) and it will start booting at args[2] */
 354        err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
 355        if (err)
 356                goto free_eventfds;
 357
 358        /*
 359         * Initialize the Guest's shadow page tables, using the toplevel
 360         * address the Launcher gave us.  This allocates memory, so can fail.
 361         */
 362        err = init_guest_pagetable(lg);
 363        if (err)
 364                goto free_regs;
 365
 366        /* We keep our "struct lguest" in the file's private_data. */
 367        file->private_data = lg;
 368
 369        mutex_unlock(&lguest_lock);
 370
 371        /* And because this is a write() call, we return the length used. */
 372        return sizeof(args);
 373
 374free_regs:
 375        /* FIXME: This should be in free_vcpu */
 376        free_page(lg->cpus[0].regs_page);
 377free_eventfds:
 378        kfree(lg->eventfds);
 379free_lg:
 380        kfree(lg);
 381unlock:
 382        mutex_unlock(&lguest_lock);
 383        return err;
 384}
 385
 386/*L:010
 387 * The first operation the Launcher does must be a write.  All writes
 388 * start with an unsigned long number: for the first write this must be
 389 * LHREQ_INITIALIZE to set up the Guest.  After that the Launcher can use
 390 * writes of other values to send interrupts or set up receipt of notifications.
 391 *
 392 * Note that we overload the "offset" in the /dev/lguest file to indicate what
 393 * CPU number we're dealing with.  Currently this is always 0 since we only
 394 * support uniprocessor Guests, but you can see the beginnings of SMP support
 395 * here.
 396 */
 397static ssize_t write(struct file *file, const char __user *in,
 398                     size_t size, loff_t *off)
 399{
 400        /*
 401         * Once the Guest is initialized, we hold the "struct lguest" in the
 402         * file private data.
 403         */
 404        struct lguest *lg = file->private_data;
 405        const unsigned long __user *input = (const unsigned long __user *)in;
 406        unsigned long req;
 407        struct lg_cpu *uninitialized_var(cpu);
 408        unsigned int cpu_id = *off;
 409
 410        /* The first value tells us what this request is. */
 411        if (get_user(req, input) != 0)
 412                return -EFAULT;
 413        input++;
 414
 415        /* If you haven't initialized, you must do that first. */
 416        if (req != LHREQ_INITIALIZE) {
 417                if (!lg || (cpu_id >= lg->nr_cpus))
 418                        return -EINVAL;
 419                cpu = &lg->cpus[cpu_id];
 420
 421                /* Once the Guest is dead, you can only read() why it died. */
 422                if (lg->dead)
 423                        return -ENOENT;
 424        }
 425
 426        switch (req) {
 427        case LHREQ_INITIALIZE:
 428                return initialize(file, input);
 429        case LHREQ_IRQ:
 430                return user_send_irq(cpu, input);
 431        case LHREQ_EVENTFD:
 432                return attach_eventfd(lg, input);
 433        default:
 434                return -EINVAL;
 435        }
 436}
 437
 438/*L:060
 439 * The final piece of interface code is the close() routine.  It reverses
 440 * everything done in initialize().  This is usually called because the
 441 * Launcher exited.
 442 *
 443 * Note that the close routine returns 0 or a negative error number: it can't
 444 * really fail, but it can whine.  I blame Sun for this wart, and K&R C for
 445 * letting them do it.
 446:*/
 447static int close(struct inode *inode, struct file *file)
 448{
 449        struct lguest *lg = file->private_data;
 450        unsigned int i;
 451
 452        /* If we never successfully initialized, there's nothing to clean up */
 453        if (!lg)
 454                return 0;
 455
 456        /*
 457         * We need the big lock, to protect from inter-guest I/O and other
 458         * Launchers initializing guests.
 459         */
 460        mutex_lock(&lguest_lock);
 461
 462        /* Free up the shadow page tables for the Guest. */
 463        free_guest_pagetable(lg);
 464
 465        for (i = 0; i < lg->nr_cpus; i++) {
 466                /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
 467                hrtimer_cancel(&lg->cpus[i].hrt);
 468                /* We can free up the register page we allocated. */
 469                free_page(lg->cpus[i].regs_page);
 470                /*
 471                 * Now all the memory cleanups are done, it's safe to release
 472                 * the Launcher's memory management structure.
 473                 */
 474                mmput(lg->cpus[i].mm);
 475        }
 476
 477        /* Release any eventfds they registered. */
 478        for (i = 0; i < lg->eventfds->num; i++)
 479                eventfd_ctx_put(lg->eventfds->map[i].event);
 480        kfree(lg->eventfds);
 481
 482        /*
 483         * If lg->dead doesn't contain an error code it will be NULL or a
 484         * kmalloc()ed string, either of which is ok to hand to kfree().
 485         */
 486        if (!IS_ERR(lg->dead))
 487                kfree(lg->dead);
 488        /* Free the memory allocated to the lguest_struct */
 489        kfree(lg);
 490        /* Release lock and exit. */
 491        mutex_unlock(&lguest_lock);
 492
 493        return 0;
 494}
 495
 496/*L:000
 497 * Welcome to our journey through the Launcher!
 498 *
 499 * The Launcher is the Host userspace program which sets up, runs and services
 500 * the Guest.  In fact, many comments in the Drivers which refer to "the Host"
 501 * doing things are inaccurate: the Launcher does all the device handling for
 502 * the Guest, but the Guest can't know that.
 503 *
 504 * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
 505 * shall see more of that later.
 506 *
 507 * We begin our understanding with the Host kernel interface which the Launcher
 508 * uses: reading and writing a character device called /dev/lguest.  All the
 509 * work happens in the read(), write() and close() routines:
 510 */
 511static const struct file_operations lguest_fops = {
 512        .owner   = THIS_MODULE,
 513        .release = close,
 514        .write   = write,
 515        .read    = read,
 516};
 517
 518/*
 519 * This is a textbook example of a "misc" character device.  Populate a "struct
 520 * miscdevice" and register it with misc_register().
 521 */
 522static struct miscdevice lguest_dev = {
 523        .minor  = MISC_DYNAMIC_MINOR,
 524        .name   = "lguest",
 525        .fops   = &lguest_fops,
 526};
 527
 528int __init lguest_device_init(void)
 529{
 530        return misc_register(&lguest_dev);
 531}
 532
 533void __exit lguest_device_remove(void)
 534{
 535        misc_deregister(&lguest_dev);
 536}
 537