linux/drivers/xen/privcmd.c
/******************************************************************************
 * privcmd.c
 *
 * Interface to privileged domain-0 commands.
 *
 * Copyright (c) 2002-2004, K A Fraser, B Dragovic
 */

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/uaccess.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/miscdevice.h>

#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/privcmd.h>
#include <xen/interface/xen.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <xen/balloon.h>

#include "privcmd.h"

MODULE_LICENSE("GPL");

#define PRIV_VMA_LOCKED ((void *)1)

static int privcmd_vma_range_is_mapped(
               struct vm_area_struct *vma,
               unsigned long addr,
               unsigned long nr_pages);

static long privcmd_ioctl_hypercall(void __user *udata)
{
        struct privcmd_hypercall hypercall;
        long ret;

        if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
                return -EFAULT;

        xen_preemptible_hcall_begin();
        ret = privcmd_call(hypercall.op,
                           hypercall.arg[0], hypercall.arg[1],
                           hypercall.arg[2], hypercall.arg[3],
                           hypercall.arg[4]);
        xen_preemptible_hcall_end();

        return ret;
}
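
/*
 * Illustrative only (editor's note, not part of the driver): a
 * privileged userspace process typically drives IOCTL_PRIVCMD_HYPERCALL
 * roughly as sketched below, using the public Xen uapi headers.  The
 * device path and the particular hypercall shown are assumptions for
 * the example, not requirements of this file.
 *
 *	int fd = open("/dev/xen/privcmd", O_RDWR | O_CLOEXEC);
 *	struct privcmd_hypercall hc = {
 *		.op  = __HYPERVISOR_xen_version,
 *		.arg = { XENVER_version, 0, 0, 0, 0 },
 *	};
 *	long ver = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &hc);
 *	// on success, ver is (major << 16) | minor
 */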

static void free_page_list(struct list_head *pages)
{
        struct page *p, *n;

        list_for_each_entry_safe(p, n, pages, lru)
                __free_page(p);

        INIT_LIST_HEAD(pages);
}

/*
 * Given an array of items in userspace, return a list of pages
 * containing the data.  If copying fails, either because of memory
 * allocation failure or a problem reading user memory, return an
 * error code; it's up to the caller to dispose of any partial list.
 */
static int gather_array(struct list_head *pagelist,
                        unsigned nelem, size_t size,
                        const void __user *data)
{
        unsigned pageidx;
        void *pagedata;
        int ret;

        if (size > PAGE_SIZE)
                return 0;

        pageidx = PAGE_SIZE;
        pagedata = NULL;        /* quiet, gcc */
        while (nelem--) {
                if (pageidx > PAGE_SIZE-size) {
                        struct page *page = alloc_page(GFP_KERNEL);

                        ret = -ENOMEM;
                        if (page == NULL)
                                goto fail;

                        pagedata = page_address(page);

                        list_add_tail(&page->lru, pagelist);
                        pageidx = 0;
                }

                ret = -EFAULT;
                if (copy_from_user(pagedata + pageidx, data, size))
                        goto fail;

                data += size;
                pageidx += size;
        }

        ret = 0;

fail:
        return ret;
}

/*
 * Call function "fn" on each element of the array fragmented
 * over a list of pages.
 */
static int traverse_pages(unsigned nelem, size_t size,
                          struct list_head *pos,
                          int (*fn)(void *data, void *state),
                          void *state)
{
        void *pagedata;
        unsigned pageidx;
        int ret = 0;

        BUG_ON(size > PAGE_SIZE);

        pageidx = PAGE_SIZE;
        pagedata = NULL;        /* hush, gcc */

        while (nelem--) {
                if (pageidx > PAGE_SIZE-size) {
                        struct page *page;
                        pos = pos->next;
                        page = list_entry(pos, struct page, lru);
                        pagedata = page_address(page);
                        pageidx = 0;
                }

                ret = (*fn)(pagedata + pageidx, state);
                if (ret)
                        break;
                pageidx += size;
        }

        return ret;
}

/*
 * Similar to traverse_pages, but use each page as a "block" of
 * data to be processed as one unit.
 */
static int traverse_pages_block(unsigned nelem, size_t size,
                                struct list_head *pos,
                                int (*fn)(void *data, int nr, void *state),
                                void *state)
{
        void *pagedata;
        unsigned pageidx;
        int ret = 0;

        BUG_ON(size > PAGE_SIZE);

        pageidx = PAGE_SIZE;

        while (nelem) {
                int nr = (PAGE_SIZE/size);
                struct page *page;
                if (nr > nelem)
                        nr = nelem;
                pos = pos->next;
                page = list_entry(pos, struct page, lru);
                pagedata = page_address(page);
                ret = (*fn)(pagedata, nr, state);
                if (ret)
                        break;
                nelem -= nr;
        }

        return ret;
}

struct mmap_mfn_state {
        unsigned long va;
        struct vm_area_struct *vma;
        domid_t domain;
};

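/*
 * Map the foreign frames described by one struct privcmd_mmap_entry
 * into st->vma.  Entries must cover the VMA in order: each chunk has
 * to start where the previous one left off and stay within vm_end.
 */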
static int mmap_mfn_range(void *data, void *state)
{
        struct privcmd_mmap_entry *msg = data;
        struct mmap_mfn_state *st = state;
        struct vm_area_struct *vma = st->vma;
        int rc;

        /* Do not allow range to wrap the address space. */
        if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
            ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
                return -EINVAL;

        /* Range chunks must be contiguous in va space. */
        if ((msg->va != st->va) ||
            ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
                return -EINVAL;

        rc = xen_remap_domain_mfn_range(vma,
                                        msg->va & PAGE_MASK,
                                        msg->mfn, msg->npages,
                                        vma->vm_page_prot,
                                        st->domain, NULL);
        if (rc < 0)
                return rc;

        st->va += msg->npages << PAGE_SHIFT;

        return 0;
}

static long privcmd_ioctl_mmap(void __user *udata)
{
        struct privcmd_mmap mmapcmd;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        int rc;
        LIST_HEAD(pagelist);
        struct mmap_mfn_state state;

        /* We only support privcmd_ioctl_mmap_batch for auto translated. */
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return -ENOSYS;

        if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
                return -EFAULT;

        rc = gather_array(&pagelist,
                          mmapcmd.num, sizeof(struct privcmd_mmap_entry),
                          mmapcmd.entry);

        if (rc || list_empty(&pagelist))
                goto out;

        down_write(&mm->mmap_sem);

        {
                struct page *page = list_first_entry(&pagelist,
                                                     struct page, lru);
                struct privcmd_mmap_entry *msg = page_address(page);

                vma = find_vma(mm, msg->va);
                rc = -EINVAL;

                if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
                        goto out_up;
                vma->vm_private_data = PRIV_VMA_LOCKED;
        }

        state.va = vma->vm_start;
        state.vma = vma;
        state.domain = mmapcmd.dom;

        rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
                            &pagelist,
                            mmap_mfn_range, &state);


out_up:
        up_write(&mm->mmap_sem);

out:
        free_page_list(&pagelist);

        return rc;
}

struct mmap_batch_state {
        domid_t domain;
        unsigned long va;
        struct vm_area_struct *vma;
        int index;
        /* A tristate:
         *      0 for no errors
         *      1 if at least one error has happened (and no
         *          -ENOENT errors have happened)
         *      -ENOENT if at least 1 -ENOENT has happened.
         */
        int global_error;
        int version;

        /* User-space mfn array to store errors in the second pass for V1. */
        xen_pfn_t __user *user_mfn;
        /* User-space int array to store errors in the second pass for V2. */
        int __user *user_err;
};

/* Auto-translated dom0 note: if the domU being created is PV, then mfn
 * is a real mfn (an address on the bus).  If it is auto-translated,
 * then mfn is a pfn (the input to HAP).
 */
static int mmap_batch_fn(void *data, int nr, void *state)
{
        xen_pfn_t *mfnp = data;
        struct mmap_batch_state *st = state;
        struct vm_area_struct *vma = st->vma;
        struct page **pages = vma->vm_private_data;
        struct page **cur_pages = NULL;
        int ret;

        if (xen_feature(XENFEAT_auto_translated_physmap))
                cur_pages = &pages[st->index];

        BUG_ON(nr < 0);
        ret = xen_remap_domain_mfn_array(st->vma, st->va & PAGE_MASK, mfnp, nr,
                                         (int *)mfnp, st->vma->vm_page_prot,
                                         st->domain, cur_pages);

        /* Adjust the global_error? */
        if (ret != nr) {
                if (ret == -ENOENT)
                        st->global_error = -ENOENT;
                else {
                        /* Record that at least one error has happened. */
                        if (st->global_error == 0)
                                st->global_error = 1;
                }
        }
        st->va += PAGE_SIZE * nr;
        st->index += nr;

        return 0;
}

static int mmap_return_error(int err, struct mmap_batch_state *st)
{
        int ret;

        if (st->version == 1) {
                if (err) {
                        xen_pfn_t mfn;

                        ret = get_user(mfn, st->user_mfn);
                        if (ret < 0)
                                return ret;
                        /*
                         * V1 encodes the error code in the top nibble
                         * of the 32-bit mfn (with its known
                         * limitations vis-a-vis 64-bit callers).
                         */
                        mfn |= (err == -ENOENT) ?
                                PRIVCMD_MMAPBATCH_PAGED_ERROR :
                                PRIVCMD_MMAPBATCH_MFN_ERROR;
                        return __put_user(mfn, st->user_mfn++);
                } else
                        st->user_mfn++;
        } else { /* st->version == 2 */
                if (err)
                        return __put_user(err, st->user_err++);
                else
                        st->user_err++;
        }

        return 0;
}

static int mmap_return_errors(void *data, int nr, void *state)
{
        struct mmap_batch_state *st = state;
        int *errs = data;
        int i;
        int ret;

        for (i = 0; i < nr; i++) {
                ret = mmap_return_error(errs[i], st);
                if (ret < 0)
                        return ret;
        }
        return 0;
}

/* Allocate pfns that are then mapped with gmfns from foreign domid. Update
 * the vma with the page info to use later.
 * Returns: 0 if success, otherwise -errno
 */
static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
{
        int rc;
        struct page **pages;

        pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
        if (pages == NULL)
                return -ENOMEM;

        rc = alloc_xenballooned_pages(numpgs, pages, 0);
        if (rc != 0) {
                pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
                        numpgs, rc);
                kfree(pages);
                return -ENOMEM;
        }
        BUG_ON(vma->vm_private_data != NULL);
        vma->vm_private_data = pages;

        return 0;
}

static struct vm_operations_struct privcmd_vm_ops;

static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
{
        int ret;
        struct privcmd_mmapbatch_v2 m;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long nr_pages;
        LIST_HEAD(pagelist);
        struct mmap_batch_state state;

        switch (version) {
        case 1:
                if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
                        return -EFAULT;
                /* Returns per-frame error in m.arr. */
                m.err = NULL;
                if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr)))
                        return -EFAULT;
                break;
        case 2:
                if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
                        return -EFAULT;
                /* Returns per-frame error code in m.err. */
                if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err))))
                        return -EFAULT;
                break;
        default:
                return -EINVAL;
        }

        nr_pages = m.num;
        if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
                return -EINVAL;

        ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);

        if (ret)
                goto out;
        if (list_empty(&pagelist)) {
                ret = -EINVAL;
                goto out;
        }

        if (version == 2) {
                /* Zero error array now to only copy back actual errors. */
                if (clear_user(m.err, sizeof(int) * m.num)) {
                        ret = -EFAULT;
                        goto out;
                }
        }

        down_write(&mm->mmap_sem);

        vma = find_vma(mm, m.addr);
        if (!vma ||
            vma->vm_ops != &privcmd_vm_ops) {
                ret = -EINVAL;
                goto out_unlock;
        }

        /*
         * Caller must either:
         *
         * Map the whole VMA range, which will also allocate all the
         * pages required for the auto_translated_physmap case.
         *
         * Or
         *
         * Map unmapped holes left from a previous map attempt (e.g.,
         * because those foreign frames were previously paged out).
         */
        if (vma->vm_private_data == NULL) {
                if (m.addr != vma->vm_start ||
                    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
                        ret = -EINVAL;
                        goto out_unlock;
                }
                if (xen_feature(XENFEAT_auto_translated_physmap)) {
                        ret = alloc_empty_pages(vma, m.num);
                        if (ret < 0)
                                goto out_unlock;
                } else
                        vma->vm_private_data = PRIV_VMA_LOCKED;
        } else {
                if (m.addr < vma->vm_start ||
                    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
                        ret = -EINVAL;
                        goto out_unlock;
                }
                if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
                        ret = -EINVAL;
                        goto out_unlock;
                }
        }

        state.domain        = m.dom;
        state.vma           = vma;
        state.va            = m.addr;
        state.index         = 0;
        state.global_error  = 0;
        state.version       = version;

        /* mmap_batch_fn guarantees ret == 0 */
        BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
                                    &pagelist, mmap_batch_fn, &state));

        up_write(&mm->mmap_sem);

        if (state.global_error) {
                /* Write back errors in second pass. */
                state.user_mfn = (xen_pfn_t *)m.arr;
                state.user_err = m.err;
                ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
                                           &pagelist, mmap_return_errors, &state);
        } else
                ret = 0;

        /* If we have not had any EFAULT-like global errors then set the global
         * error to -ENOENT if necessary. */
        if ((ret == 0) && (state.global_error == -ENOENT))
                ret = -ENOENT;

out:
        free_page_list(&pagelist);
        return ret;

out_unlock:
        up_write(&mm->mmap_sem);
        goto out;
}
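
/*
 * Illustrative only (editor's note, not part of the driver): a
 * non-authoritative sketch of the V2 flow as seen from userspace.  NR
 * and domid are placeholders; a toolstack typically mmap()s a range of
 * this device and then asks for foreign frames to be mapped into it:
 *
 *	xen_pfn_t gfns[NR];		// frames of domain 'domid' to map
 *	int errs[NR];
 *	void *addr = mmap(NULL, NR * PAGE_SIZE, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *	struct privcmd_mmapbatch_v2 req = {
 *		.num  = NR,
 *		.dom  = domid,
 *		.addr = (unsigned long)addr,
 *		.arr  = gfns,
 *		.err  = errs,
 *	};
 *	rc = ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH_V2, &req);
 *
 * rc == -ENOENT with per-frame -ENOENT entries in errs[] means some
 * frames were paged out; the caller may retry just those entries.
 */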

static long privcmd_ioctl(struct file *file,
                          unsigned int cmd, unsigned long data)
{
        int ret = -ENOSYS;
        void __user *udata = (void __user *) data;

        switch (cmd) {
        case IOCTL_PRIVCMD_HYPERCALL:
                ret = privcmd_ioctl_hypercall(udata);
                break;

        case IOCTL_PRIVCMD_MMAP:
                ret = privcmd_ioctl_mmap(udata);
                break;

        case IOCTL_PRIVCMD_MMAPBATCH:
                ret = privcmd_ioctl_mmap_batch(udata, 1);
                break;

        case IOCTL_PRIVCMD_MMAPBATCH_V2:
                ret = privcmd_ioctl_mmap_batch(udata, 2);
                break;

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

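/*
 * VMA teardown.  On auto-translated guests the foreign mappings are
 * backed by ballooned-out local pages (see alloc_empty_pages()); unmap
 * the foreign frames and give those pages back to the balloon here.
 */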
static void privcmd_close(struct vm_area_struct *vma)
{
        struct page **pages = vma->vm_private_data;
        int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
        int rc;

        if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
                return;

        rc = xen_unmap_domain_mfn_range(vma, numpgs, pages);
        if (rc == 0)
                free_xenballooned_pages(numpgs, pages);
        else
                pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
                        numpgs, rc);
        kfree(pages);
}

static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
               vma, vma->vm_start, vma->vm_end,
               vmf->pgoff, vmf->virtual_address);

        return VM_FAULT_SIGBUS;
}

static struct vm_operations_struct privcmd_vm_ops = {
        .close = privcmd_close,
        .fault = privcmd_fault
};

static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* DONTCOPY is essential for Xen because copy_page_range doesn't know
         * how to recreate these mappings */
        vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY |
                         VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &privcmd_vm_ops;
        vma->vm_private_data = NULL;

        return 0;
}

/*
 * For MMAPBATCH*.  This allows asserting the singleshot mapping
 * on a per-pfn/pte basis.  Mapping calls that fail with ENOENT
 * can then be retried until they succeed.
 */
static int is_mapped_fn(pte_t *pte, struct page *pmd_page,
                        unsigned long addr, void *data)
{
        return pte_none(*pte) ? 0 : -EBUSY;
}

static int privcmd_vma_range_is_mapped(
                   struct vm_area_struct *vma,
                   unsigned long addr,
                   unsigned long nr_pages)
{
        return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
                                   is_mapped_fn, NULL) != 0;
}

const struct file_operations xen_privcmd_fops = {
        .owner = THIS_MODULE,
        .unlocked_ioctl = privcmd_ioctl,
        .mmap = privcmd_mmap,
};
EXPORT_SYMBOL_GPL(xen_privcmd_fops);

static struct miscdevice privcmd_dev = {
        .minor = MISC_DYNAMIC_MINOR,
        .name = "xen/privcmd",
        .fops = &xen_privcmd_fops,
};

static int __init privcmd_init(void)
{
        int err;

        if (!xen_domain())
                return -ENODEV;

        err = misc_register(&privcmd_dev);
        if (err != 0) {
                pr_err("Could not register Xen privcmd device\n");
                return err;
        }
        return 0;
}

static void __exit privcmd_exit(void)
{
        misc_deregister(&privcmd_dev);
}

module_init(privcmd_init);
module_exit(privcmd_exit);