linux/drivers/gpu/drm/i915/gvt/kvmgt.c
/*
 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
 *
 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Kevin Tian <kevin.tian@intel.com>
 *    Jike Song <jike.song@intel.com>
 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
 */

#include <linux/init.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/uuid.h>
#include <linux/kvm_host.h>
#include <linux/vfio.h>
#include <linux/mdev.h>
#include <linux/debugfs.h>

#include "i915_drv.h"
#include "gvt.h"

static const struct intel_gvt_ops *intel_gvt_ops;

/* helper macros copied from vfio-pci */
#define VFIO_PCI_OFFSET_SHIFT   40
#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
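
/*
 * Like vfio-pci, kvmgt packs a region index and a region-relative offset
 * into the single file offset seen by read/write/mmap: the index lives in
 * bits 40 and up, the offset in the bits below.  A worked example (plain
 * arithmetic, not code from this file):
 *
 *   VFIO_PCI_INDEX_TO_OFFSET(2)  == 2ULL << 40        == 0x20000000000
 *   VFIO_PCI_OFFSET_TO_INDEX(0x20000001000)           == 2 (BAR2)
 *   0x20000001000 & VFIO_PCI_OFFSET_MASK              == 0x1000
 */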

#define OPREGION_SIGNATURE "IntelGraphicsMem"

struct vfio_region;
struct intel_vgpu_regops {
        /* rw returns bytes transferred on success or a negative errno */
        ssize_t (*rw)(struct intel_vgpu *vgpu, char *buf,
                        size_t count, loff_t *ppos, bool iswrite);
        void (*release)(struct intel_vgpu *vgpu,
                        struct vfio_region *region);
};

struct vfio_region {
        u32                             type;
        u32                             subtype;
        size_t                          size;
        u32                             flags;
        const struct intel_vgpu_regops  *ops;
        void                            *data;
};

struct kvmgt_pgfn {
        gfn_t gfn;
        struct hlist_node hnode;
};

struct kvmgt_guest_info {
        struct kvm *kvm;
        struct intel_vgpu *vgpu;
        struct kvm_page_track_notifier_node track_node;
#define NR_BKT (1 << 18)
        struct hlist_head ptable[NR_BKT];
#undef NR_BKT
        struct dentry *debugfs_cache_entries;
};

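/*
 * A gvt_dma tracks one pinned guest page: the gfn it came from and the
 * dma_addr the host mapped it to.  Each vGPU keeps these entries in two
 * rb-trees (vdev.gfn_cache and vdev.dma_addr_cache) so a mapping can be
 * looked up from either key.
 */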
struct gvt_dma {
        struct intel_vgpu *vgpu;
        struct rb_node gfn_node;
        struct rb_node dma_addr_node;
        gfn_t gfn;
        dma_addr_t dma_addr;
        struct kref ref;
};

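/*
 * vgpu->handle stores a pointer to the kvmgt_guest_info allocated in
 * kvmgt_guest_init(), so any value within the low byte range cannot be
 * a valid handle.
 */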
static inline bool handle_valid(unsigned long handle)
{
        return !!(handle & ~0xff);
}

static int kvmgt_guest_init(struct mdev_device *mdev);
static void intel_vgpu_release_work(struct work_struct *work);
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);

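/*
 * Pin a guest page through VFIO and map it for device DMA.  On success
 * *dma_addr holds the bus address to program into shadow page tables;
 * on failure the page is left unpinned.
 */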
static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
                dma_addr_t *dma_addr)
{
        struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
        struct page *page;
        unsigned long pfn;
        int ret;

        /* Pin the page first. */
        ret = vfio_pin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1,
                             IOMMU_READ | IOMMU_WRITE, &pfn);
        if (ret != 1) {
                gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx: %d\n",
                             gfn, ret);
                return -EINVAL;
        }

        /* Setup DMA mapping. */
        page = pfn_to_page(pfn);
        *dma_addr = dma_map_page(dev, page, 0, PAGE_SIZE,
                                 PCI_DMA_BIDIRECTIONAL);
        if (dma_mapping_error(dev, *dma_addr)) {
                gvt_vgpu_err("DMA mapping failed for gfn 0x%lx\n", gfn);
                vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1);
                return -ENOMEM;
        }

        return 0;
}

static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
                dma_addr_t dma_addr)
{
        struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
        int ret;

        dma_unmap_page(dev, dma_addr, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
        ret = vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &gfn, 1);
        WARN_ON(ret != 1);
}

static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
                dma_addr_t dma_addr)
{
        struct rb_node *node = vgpu->vdev.dma_addr_cache.rb_node;
        struct gvt_dma *itr;

        while (node) {
                itr = rb_entry(node, struct gvt_dma, dma_addr_node);

                if (dma_addr < itr->dma_addr)
                        node = node->rb_left;
                else if (dma_addr > itr->dma_addr)
                        node = node->rb_right;
                else
                        return itr;
        }
        return NULL;
}

static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
{
        struct rb_node *node = vgpu->vdev.gfn_cache.rb_node;
        struct gvt_dma *itr;

        while (node) {
                itr = rb_entry(node, struct gvt_dma, gfn_node);

                if (gfn < itr->gfn)
                        node = node->rb_left;
                else if (gfn > itr->gfn)
                        node = node->rb_right;
                else
                        return itr;
        }
        return NULL;
}

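/*
 * Insert a newly pinned page into both rb-trees, keyed on gfn and on
 * dma_addr respectively.  The double-underscore cache helpers assume the
 * caller serializes access with vdev.cache_lock, as gvt_cache_destroy()
 * and the IOMMU unmap notifier below do.
 */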
static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
                dma_addr_t dma_addr)
{
        struct gvt_dma *new, *itr;
        struct rb_node **link, *parent = NULL;

        new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        new->vgpu = vgpu;
        new->gfn = gfn;
        new->dma_addr = dma_addr;
        kref_init(&new->ref);

        /* gfn_cache maps gfn to struct gvt_dma. */
        link = &vgpu->vdev.gfn_cache.rb_node;
        while (*link) {
                parent = *link;
                itr = rb_entry(parent, struct gvt_dma, gfn_node);

                if (gfn < itr->gfn)
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }
        rb_link_node(&new->gfn_node, parent, link);
        rb_insert_color(&new->gfn_node, &vgpu->vdev.gfn_cache);

        /* dma_addr_cache maps dma addr to struct gvt_dma. */
        parent = NULL;
        link = &vgpu->vdev.dma_addr_cache.rb_node;
        while (*link) {
                parent = *link;
                itr = rb_entry(parent, struct gvt_dma, dma_addr_node);

                if (dma_addr < itr->dma_addr)
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }
        rb_link_node(&new->dma_addr_node, parent, link);
        rb_insert_color(&new->dma_addr_node, &vgpu->vdev.dma_addr_cache);

        vgpu->vdev.nr_cache_entries++;
        return 0;
}

static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
                                struct gvt_dma *entry)
{
        rb_erase(&entry->gfn_node, &vgpu->vdev.gfn_cache);
        rb_erase(&entry->dma_addr_node, &vgpu->vdev.dma_addr_cache);
        kfree(entry);
        vgpu->vdev.nr_cache_entries--;
}

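/*
 * Tear down every cached mapping.  The lock is dropped and re-taken on
 * each iteration rather than held across the whole walk, presumably so
 * the VFIO unmap notifier is never blocked for long while a vGPU with
 * many pinned pages is destroyed.
 */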
static void gvt_cache_destroy(struct intel_vgpu *vgpu)
{
        struct gvt_dma *dma;
        struct rb_node *node = NULL;

        for (;;) {
                mutex_lock(&vgpu->vdev.cache_lock);
                node = rb_first(&vgpu->vdev.gfn_cache);
                if (!node) {
                        mutex_unlock(&vgpu->vdev.cache_lock);
                        break;
                }
                dma = rb_entry(node, struct gvt_dma, gfn_node);
                gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr);
                __gvt_cache_remove_entry(vgpu, dma);
                mutex_unlock(&vgpu->vdev.cache_lock);
        }
}

static void gvt_cache_init(struct intel_vgpu *vgpu)
{
        vgpu->vdev.gfn_cache = RB_ROOT;
        vgpu->vdev.dma_addr_cache = RB_ROOT;
        vgpu->vdev.nr_cache_entries = 0;
        mutex_init(&vgpu->vdev.cache_lock);
}

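/*
 * The protect table mirrors which gfns this guest currently has
 * write-protected through KVM page tracking, so the write-protect
 * handler can filter writes without querying KVM.
 */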
static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
{
        hash_init(info->ptable);
}

static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
{
        struct kvmgt_pgfn *p;
        struct hlist_node *tmp;
        int i;

        hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
                hash_del(&p->hnode);
                kfree(p);
        }
}

static struct kvmgt_pgfn *
__kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
{
        struct kvmgt_pgfn *p, *res = NULL;

        hash_for_each_possible(info->ptable, p, hnode, gfn) {
                if (gfn == p->gfn) {
                        res = p;
                        break;
                }
        }

        return res;
}

static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
                                gfn_t gfn)
{
        struct kvmgt_pgfn *p;

        p = __kvmgt_protect_table_find(info, gfn);
        return !!p;
}

static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
{
        struct kvmgt_pgfn *p;

        if (kvmgt_gfn_is_write_protected(info, gfn))
                return;

        p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
        if (WARN(!p, "gfn: 0x%llx\n", gfn))
                return;

        p->gfn = gfn;
        hash_add(info->ptable, &p->hnode, gfn);
}

static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
                                gfn_t gfn)
{
        struct kvmgt_pgfn *p;

        p = __kvmgt_protect_table_find(info, gfn);
        if (p) {
                hash_del(&p->hnode);
                kfree(p);
        }
}

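/*
 * Read handler for the device-specific OpRegion region.  The region is
 * exposed read-only (writes fail with -EINVAL), and the index is
 * relative to the device-specific regions that follow the fixed
 * VFIO_PCI_NUM_REGIONS entries.  Note the handler returns ssize_t so a
 * negative errno survives the trip back through intel_vgpu_rw().
 */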
static ssize_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
                size_t count, loff_t *ppos, bool iswrite)
{
        unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
                        VFIO_PCI_NUM_REGIONS;
        void *base = vgpu->vdev.region[i].data;
        loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

        if (pos >= vgpu->vdev.region[i].size || iswrite) {
                gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
                return -EINVAL;
        }
        count = min(count, (size_t)(vgpu->vdev.region[i].size - pos));
        memcpy(buf, base + pos, count);

        return count;
}

static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
                struct vfio_region *region)
{
}

static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
        .rw = intel_vgpu_reg_rw_opregion,
        .release = intel_vgpu_reg_release_opregion,
};

static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
                unsigned int type, unsigned int subtype,
                const struct intel_vgpu_regops *ops,
                size_t size, u32 flags, void *data)
{
        struct vfio_region *region;

        region = krealloc(vgpu->vdev.region,
                        (vgpu->vdev.num_regions + 1) * sizeof(*region),
                        GFP_KERNEL);
        if (!region)
                return -ENOMEM;

        vgpu->vdev.region = region;
        vgpu->vdev.region[vgpu->vdev.num_regions].type = type;
        vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype;
        vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops;
        vgpu->vdev.region[vgpu->vdev.num_regions].size = size;
        vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags;
        vgpu->vdev.region[vgpu->vdev.num_regions].data = data;
        vgpu->vdev.num_regions++;
        return 0;
}

static int kvmgt_get_vfio_device(void *p_vgpu)
{
        struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;

        vgpu->vdev.vfio_device = vfio_device_get_from_dev(
                mdev_dev(vgpu->vdev.mdev));
        if (!vgpu->vdev.vfio_device) {
                gvt_vgpu_err("failed to get vfio device\n");
                return -ENODEV;
        }
        return 0;
}

static int kvmgt_set_opregion(void *p_vgpu)
{
        struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
        void *base;
        int ret;

        /*
         * Each vgpu has its own opregion, although VFIO would create
         * another one later.  This one is used to expose the opregion
         * to VFIO; the other one, created by VFIO later, is the one
         * the guest actually uses.
         */
        base = vgpu_opregion(vgpu)->va;
        if (!base)
                return -ENOMEM;

        if (memcmp(base, OPREGION_SIGNATURE, 16)) {
                memunmap(base);
                return -EINVAL;
        }

        ret = intel_vgpu_register_reg(vgpu,
                        PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
                        &intel_vgpu_regops_opregion, OPREGION_SIZE,
                        VFIO_REGION_INFO_FLAG_READ, base);

        return ret;
}

static void kvmgt_put_vfio_device(void *vgpu)
{
        if (WARN_ON(!((struct intel_vgpu *)vgpu)->vdev.vfio_device))
                return;

        vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device);
}

static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
{
        struct intel_vgpu *vgpu = NULL;
        struct intel_vgpu_type *type;
        struct device *pdev;
        void *gvt;
        int ret;

        pdev = mdev_parent_dev(mdev);
        gvt = kdev_to_i915(pdev)->gvt;

        type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
        if (!type) {
                gvt_vgpu_err("failed to find type %s to create\n",
                                                kobject_name(kobj));
                ret = -EINVAL;
                goto out;
        }

        vgpu = intel_gvt_ops->vgpu_create(gvt, type);
        if (IS_ERR_OR_NULL(vgpu)) {
                ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
                gvt_err("failed to create intel vgpu: %d\n", ret);
                goto out;
        }

        INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);

        vgpu->vdev.mdev = mdev;
        mdev_set_drvdata(mdev, vgpu);

        gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
                     dev_name(mdev_dev(mdev)));
        ret = 0;

out:
        return ret;
}

static int intel_vgpu_remove(struct mdev_device *mdev)
{
        struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

        if (handle_valid(vgpu->handle))
                return -EBUSY;

        intel_gvt_ops->vgpu_destroy(vgpu);
        return 0;
}

static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
                                     unsigned long action, void *data)
{
        struct intel_vgpu *vgpu = container_of(nb,
                                        struct intel_vgpu,
                                        vdev.iommu_notifier);

        if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
                struct vfio_iommu_type1_dma_unmap *unmap = data;
                struct gvt_dma *entry;
                unsigned long iov_pfn, end_iov_pfn;

                iov_pfn = unmap->iova >> PAGE_SHIFT;
                end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;

                mutex_lock(&vgpu->vdev.cache_lock);
                for (; iov_pfn < end_iov_pfn; iov_pfn++) {
                        entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
                        if (!entry)
                                continue;

                        gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr);
                        __gvt_cache_remove_entry(vgpu, entry);
                }
                mutex_unlock(&vgpu->vdev.cache_lock);
        }

        return NOTIFY_OK;
}

static int intel_vgpu_group_notifier(struct notifier_block *nb,
                                     unsigned long action, void *data)
{
        struct intel_vgpu *vgpu = container_of(nb,
                                        struct intel_vgpu,
                                        vdev.group_notifier);

        /* the only action we care about */
        if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
                vgpu->vdev.kvm = data;

                if (!data)
                        schedule_work(&vgpu->vdev.release_work);
        }

        return NOTIFY_OK;
}

static int intel_vgpu_open(struct mdev_device *mdev)
{
        struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
        unsigned long events;
        int ret;

        vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
        vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;

        events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
        ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
                                &vgpu->vdev.iommu_notifier);
        if (ret != 0) {
                gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
                        ret);
                goto out;
        }

        events = VFIO_GROUP_NOTIFY_SET_KVM;
        ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
                                &vgpu->vdev.group_notifier);
        if (ret != 0) {
                gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
                        ret);
                goto undo_iommu;
        }

        ret = kvmgt_guest_init(mdev);
        if (ret)
                goto undo_group;

        intel_gvt_ops->vgpu_activate(vgpu);

        atomic_set(&vgpu->vdev.released, 0);
        return ret;

undo_group:
        vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
                                        &vgpu->vdev.group_notifier);

undo_iommu:
        vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
                                        &vgpu->vdev.iommu_notifier);
out:
        return ret;
}

static void __intel_vgpu_release(struct intel_vgpu *vgpu)
{
        struct kvmgt_guest_info *info;
        int ret;

        if (!handle_valid(vgpu->handle))
                return;

        if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
                return;

        intel_gvt_ops->vgpu_deactivate(vgpu);

        ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
                                        &vgpu->vdev.iommu_notifier);
        WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);

        ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_GROUP_NOTIFY,
                                        &vgpu->vdev.group_notifier);
        WARN(ret, "vfio_unregister_notifier for group failed: %d\n", ret);

        info = (struct kvmgt_guest_info *)vgpu->handle;
        kvmgt_guest_exit(info);

        vgpu->vdev.kvm = NULL;
        vgpu->handle = 0;
}

static void intel_vgpu_release(struct mdev_device *mdev)
{
        struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

        __intel_vgpu_release(vgpu);
}

static void intel_vgpu_release_work(struct work_struct *work)
{
        struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
                                        vdev.release_work);

        __intel_vgpu_release(vgpu);
}

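/*
 * Decode the guest-visible base address of a BAR from virtual config
 * space.  For a 64-bit memory BAR the high dword lives in the next BAR
 * slot; 1M and unknown memory types are treated as 32-bit BARs.
 */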
static uint64_t intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
{
        u32 start_lo, start_hi;
        u32 mem_type;

        start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
                        PCI_BASE_ADDRESS_MEM_MASK;
        mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
                        PCI_BASE_ADDRESS_MEM_TYPE_MASK;

        switch (mem_type) {
        case PCI_BASE_ADDRESS_MEM_TYPE_64:
                start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
                                                + bar + 4));
                break;
        case PCI_BASE_ADDRESS_MEM_TYPE_32:
        case PCI_BASE_ADDRESS_MEM_TYPE_1M:
                /* 1M mem BAR treated as 32-bit BAR */
        default:
                /* mem unknown type treated as 32-bit BAR */
                start_hi = 0;
                break;
        }

        return ((u64)start_hi << 32) | start_lo;
}

static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, uint64_t off,
                             void *buf, unsigned int count, bool is_write)
{
        uint64_t bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
        int ret;

        if (is_write)
                ret = intel_gvt_ops->emulate_mmio_write(vgpu,
                                        bar_start + off, buf, count);
        else
                ret = intel_gvt_ops->emulate_mmio_read(vgpu,
                                        bar_start + off, buf, count);
        return ret;
}

static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, uint64_t off)
{
        return off >= vgpu_aperture_offset(vgpu) &&
               off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
}

static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, uint64_t off,
                void *buf, unsigned long count, bool is_write)
{
        void *aperture_va;

        /* The last byte of the access must still fall in the aperture. */
        if (!intel_vgpu_in_aperture(vgpu, off) ||
            !intel_vgpu_in_aperture(vgpu, off + count - 1)) {
                gvt_vgpu_err("Invalid aperture offset %llu\n", off);
                return -EINVAL;
        }

        aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
                                        ALIGN_DOWN(off, PAGE_SIZE),
                                        count + offset_in_page(off));
        if (!aperture_va)
                return -EIO;

        if (is_write)
                memcpy(aperture_va + offset_in_page(off), buf, count);
        else
                memcpy(buf, aperture_va + offset_in_page(off), count);

        io_mapping_unmap(aperture_va);

        return 0;
}

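/*
 * Common backend for read and write.  The region index encoded in *ppos
 * selects config space, a BAR or a device-specific region; BAR0 accesses
 * are emulated as MMIO and BAR2 accesses go straight to the aperture.
 */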
static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
                        size_t count, loff_t *ppos, bool is_write)
{
        struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
        uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
        int ret = -EINVAL;

        if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) {
                gvt_vgpu_err("invalid index: %u\n", index);
                return -EINVAL;
        }

        switch (index) {
        case VFIO_PCI_CONFIG_REGION_INDEX:
                if (is_write)
                        ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
                                                buf, count);
                else
                        ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
                                                buf, count);
                break;
        case VFIO_PCI_BAR0_REGION_INDEX:
                ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
                                        buf, count, is_write);
                break;
        case VFIO_PCI_BAR2_REGION_INDEX:
                ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
                break;
        case VFIO_PCI_BAR1_REGION_INDEX:
        case VFIO_PCI_BAR3_REGION_INDEX:
        case VFIO_PCI_BAR4_REGION_INDEX:
        case VFIO_PCI_BAR5_REGION_INDEX:
        case VFIO_PCI_VGA_REGION_INDEX:
        case VFIO_PCI_ROM_REGION_INDEX:
                break;
        default:
                if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions)
                        return -EINVAL;

                index -= VFIO_PCI_NUM_REGIONS;
                return vgpu->vdev.region[index].ops->rw(vgpu, buf, count,
                                ppos, is_write);
        }

        return ret == 0 ? count : ret;
}

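/*
 * Decide whether an access targets a GGTT entry inside BAR0, so that
 * reads and writes of a full 8-byte entry are not split.  Offsets here
 * are taken to be region-relative, matching how intel_vgpu_rw() above
 * dispatches BAR0 accesses (an assumption worth double-checking against
 * the userspace side).
 */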
static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
{
        struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
        struct intel_gvt *gvt = vgpu->gvt;
        int offset;

        /* Only allow MMIO GGTT entry access */
        if (index != VFIO_PCI_BAR0_REGION_INDEX)
                return false;

        offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK);

        return (offset >= gvt->device_info.gtt_start_offset &&
                offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt));
}

static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
                        size_t count, loff_t *ppos)
{
        unsigned int done = 0;
        int ret;

        while (count) {
                size_t filled;

                /* Only support GGTT entry 8 bytes read */
                if (count >= 8 && !(*ppos % 8) &&
                        gtt_entry(mdev, ppos)) {
                        u64 val;

                        ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
                                        ppos, false);
                        if (ret <= 0)
                                goto read_err;

                        if (copy_to_user(buf, &val, sizeof(val)))
                                goto read_err;

                        filled = 8;
                } else if (count >= 4 && !(*ppos % 4)) {
                        u32 val;

                        ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
                                        ppos, false);
                        if (ret <= 0)
                                goto read_err;

                        if (copy_to_user(buf, &val, sizeof(val)))
                                goto read_err;

                        filled = 4;
                } else if (count >= 2 && !(*ppos % 2)) {
                        u16 val;

                        ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
                                        ppos, false);
                        if (ret <= 0)
                                goto read_err;

                        if (copy_to_user(buf, &val, sizeof(val)))
                                goto read_err;

                        filled = 2;
                } else {
                        u8 val;

                        ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
                                        false);
                        if (ret <= 0)
                                goto read_err;

                        if (copy_to_user(buf, &val, sizeof(val)))
                                goto read_err;

                        filled = 1;
                }

                count -= filled;
                done += filled;
                *ppos += filled;
                buf += filled;
        }

        return done;

read_err:
        return -EFAULT;
}

static ssize_t intel_vgpu_write(struct mdev_device *mdev,
                                const char __user *buf,
                                size_t count, loff_t *ppos)
{
        unsigned int done = 0;
        int ret;

        while (count) {
                size_t filled;

                /* Only support GGTT entry 8 bytes write */
                if (count >= 8 && !(*ppos % 8) &&
                        gtt_entry(mdev, ppos)) {
                        u64 val;

                        if (copy_from_user(&val, buf, sizeof(val)))
                                goto write_err;

                        ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
                                        ppos, true);
                        if (ret <= 0)
                                goto write_err;

                        filled = 8;
                } else if (count >= 4 && !(*ppos % 4)) {
                        u32 val;

                        if (copy_from_user(&val, buf, sizeof(val)))
                                goto write_err;

                        ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
                                        ppos, true);
                        if (ret <= 0)
                                goto write_err;

                        filled = 4;
                } else if (count >= 2 && !(*ppos % 2)) {
                        u16 val;

                        if (copy_from_user(&val, buf, sizeof(val)))
                                goto write_err;

                        ret = intel_vgpu_rw(mdev, (char *)&val,
                                        sizeof(val), ppos, true);
                        if (ret <= 0)
                                goto write_err;

                        filled = 2;
                } else {
                        u8 val;

                        if (copy_from_user(&val, buf, sizeof(val)))
                                goto write_err;

                        ret = intel_vgpu_rw(mdev, &val, sizeof(val),
                                        ppos, true);
                        if (ret <= 0)
                                goto write_err;

                        filled = 1;
                }

                count -= filled;
                done += filled;
                *ppos += filled;
                buf += filled;
        }

        return done;
write_err:
        return -EFAULT;
}

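/*
 * Only BAR2 (the aperture) supports mmap; everything else must go
 * through the emulated read/write path.  The mapping is established by
 * remapping host aperture pages straight into the caller's VMA.
 */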
static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
{
        unsigned int index;
        u64 virtaddr;
        unsigned long req_size, pgoff = 0;
        pgprot_t pg_prot;
        struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

        index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
        if (index >= VFIO_PCI_ROM_REGION_INDEX)
                return -EINVAL;

        if (vma->vm_end < vma->vm_start)
                return -EINVAL;
        if ((vma->vm_flags & VM_SHARED) == 0)
                return -EINVAL;
        if (index != VFIO_PCI_BAR2_REGION_INDEX)
                return -EINVAL;

        pg_prot = vma->vm_page_prot;
        virtaddr = vma->vm_start;
        req_size = vma->vm_end - vma->vm_start;
        pgoff = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT;

        return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
}

static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
{
        if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
                return 1;

        return 0;
}

static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
                        unsigned int index, unsigned int start,
                        unsigned int count, uint32_t flags,
                        void *data)
{
        return 0;
}

static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
                        unsigned int index, unsigned int start,
                        unsigned int count, uint32_t flags, void *data)
{
        return 0;
}

static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
                unsigned int index, unsigned int start, unsigned int count,
                uint32_t flags, void *data)
{
        return 0;
}

static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
                unsigned int index, unsigned int start, unsigned int count,
                uint32_t flags, void *data)
{
        struct eventfd_ctx *trigger;

        if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
                int fd = *(int *)data;

                trigger = eventfd_ctx_fdget(fd);
                if (IS_ERR(trigger)) {
                        gvt_vgpu_err("eventfd_ctx_fdget failed\n");
                        return PTR_ERR(trigger);
                }
                vgpu->vdev.msi_trigger = trigger;
        }

        return 0;
}

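/*
 * Dispatch VFIO_DEVICE_SET_IRQS to the right handler.  Only the MSI
 * trigger is wired up to an eventfd; the INTx handlers above are
 * accepted but do nothing.
 */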
static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, uint32_t flags,
                unsigned int index, unsigned int start, unsigned int count,
                void *data)
{
        int (*func)(struct intel_vgpu *vgpu, unsigned int index,
                        unsigned int start, unsigned int count, uint32_t flags,
                        void *data) = NULL;

        switch (index) {
        case VFIO_PCI_INTX_IRQ_INDEX:
                switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
                case VFIO_IRQ_SET_ACTION_MASK:
                        func = intel_vgpu_set_intx_mask;
                        break;
                case VFIO_IRQ_SET_ACTION_UNMASK:
                        func = intel_vgpu_set_intx_unmask;
                        break;
                case VFIO_IRQ_SET_ACTION_TRIGGER:
                        func = intel_vgpu_set_intx_trigger;
                        break;
                }
                break;
        case VFIO_PCI_MSI_IRQ_INDEX:
                switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
                case VFIO_IRQ_SET_ACTION_MASK:
                case VFIO_IRQ_SET_ACTION_UNMASK:
                        /* XXX Need masking support exported */
                        break;
                case VFIO_IRQ_SET_ACTION_TRIGGER:
                        func = intel_vgpu_set_msi_trigger;
                        break;
                }
                break;
        }

        if (!func)
                return -ENOTTY;

        return func(vgpu, index, start, count, flags, data);
}

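/*
 * The mdev ioctl entry point.  It implements the usual vfio-pci set:
 * DEVICE_GET_INFO, GET_REGION_INFO (with a sparse-mmap capability for
 * the BAR2 aperture), GET_IRQ_INFO, SET_IRQS and RESET, plus the GVT-g
 * dmabuf-based plane query/export ioctls.
 */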
static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
                             unsigned long arg)
{
        struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
        unsigned long minsz;

        gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);

        if (cmd == VFIO_DEVICE_GET_INFO) {
                struct vfio_device_info info;

                minsz = offsetofend(struct vfio_device_info, num_irqs);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                info.flags = VFIO_DEVICE_FLAGS_PCI;
                info.flags |= VFIO_DEVICE_FLAGS_RESET;
                info.num_regions = VFIO_PCI_NUM_REGIONS +
                                vgpu->vdev.num_regions;
                info.num_irqs = VFIO_PCI_NUM_IRQS;

                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;

        } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
                struct vfio_region_info info;
                struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
                int i, ret;
                struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
                size_t size;
                int nr_areas = 1;
                int cap_type_id;

                minsz = offsetofend(struct vfio_region_info, offset);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                switch (info.index) {
                case VFIO_PCI_CONFIG_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = vgpu->gvt->device_info.cfg_space_size;
                        info.flags = VFIO_REGION_INFO_FLAG_READ |
                                     VFIO_REGION_INFO_FLAG_WRITE;
                        break;
                case VFIO_PCI_BAR0_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = vgpu->cfg_space.bar[info.index].size;
                        if (!info.size) {
                                info.flags = 0;
                                break;
                        }

                        info.flags = VFIO_REGION_INFO_FLAG_READ |
                                     VFIO_REGION_INFO_FLAG_WRITE;
                        break;
                case VFIO_PCI_BAR1_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = 0;
                        info.flags = 0;
                        break;
                case VFIO_PCI_BAR2_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.flags = VFIO_REGION_INFO_FLAG_CAPS |
                                        VFIO_REGION_INFO_FLAG_MMAP |
                                        VFIO_REGION_INFO_FLAG_READ |
                                        VFIO_REGION_INFO_FLAG_WRITE;
                        info.size = gvt_aperture_sz(vgpu->gvt);

                        size = sizeof(*sparse) +
                                        (nr_areas * sizeof(*sparse->areas));
                        sparse = kzalloc(size, GFP_KERNEL);
                        if (!sparse)
                                return -ENOMEM;

                        sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
                        sparse->header.version = 1;
                        sparse->nr_areas = nr_areas;
                        cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
                        sparse->areas[0].offset =
                                        PAGE_ALIGN(vgpu_aperture_offset(vgpu));
                        sparse->areas[0].size = vgpu_aperture_sz(vgpu);
                        break;

                case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = 0;
                        info.flags = 0;

                        gvt_dbg_core("get region info bar:%d\n", info.index);
                        break;

                case VFIO_PCI_ROM_REGION_INDEX:
                case VFIO_PCI_VGA_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = 0;
                        info.flags = 0;

                        gvt_dbg_core("get region info index:%d\n", info.index);
                        break;
                default:
                        {
                                struct vfio_region_info_cap_type cap_type = {
                                        .header.id = VFIO_REGION_INFO_CAP_TYPE,
                                        .header.version = 1 };

                                if (info.index >= VFIO_PCI_NUM_REGIONS +
                                                vgpu->vdev.num_regions)
                                        return -EINVAL;

                                i = info.index - VFIO_PCI_NUM_REGIONS;

                                info.offset =
                                        VFIO_PCI_INDEX_TO_OFFSET(info.index);
                                info.size = vgpu->vdev.region[i].size;
                                info.flags = vgpu->vdev.region[i].flags;

                                cap_type.type = vgpu->vdev.region[i].type;
                                cap_type.subtype = vgpu->vdev.region[i].subtype;

                                ret = vfio_info_add_capability(&caps,
                                                        &cap_type.header,
                                                        sizeof(cap_type));
                                if (ret)
                                        return ret;
                        }
                }

                if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
                        switch (cap_type_id) {
                        case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
                                ret = vfio_info_add_capability(&caps,
                                        &sparse->header, sizeof(*sparse) +
                                        (sparse->nr_areas *
                                                sizeof(*sparse->areas)));
                                kfree(sparse);
                                if (ret)
                                        return ret;
                                break;
                        default:
                                return -EINVAL;
                        }
                }

                if (caps.size) {
                        info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
                        if (info.argsz < sizeof(info) + caps.size) {
                                info.argsz = sizeof(info) + caps.size;
                                info.cap_offset = 0;
                        } else {
                                vfio_info_cap_shift(&caps, sizeof(info));
                                if (copy_to_user((void __user *)arg +
                                                  sizeof(info), caps.buf,
                                                  caps.size)) {
                                        kfree(caps.buf);
                                        return -EFAULT;
                                }
                                info.cap_offset = sizeof(info);
                        }

                        kfree(caps.buf);
                }

                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;
        } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
                struct vfio_irq_info info;

                minsz = offsetofend(struct vfio_irq_info, count);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
                        return -EINVAL;

                switch (info.index) {
                case VFIO_PCI_INTX_IRQ_INDEX:
                case VFIO_PCI_MSI_IRQ_INDEX:
                        break;
                default:
                        return -EINVAL;
                }

                info.flags = VFIO_IRQ_INFO_EVENTFD;

                info.count = intel_vgpu_get_irq_count(vgpu, info.index);

                if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
                        info.flags |= (VFIO_IRQ_INFO_MASKABLE |
                                       VFIO_IRQ_INFO_AUTOMASKED);
                else
                        info.flags |= VFIO_IRQ_INFO_NORESIZE;

                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;
        } else if (cmd == VFIO_DEVICE_SET_IRQS) {
                struct vfio_irq_set hdr;
                u8 *data = NULL;
                int ret = 0;
                size_t data_size = 0;

                minsz = offsetofend(struct vfio_irq_set, count);

                if (copy_from_user(&hdr, (void __user *)arg, minsz))
                        return -EFAULT;

                if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
                        int max = intel_vgpu_get_irq_count(vgpu, hdr.index);

                        ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
                                                VFIO_PCI_NUM_IRQS, &data_size);
                        if (ret) {
                                gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
                                return -EINVAL;
                        }
                        if (data_size) {
                                data = memdup_user((void __user *)(arg + minsz),
                                                   data_size);
                                if (IS_ERR(data))
                                        return PTR_ERR(data);
                        }
                }

                ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
                                        hdr.start, hdr.count, data);
                kfree(data);

                return ret;
        } else if (cmd == VFIO_DEVICE_RESET) {
                intel_gvt_ops->vgpu_reset(vgpu);
                return 0;
        } else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
                struct vfio_device_gfx_plane_info dmabuf;
                int ret = 0;

                minsz = offsetofend(struct vfio_device_gfx_plane_info,
                                    dmabuf_id);
                if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
                        return -EFAULT;
                if (dmabuf.argsz < minsz)
                        return -EINVAL;

                ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
                if (ret != 0)
                        return ret;

                return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
                                                                -EFAULT : 0;
        } else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
                __u32 dmabuf_id;
                __s32 dmabuf_fd;

                if (get_user(dmabuf_id, (__u32 __user *)arg))
                        return -EFAULT;

                dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
                return dmabuf_fd;

        }

        return -ENOTTY;
}

static ssize_t
vgpu_id_show(struct device *dev, struct device_attribute *attr,
             char *buf)
{
        struct mdev_device *mdev = mdev_from_dev(dev);

        if (mdev) {
                struct intel_vgpu *vgpu = (struct intel_vgpu *)
                        mdev_get_drvdata(mdev);
                return sprintf(buf, "%d\n", vgpu->id);
        }
        return sprintf(buf, "\n");
}

static ssize_t
hw_id_show(struct device *dev, struct device_attribute *attr,
           char *buf)
{
        struct mdev_device *mdev = mdev_from_dev(dev);

        if (mdev) {
                struct intel_vgpu *vgpu = (struct intel_vgpu *)
                        mdev_get_drvdata(mdev);
                return sprintf(buf, "%u\n",
                               vgpu->submission.shadow_ctx->hw_id);
        }
        return sprintf(buf, "\n");
}

static DEVICE_ATTR_RO(vgpu_id);
static DEVICE_ATTR_RO(hw_id);

static struct attribute *intel_vgpu_attrs[] = {
        &dev_attr_vgpu_id.attr,
        &dev_attr_hw_id.attr,
        NULL
};

static const struct attribute_group intel_vgpu_group = {
        .name = "intel_vgpu",
        .attrs = intel_vgpu_attrs,
};

static const struct attribute_group *intel_vgpu_groups[] = {
        &intel_vgpu_group,
        NULL,
};

static struct mdev_parent_ops intel_vgpu_ops = {
        .mdev_attr_groups       = intel_vgpu_groups,
        .create                 = intel_vgpu_create,
        .remove                 = intel_vgpu_remove,

        .open                   = intel_vgpu_open,
        .release                = intel_vgpu_release,

        .read                   = intel_vgpu_read,
        .write                  = intel_vgpu_write,
        .mmap                   = intel_vgpu_mmap,
        .ioctl                  = intel_vgpu_ioctl,
};

static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
{
        struct attribute **kvm_type_attrs;
        struct attribute_group **kvm_vgpu_type_groups;

        intel_gvt_ops = ops;
        if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs,
                        &kvm_vgpu_type_groups))
                return -EFAULT;
        intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;

        return mdev_register_device(dev, &intel_vgpu_ops);
}

static void kvmgt_host_exit(struct device *dev, void *gvt)
{
        mdev_unregister_device(dev);
}

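/*
 * Write-protect a guest page via KVM page tracking.  kvm->srcu guards
 * the memslot lookup and mmu_lock serializes against the tracking
 * machinery; the local protect table keeps the operation idempotent.
 */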
static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
{
        struct kvmgt_guest_info *info;
        struct kvm *kvm;
        struct kvm_memory_slot *slot;
        int idx;

        if (!handle_valid(handle))
                return -ESRCH;

        info = (struct kvmgt_guest_info *)handle;
        kvm = info->kvm;

        idx = srcu_read_lock(&kvm->srcu);
        slot = gfn_to_memslot(kvm, gfn);
        if (!slot) {
                srcu_read_unlock(&kvm->srcu, idx);
                return -EINVAL;
        }

        spin_lock(&kvm->mmu_lock);

        if (kvmgt_gfn_is_write_protected(info, gfn))
                goto out;

        kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
        kvmgt_protect_table_add(info, gfn);

out:
        spin_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, idx);
        return 0;
}

static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
{
        struct kvmgt_guest_info *info;
        struct kvm *kvm;
        struct kvm_memory_slot *slot;
        int idx;

        if (!handle_valid(handle))
                return 0;

        info = (struct kvmgt_guest_info *)handle;
        kvm = info->kvm;

        idx = srcu_read_lock(&kvm->srcu);
        slot = gfn_to_memslot(kvm, gfn);
        if (!slot) {
                srcu_read_unlock(&kvm->srcu, idx);
                return -EINVAL;
        }

        spin_lock(&kvm->mmu_lock);

        if (!kvmgt_gfn_is_write_protected(info, gfn))
                goto out;

        kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
        kvmgt_protect_table_del(info, gfn);

out:
        spin_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, idx);
        return 0;
}

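/*
 * KVM notifier callbacks: forward writes to tracked pages into the
 * GVT-g write-protect handler, and drop tracking for any protected
 * gfns in a memslot that is going away.
 */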
1456static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1457                const u8 *val, int len,
1458                struct kvm_page_track_notifier_node *node)
1459{
1460        struct kvmgt_guest_info *info = container_of(node,
1461                                        struct kvmgt_guest_info, track_node);
1462
1463        if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
1464                intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
1465                                                     (void *)val, len);
1466}
1467
1468static void kvmgt_page_track_flush_slot(struct kvm *kvm,
1469                struct kvm_memory_slot *slot,
1470                struct kvm_page_track_notifier_node *node)
1471{
1472        int i;
1473        gfn_t gfn;
1474        struct kvmgt_guest_info *info = container_of(node,
1475                                        struct kvmgt_guest_info, track_node);
1476
1477        spin_lock(&kvm->mmu_lock);
1478        for (i = 0; i < slot->npages; i++) {
1479                gfn = slot->base_gfn + i;
1480                if (kvmgt_gfn_is_write_protected(info, gfn)) {
1481                        kvm_slot_page_track_remove_page(kvm, slot, gfn,
1482                                                KVM_PAGE_TRACK_WRITE);
1483                        kvmgt_protect_table_del(info, gfn);
1484                }
1485        }
1486        spin_unlock(&kvm->mmu_lock);
1487}
1488
static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
{
	struct intel_vgpu *itr;
	struct kvmgt_guest_info *info;
	int id;
	bool ret = false;

	mutex_lock(&vgpu->gvt->lock);
	for_each_active_vgpu(vgpu->gvt, itr, id) {
		if (!handle_valid(itr->handle))
			continue;

		info = (struct kvmgt_guest_info *)itr->handle;
		if (kvm && kvm == info->kvm) {
			ret = true;
			goto out;
		}
	}
out:
	mutex_unlock(&vgpu->gvt->lock);
	return ret;
}

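/*
 * kvmgt_guest_init - bind an opened mdev to its KVM instance
 *
 * Runs on the mdev open path once the VFIO group notifier has supplied
 * vdev.kvm. It allocates the kvmgt_guest_info that doubles as the opaque
 * MPT handle, takes a reference on the kvm, initialises the protect table
 * and DMA cache, registers the page-track notifier, and exposes the cache
 * size in debugfs. The kvm->mm check ensures the caller is the user-space
 * process (typically QEMU) that owns the guest address space.
 */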
static int kvmgt_guest_init(struct mdev_device *mdev)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct kvm *kvm;

	vgpu = mdev_get_drvdata(mdev);
	if (handle_valid(vgpu->handle))
		return -EEXIST;

	kvm = vgpu->vdev.kvm;
	if (!kvm || kvm->mm != current->mm) {
		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
		return -ESRCH;
	}

	if (__kvmgt_vgpu_exist(vgpu, kvm))
		return -EEXIST;

	info = vzalloc(sizeof(struct kvmgt_guest_info));
	if (!info)
		return -ENOMEM;

	vgpu->handle = (unsigned long)info;
	info->vgpu = vgpu;
	info->kvm = kvm;
	kvm_get_kvm(info->kvm);

	kvmgt_protect_table_init(info);
	gvt_cache_init(vgpu);

	mutex_init(&vgpu->dmabuf_lock);
	init_completion(&vgpu->vblank_done);

	info->track_node.track_write = kvmgt_page_track_write;
	info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
	kvm_page_track_register_notifier(kvm, &info->track_node);

	info->debugfs_cache_entries = debugfs_create_ulong(
						"kvmgt_nr_cache_entries",
						0444, vgpu->debugfs,
						&vgpu->vdev.nr_cache_entries);
	if (!info->debugfs_cache_entries)
		gvt_vgpu_err("Cannot create kvmgt debugfs entry\n");

	return 0;
}

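/*
 * kvmgt_guest_exit - tear down everything kvmgt_guest_init() set up, in
 * reverse order: debugfs entry, page-track notifier, kvm reference,
 * protect table, DMA cache, and finally the info structure itself.
 */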
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
{
	debugfs_remove(info->debugfs_cache_entries);

	kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
	kvm_put_kvm(info->kvm);
	kvmgt_protect_table_destroy(info);
	gvt_cache_destroy(info->vgpu);
	vfree(info);

	return true;
}

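/*
 * The attach/detach MPT hooks are intentionally empty for KVMGT: the real
 * binding work happens later, in kvmgt_guest_init(), once the KVM
 * instance is known.
 */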
static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
{
	/* nothing to do here */
	return 0;
}

static void kvmgt_detach_vgpu(unsigned long handle)
{
	/* nothing to do here */
}

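/*
 * kvmgt_inject_msi - raise the vGPU's MSI in the guest
 *
 * Interrupt delivery is routed through the eventfd that user space wired
 * up via VFIO_DEVICE_SET_IRQS; signalling it lets QEMU/KVM inject the MSI
 * without KVMGT having to know the guest's interrupt routing.
 */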
static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;

	/*
	 * Defensive check (an assumption beyond the original code): a guest
	 * may program MSI before user space has installed the eventfd, in
	 * which case msi_trigger is still NULL and must not be signalled.
	 */
	if (!vgpu->vdev.msi_trigger)
		return -EFAULT;

	if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
		return 0;

	return -EFAULT;
}

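/*
 * kvmgt_gfn_to_pfn - translate a guest frame number to a host pfn via
 * KVM's memslots, returning INTEL_GVT_INVALID_ADDR when the gfn has no
 * usable mapping.
 */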
static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	kvm_pfn_t pfn;

	if (!handle_valid(handle))
		return INTEL_GVT_INVALID_ADDR;

	info = (struct kvmgt_guest_info *)handle;

	pfn = gfn_to_pfn(info->kvm, gfn);
	if (is_error_noslot_pfn(pfn))
		return INTEL_GVT_INVALID_ADDR;

	return pfn;
}

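/*
 * kvmgt_dma_map_guest_page - obtain a DMA address for a guest page
 *
 * Mappings are cached per vGPU: a cache hit just takes another reference
 * on the existing gvt_dma entry, while a miss pins the page through VFIO,
 * maps it for the device and inserts a new entry, all under cache_lock.
 * The kref keeps the mapping alive until every user has dropped it via
 * kvmgt_dma_unmap_guest_page(). Illustrative pairing (sketch only, not an
 * exact call site):
 *
 *	dma_addr_t dma;
 *
 *	if (kvmgt_dma_map_guest_page(vgpu->handle, gfn, &dma))
 *		return -ENXIO;
 *	(program dma into a shadow PTE, then later...)
 *	kvmgt_dma_unmap_guest_page(vgpu->handle, dma);
 */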
static int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
		dma_addr_t *dma_addr)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct gvt_dma *entry;
	int ret;

	if (!handle_valid(handle))
		return -EINVAL;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;

	mutex_lock(&vgpu->vdev.cache_lock);

	entry = __gvt_cache_find_gfn(vgpu, gfn);
	if (!entry) {
		ret = gvt_dma_map_page(vgpu, gfn, dma_addr);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(vgpu, gfn, *dma_addr);
		if (ret)
			goto err_unmap;
	} else {
		kref_get(&entry->ref);
		*dma_addr = entry->dma_addr;
	}

	mutex_unlock(&vgpu->vdev.cache_lock);
	return 0;

err_unmap:
	gvt_dma_unmap_page(vgpu, gfn, *dma_addr);
err_unlock:
	mutex_unlock(&vgpu->vdev.cache_lock);
	return ret;
}

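/*
 * __gvt_dma_release - kref release callback: undo the DMA mapping and
 * drop the cache entry once the last reference to it is gone.
 */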
static void __gvt_dma_release(struct kref *ref)
{
	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);

	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr);
	__gvt_cache_remove_entry(entry->vgpu, entry);
}

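/*
 * kvmgt_dma_unmap_guest_page - drop one reference on a cached DMA mapping;
 * the entry is looked up by DMA address and freed via __gvt_dma_release()
 * when the refcount reaches zero.
 */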
static void kvmgt_dma_unmap_guest_page(unsigned long handle,
		dma_addr_t dma_addr)
{
	struct kvmgt_guest_info *info;
	struct gvt_dma *entry;

	if (!handle_valid(handle))
		return;

	info = (struct kvmgt_guest_info *)handle;

	mutex_lock(&info->vgpu->vdev.cache_lock);
	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
	if (entry)
		kref_put(&entry->ref, __gvt_dma_release);
	mutex_unlock(&info->vgpu->vdev.cache_lock);
}

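/*
 * kvmgt_rw_gpa - copy data between host memory and guest physical memory
 *
 * kvm_read_guest()/kvm_write_guest() resolve a gpa through the memslots
 * of the owning user process, so when the caller is a kernel thread
 * (current->mm == NULL, e.g. a GVT workload thread) we temporarily adopt
 * kvm->mm with use_mm() around the access; SRCU protects the memslot
 * lookup as elsewhere in this file. A hypothetical caller fetching a
 * guest-written descriptor might look like (sketch only):
 *
 *	u32 desc;
 *
 *	if (kvmgt_read_gpa(vgpu->handle, gpa, &desc, sizeof(desc)))
 *		return -EFAULT;
 */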
static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len, bool write)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	int idx, ret;
	bool kthread = current->mm == NULL;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	if (kthread)
		use_mm(kvm->mm);

	idx = srcu_read_lock(&kvm->srcu);
	ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
		      kvm_read_guest(kvm, gpa, buf, len);
	srcu_read_unlock(&kvm->srcu, idx);

	if (kthread)
		unuse_mm(kvm->mm);

	return ret;
}

static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
}

static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
}

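/*
 * kvmgt_virt_to_pfn/kvmgt_is_valid_gfn - small MPT helpers: the former
 * turns a host kernel virtual address into its page frame number, the
 * latter asks KVM whether a gfn is backed by a user-visible memslot.
 */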
static unsigned long kvmgt_virt_to_pfn(void *addr)
{
	return PFN_DOWN(__pa(addr));
}

static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;

	if (!handle_valid(handle))
		return false;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	return kvm_is_visible_gfn(kvm, gfn);
}

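/*
 * kvmgt_mpt wires the functions above into GVT-g's mediated pass-through
 * (MPT) interface; the GVT core calls through these hooks whenever it
 * needs a hypervisor service such as page tracking, gpa access or DMA
 * mapping.
 */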
struct intel_gvt_mpt kvmgt_mpt = {
	.host_init = kvmgt_host_init,
	.host_exit = kvmgt_host_exit,
	.attach_vgpu = kvmgt_attach_vgpu,
	.detach_vgpu = kvmgt_detach_vgpu,
	.inject_msi = kvmgt_inject_msi,
	.from_virt_to_mfn = kvmgt_virt_to_pfn,
	.enable_page_track = kvmgt_page_track_add,
	.disable_page_track = kvmgt_page_track_remove,
	.read_gpa = kvmgt_read_gpa,
	.write_gpa = kvmgt_write_gpa,
	.gfn_to_mfn = kvmgt_gfn_to_pfn,
	.dma_map_guest_page = kvmgt_dma_map_guest_page,
	.dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
	.set_opregion = kvmgt_set_opregion,
	.get_vfio_device = kvmgt_get_vfio_device,
	.put_vfio_device = kvmgt_put_vfio_device,
	.is_valid_gfn = kvmgt_is_valid_gfn,
};
EXPORT_SYMBOL_GPL(kvmgt_mpt);

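/*
 * The module init/exit hooks are empty: the GVT core binds to this module
 * by resolving the exported kvmgt_mpt table (hence the EXPORT_SYMBOL_GPL
 * above), so nothing needs to happen at load or unload time.
 */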
static int __init kvmgt_init(void)
{
	return 0;
}

static void __exit kvmgt_exit(void)
{
}

module_init(kvmgt_init);
module_exit(kvmgt_exit);

MODULE_LICENSE("GPL and additional rights");
MODULE_AUTHOR("Intel Corporation");