linux/drivers/iommu/intel/svm.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2015 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>
   6 */
   7
   8#include <linux/intel-iommu.h>
   9#include <linux/mmu_notifier.h>
  10#include <linux/sched.h>
  11#include <linux/sched/mm.h>
  12#include <linux/slab.h>
  13#include <linux/intel-svm.h>
  14#include <linux/rculist.h>
  15#include <linux/pci.h>
  16#include <linux/pci-ats.h>
  17#include <linux/dmar.h>
  18#include <linux/interrupt.h>
  19#include <linux/mm_types.h>
  20#include <linux/xarray.h>
  21#include <linux/ioasid.h>
  22#include <asm/page.h>
  23#include <asm/fpu/api.h>
  24#include <trace/events/intel_iommu.h>
  25
  26#include "pasid.h"
  27#include "perf.h"
  28#include "../iommu-sva-lib.h"
  29
  30static irqreturn_t prq_event_thread(int irq, void *d);
  31static void intel_svm_drain_prq(struct device *dev, u32 pasid);
  32#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
  33
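/*
 * Per-PASID private data (struct intel_svm) is kept in an xarray so that
 * the bind/unbind paths and the page request handler can look it up by
 * PASID. pasid_private_add() pins the entry at exactly @pasid by using
 * XA_LIMIT(pasid, pasid).
 */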
  34static DEFINE_XARRAY_ALLOC(pasid_private_array);
  35static int pasid_private_add(ioasid_t pasid, void *priv)
  36{
  37        return xa_alloc(&pasid_private_array, &pasid, priv,
  38                        XA_LIMIT(pasid, pasid), GFP_ATOMIC);
  39}
  40
  41static void pasid_private_remove(ioasid_t pasid)
  42{
  43        xa_erase(&pasid_private_array, pasid);
  44}
  45
  46static void *pasid_private_find(ioasid_t pasid)
  47{
  48        return xa_load(&pasid_private_array, pasid);
  49}
  50
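/*
 * svm_lookup_device_by_sid()/_by_dev() walk the RCU-protected svm->devs
 * list and return the intel_svm_dev entry matching the given source-id
 * or struct device, or NULL if no bound device matches.
 */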
  51static struct intel_svm_dev *
  52svm_lookup_device_by_sid(struct intel_svm *svm, u16 sid)
  53{
  54        struct intel_svm_dev *sdev = NULL, *t;
  55
  56        rcu_read_lock();
  57        list_for_each_entry_rcu(t, &svm->devs, list) {
  58                if (t->sid == sid) {
  59                        sdev = t;
  60                        break;
  61                }
  62        }
  63        rcu_read_unlock();
  64
  65        return sdev;
  66}
  67
  68static struct intel_svm_dev *
  69svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev)
  70{
  71        struct intel_svm_dev *sdev = NULL, *t;
  72
  73        rcu_read_lock();
  74        list_for_each_entry_rcu(t, &svm->devs, list) {
  75                if (t->dev == dev) {
  76                        sdev = t;
  77                        break;
  78                }
  79        }
  80        rcu_read_unlock();
  81
  82        return sdev;
  83}
  84
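/*
 * Allocate the page request queue (PRQ), wire up its interrupt and the
 * associated I/O page fault queue, and program the PRQ address, head and
 * tail registers. Called per IOMMU when SVM is set up.
 */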
  85int intel_svm_enable_prq(struct intel_iommu *iommu)
  86{
  87        struct iopf_queue *iopfq;
  88        struct page *pages;
  89        int irq, ret;
  90
  91        pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
  92        if (!pages) {
  93                pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
  94                        iommu->name);
  95                return -ENOMEM;
  96        }
  97        iommu->prq = page_address(pages);
  98
  99        irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
 100        if (irq <= 0) {
 101                pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
 102                       iommu->name);
 103                ret = -EINVAL;
 104                goto free_prq;
 105        }
 106        iommu->pr_irq = irq;
 107
 108        snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name),
 109                 "dmar%d-iopfq", iommu->seq_id);
 110        iopfq = iopf_queue_alloc(iommu->iopfq_name);
 111        if (!iopfq) {
 112                pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name);
 113                ret = -ENOMEM;
 114                goto free_hwirq;
 115        }
 116        iommu->iopf_queue = iopfq;
 117
 118        snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
 119
 120        ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
 121                                   iommu->prq_name, iommu);
 122        if (ret) {
 123                pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
 124                       iommu->name);
 125                goto free_iopfq;
 126        }
 127        dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
 128        dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
 129        dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
 130
 131        init_completion(&iommu->prq_complete);
 132
 133        return 0;
 134
 135free_iopfq:
 136        iopf_queue_free(iommu->iopf_queue);
 137        iommu->iopf_queue = NULL;
 138free_hwirq:
 139        dmar_free_hwirq(irq);
 140        iommu->pr_irq = 0;
 141free_prq:
 142        free_pages((unsigned long)iommu->prq, PRQ_ORDER);
 143        iommu->prq = NULL;
 144
 145        return ret;
 146}
 147
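/*
 * Undo intel_svm_enable_prq(): quiesce the PRQ registers and release the
 * interrupt, the I/O page fault queue and the queue pages.
 */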
 148int intel_svm_finish_prq(struct intel_iommu *iommu)
 149{
 150        dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
 151        dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
 152        dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
 153
 154        if (iommu->pr_irq) {
 155                free_irq(iommu->pr_irq, iommu);
 156                dmar_free_hwirq(iommu->pr_irq);
 157                iommu->pr_irq = 0;
 158        }
 159
 160        if (iommu->iopf_queue) {
 161                iopf_queue_free(iommu->iopf_queue);
 162                iommu->iopf_queue = NULL;
 163        }
 164
 165        free_pages((unsigned long)iommu->prq, PRQ_ORDER);
 166        iommu->prq = NULL;
 167
 168        return 0;
 169}
 170
 171static inline bool intel_svm_capable(struct intel_iommu *iommu)
 172{
 173        return iommu->flags & VTD_FLAG_SVM_CAPABLE;
 174}
 175
 176void intel_svm_check(struct intel_iommu *iommu)
 177{
 178        if (!pasid_supported(iommu))
 179                return;
 180
 181        if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
 182            !cap_fl1gp_support(iommu->cap)) {
 183                pr_err("%s SVM disabled, incompatible 1GB page capability\n",
 184                       iommu->name);
 185                return;
 186        }
 187
 188        if (cpu_feature_enabled(X86_FEATURE_LA57) &&
 189            !cap_5lp_support(iommu->cap)) {
 190                pr_err("%s SVM disabled, incompatible paging mode\n",
 191                       iommu->name);
 192                return;
 193        }
 194
 195        iommu->flags |= VTD_FLAG_SVM_CAPABLE;
 196}
 197
 198static void __flush_svm_range_dev(struct intel_svm *svm,
 199                                  struct intel_svm_dev *sdev,
 200                                  unsigned long address,
 201                                  unsigned long pages, int ih)
 202{
 203        struct device_domain_info *info = get_domain_info(sdev->dev);
 204
 205        if (WARN_ON(!pages))
 206                return;
 207
 208        qi_flush_piotlb(sdev->iommu, sdev->did, svm->pasid, address, pages, ih);
 209        if (info->ats_enabled)
 210                qi_flush_dev_iotlb_pasid(sdev->iommu, sdev->sid, info->pfsid,
 211                                         svm->pasid, sdev->qdep, address,
 212                                         order_base_2(pages));
 213}
 214
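/*
 * The invalidation descriptors encode the flush size as a power-of-two
 * address mask, so split an arbitrary range into naturally aligned
 * power-of-two chunks. For example, address = 0x1000 and pages = 3 round
 * up to a 4-page (16KiB) alignment, so the range [0x0, 0x10000) is flushed
 * as four 4-page chunks.
 */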
 215static void intel_flush_svm_range_dev(struct intel_svm *svm,
 216                                      struct intel_svm_dev *sdev,
 217                                      unsigned long address,
 218                                      unsigned long pages, int ih)
 219{
 220        unsigned long shift = ilog2(__roundup_pow_of_two(pages));
 221        unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift));
 222        unsigned long start = ALIGN_DOWN(address, align);
 223        unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align);
 224
 225        while (start < end) {
 226                __flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih);
 227                start += align;
 228        }
 229}
 230
 231static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
 232                                unsigned long pages, int ih)
 233{
 234        struct intel_svm_dev *sdev;
 235
 236        rcu_read_lock();
 237        list_for_each_entry_rcu(sdev, &svm->devs, list)
 238                intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
 239        rcu_read_unlock();
 240}
 241
 242/* Pages have been freed at this point */
 243static void intel_invalidate_range(struct mmu_notifier *mn,
 244                                   struct mm_struct *mm,
 245                                   unsigned long start, unsigned long end)
 246{
 247        struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
 248
 249        intel_flush_svm_range(svm, start,
 250                              (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
 251}
 252
 253static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 254{
 255        struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
 256        struct intel_svm_dev *sdev;
 257
 258        /* This might end up being called from exit_mmap(), *before* the page
 259         * tables are cleared. And __mmu_notifier_release() will delete us from
 260         * the list of notifiers so that our invalidate_range() callback doesn't
 261         * get called when the page tables are cleared. So we need to protect
 262         * against hardware accessing those page tables.
 263         *
 264         * We do it by clearing the entry in the PASID table and then flushing
 265         * the IOTLB and the PASID table caches. This might upset hardware;
 266         * perhaps we'll want to point the PASID to a dummy PGD (like the zero
 267         * page) so that we end up taking a fault that the hardware really
 268         * *has* to handle gracefully without affecting other processes.
 269         */
 270        rcu_read_lock();
 271        list_for_each_entry_rcu(sdev, &svm->devs, list)
 272                intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
 273                                            svm->pasid, true);
 274        rcu_read_unlock();
 275
 276}
 277
 278static const struct mmu_notifier_ops intel_mmuops = {
 279        .release = intel_mm_release,
 280        .invalidate_range = intel_invalidate_range,
 281};
 282
 283static DEFINE_MUTEX(pasid_mutex);
 284
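/*
 * Resolve @pasid to its intel_svm and, if @dev is already bound to it, the
 * matching intel_svm_dev. Returns 0 with *rsvm and/or *rsdev set to NULL
 * when nothing is found; the caller must hold pasid_mutex.
 */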
 285static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
 286                             struct intel_svm **rsvm,
 287                             struct intel_svm_dev **rsdev)
 288{
 289        struct intel_svm_dev *sdev = NULL;
 290        struct intel_svm *svm;
 291
 292        /* The caller should hold the pasid_mutex lock */
 293        if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
 294                return -EINVAL;
 295
 296        if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
 297                return -EINVAL;
 298
 299        svm = pasid_private_find(pasid);
 300        if (IS_ERR(svm))
 301                return PTR_ERR(svm);
 302
 303        if (!svm)
 304                goto out;
 305
 306        /*
  307         * If we found an svm for the PASID, there must be at least one
  308         * bound device.
 309         */
 310        if (WARN_ON(list_empty(&svm->devs)))
 311                return -EINVAL;
 312        sdev = svm_lookup_device_by_dev(svm, dev);
 313
 314out:
 315        *rsvm = svm;
 316        *rsdev = sdev;
 317
 318        return 0;
 319}
 320
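/*
 * Bind a guest PASID to @dev in nested translation mode: the guest-provided
 * first-level page table (data->gpgd) is installed for data->hpasid on top
 * of the second-level tables of @domain. The intel_svm/intel_svm_dev
 * tracking structures are created on the first bind of the PASID.
 */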
 321int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
 322                          struct iommu_gpasid_bind_data *data)
 323{
 324        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
 325        struct intel_svm_dev *sdev = NULL;
 326        struct dmar_domain *dmar_domain;
 327        struct device_domain_info *info;
 328        struct intel_svm *svm = NULL;
 329        unsigned long iflags;
 330        int ret = 0;
 331
 332        if (WARN_ON(!iommu) || !data)
 333                return -EINVAL;
 334
 335        if (data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
 336                return -EINVAL;
 337
 338        /* IOMMU core ensures argsz is more than the start of the union */
 339        if (data->argsz < offsetofend(struct iommu_gpasid_bind_data, vendor.vtd))
 340                return -EINVAL;
 341
 342        /* Make sure no undefined flags are used in vendor data */
 343        if (data->vendor.vtd.flags & ~(IOMMU_SVA_VTD_GPASID_LAST - 1))
 344                return -EINVAL;
 345
 346        if (!dev_is_pci(dev))
 347                return -ENOTSUPP;
 348
 349        /* VT-d supports devices with full 20 bit PASIDs only */
 350        if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
 351                return -EINVAL;
 352
 353        /*
  354         * We only check the host PASID range; we have no way to validate
  355         * the guest PASID range.
 356         */
 357        if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
 358                return -EINVAL;
 359
 360        info = get_domain_info(dev);
 361        if (!info)
 362                return -EINVAL;
 363
 364        dmar_domain = to_dmar_domain(domain);
 365
 366        mutex_lock(&pasid_mutex);
 367        ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
 368        if (ret)
 369                goto out;
 370
 371        if (sdev) {
 372                /*
  373                 * Do not allow multiple bindings of the same device-PASID since
  374                 * there is only one set of SL page tables per PASID. We may
  375                 * revisit this once sharing a PGD across domains is supported.
 376                 */
 377                dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
 378                                     svm->pasid);
 379                ret = -EBUSY;
 380                goto out;
 381        }
 382
 383        if (!svm) {
  384                /* We come here when the PASID has never been bound to a device. */
 385                svm = kzalloc(sizeof(*svm), GFP_KERNEL);
 386                if (!svm) {
 387                        ret = -ENOMEM;
 388                        goto out;
 389                }
  390                /* REVISIT: the upper layer/VFIO can track the host process that
  391                 * binds the PASID. ioasid_set = mm might be sufficient for vfio
  392                 * to check pasid VMM ownership. We can drop the following line
  393                 * once the VFIO and IOASID set checks are in place.
 394                 */
 395                svm->mm = get_task_mm(current);
 396                svm->pasid = data->hpasid;
 397                if (data->flags & IOMMU_SVA_GPASID_VAL) {
 398                        svm->gpasid = data->gpasid;
 399                        svm->flags |= SVM_FLAG_GUEST_PASID;
 400                }
 401                pasid_private_add(data->hpasid, svm);
 402                INIT_LIST_HEAD_RCU(&svm->devs);
 403                mmput(svm->mm);
 404        }
 405        sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
 406        if (!sdev) {
 407                ret = -ENOMEM;
 408                goto out;
 409        }
 410        sdev->dev = dev;
 411        sdev->sid = PCI_DEVID(info->bus, info->devfn);
 412        sdev->iommu = iommu;
 413
 414        /* Only count users if device has aux domains */
 415        if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
 416                sdev->users = 1;
 417
 418        /* Set up device context entry for PASID if not enabled already */
 419        ret = intel_iommu_enable_pasid(iommu, sdev->dev);
 420        if (ret) {
 421                dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
 422                kfree(sdev);
 423                goto out;
 424        }
 425
 426        /*
  427         * The PASID table is per device for better security. Therefore, for
  428         * each bind of a new device, even with an existing PASID, we need to
  429         * call the nested mode setup function here.
 430         */
 431        spin_lock_irqsave(&iommu->lock, iflags);
 432        ret = intel_pasid_setup_nested(iommu, dev,
 433                                       (pgd_t *)(uintptr_t)data->gpgd,
 434                                       data->hpasid, &data->vendor.vtd, dmar_domain,
 435                                       data->addr_width);
 436        spin_unlock_irqrestore(&iommu->lock, iflags);
 437        if (ret) {
 438                dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
 439                                    data->hpasid, ret);
 440                /*
  441                 * The PASID entry should be in the cleared state if nested mode
  442                 * setup failed, so we only need to clear the IOASID tracking
  443                 * data such that the free call will succeed.
 444                 */
 445                kfree(sdev);
 446                goto out;
 447        }
 448
 449        svm->flags |= SVM_FLAG_GUEST_MODE;
 450
 451        init_rcu_head(&sdev->rcu);
 452        list_add_rcu(&sdev->list, &svm->devs);
 453 out:
 454        if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
 455                pasid_private_remove(data->hpasid);
 456                kfree(svm);
 457        }
 458
 459        mutex_unlock(&pasid_mutex);
 460        return ret;
 461}
 462
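/*
 * Tear down a guest PASID binding created by intel_svm_bind_gpasid(): clear
 * the PASID table entry, drain pending page requests and drop the tracking
 * structures once the last device is unbound.
 */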
 463int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
 464{
 465        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
 466        struct intel_svm_dev *sdev;
 467        struct intel_svm *svm;
 468        int ret;
 469
 470        if (WARN_ON(!iommu))
 471                return -EINVAL;
 472
 473        mutex_lock(&pasid_mutex);
 474        ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
 475        if (ret)
 476                goto out;
 477
 478        if (sdev) {
 479                if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
 480                        sdev->users--;
 481                if (!sdev->users) {
 482                        list_del_rcu(&sdev->list);
 483                        intel_pasid_tear_down_entry(iommu, dev,
 484                                                    svm->pasid, false);
 485                        intel_svm_drain_prq(dev, svm->pasid);
 486                        kfree_rcu(sdev, rcu);
 487
 488                        if (list_empty(&svm->devs)) {
 489                                /*
  490                                 * We do not free the IOASID here because the
  491                                 * IOMMU driver did not allocate it. Unlike
  492                                 * native SVM, the IOASID for guest use was
  493                                 * allocated prior to the bind call. In any
  494                                 * case, if the free call comes before the
  495                                 * unbind, the IOMMU driver will be notified
  496                                 * and perform the cleanup.
 497                                 */
 498                                pasid_private_remove(pasid);
 499                                kfree(svm);
 500                        }
 501                }
 502        }
 503out:
 504        mutex_unlock(&pasid_mutex);
 505        return ret;
 506}
 507
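/*
 * Make every CPU currently running the mm's tasks reload its PASID state
 * (update_pasid()) so that a newly assigned or cleared PASID takes effect,
 * e.g. for ENQCMD-based work submission.
 */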
 508static void _load_pasid(void *unused)
 509{
 510        update_pasid();
 511}
 512
 513static void load_pasid(struct mm_struct *mm, u32 pasid)
 514{
 515        mutex_lock(&mm->context.lock);
 516
 517        /* Update PASID MSR on all CPUs running the mm's tasks. */
 518        on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);
 519
 520        mutex_unlock(&mm->context.lock);
 521}
 522
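/*
 * Thin wrappers around the shared iommu-sva-lib PASID allocator; the upper
 * bound is clamped to the device's PASID capability (or intel_pasid_max_id
 * for non-PCI devices).
 */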
 523static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm,
 524                                 unsigned int flags)
 525{
 526        ioasid_t max_pasid = dev_is_pci(dev) ?
 527                        pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id;
 528
 529        return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1);
 530}
 531
 532static void intel_svm_free_pasid(struct mm_struct *mm)
 533{
 534        iommu_sva_free_pasid(mm);
 535}
 536
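/*
 * Bind @mm to @dev for shared virtual addressing: reuse or create the
 * intel_svm for mm->pasid, register an MMU notifier (except for supervisor
 * mode), set up the first-level PASID table entry and add the device to
 * the svm's device list. Caller must hold pasid_mutex.
 */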
 537static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu,
 538                                           struct device *dev,
 539                                           struct mm_struct *mm,
 540                                           unsigned int flags)
 541{
 542        struct device_domain_info *info = get_domain_info(dev);
 543        unsigned long iflags, sflags;
 544        struct intel_svm_dev *sdev;
 545        struct intel_svm *svm;
 546        int ret = 0;
 547
 548        svm = pasid_private_find(mm->pasid);
 549        if (!svm) {
 550                svm = kzalloc(sizeof(*svm), GFP_KERNEL);
 551                if (!svm)
 552                        return ERR_PTR(-ENOMEM);
 553
 554                svm->pasid = mm->pasid;
 555                svm->mm = mm;
 556                svm->flags = flags;
 557                INIT_LIST_HEAD_RCU(&svm->devs);
 558
 559                if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) {
 560                        svm->notifier.ops = &intel_mmuops;
 561                        ret = mmu_notifier_register(&svm->notifier, mm);
 562                        if (ret) {
 563                                kfree(svm);
 564                                return ERR_PTR(ret);
 565                        }
 566                }
 567
 568                ret = pasid_private_add(svm->pasid, svm);
 569                if (ret) {
 570                        if (svm->notifier.ops)
 571                                mmu_notifier_unregister(&svm->notifier, mm);
 572                        kfree(svm);
 573                        return ERR_PTR(ret);
 574                }
 575        }
 576
 577        /* Find the matching device in svm list */
 578        sdev = svm_lookup_device_by_dev(svm, dev);
 579        if (sdev) {
 580                sdev->users++;
 581                goto success;
 582        }
 583
 584        sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
 585        if (!sdev) {
 586                ret = -ENOMEM;
 587                goto free_svm;
 588        }
 589
 590        sdev->dev = dev;
 591        sdev->iommu = iommu;
 592        sdev->did = FLPT_DEFAULT_DID;
 593        sdev->sid = PCI_DEVID(info->bus, info->devfn);
 594        sdev->users = 1;
 595        sdev->pasid = svm->pasid;
 596        sdev->sva.dev = dev;
 597        init_rcu_head(&sdev->rcu);
 598        if (info->ats_enabled) {
 599                sdev->dev_iotlb = 1;
 600                sdev->qdep = info->ats_qdep;
 601                if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
 602                        sdev->qdep = 0;
 603        }
 604
  605        /* Set up the PASID table: */
 606        sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ?
 607                        PASID_FLAG_SUPERVISOR_MODE : 0;
 608        sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0;
 609        spin_lock_irqsave(&iommu->lock, iflags);
 610        ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid,
 611                                            FLPT_DEFAULT_DID, sflags);
 612        spin_unlock_irqrestore(&iommu->lock, iflags);
 613
 614        if (ret)
 615                goto free_sdev;
 616
 617        /* The newly allocated pasid is loaded to the mm. */
 618        if (!(flags & SVM_FLAG_SUPERVISOR_MODE) && list_empty(&svm->devs))
 619                load_pasid(mm, svm->pasid);
 620
 621        list_add_rcu(&sdev->list, &svm->devs);
 622success:
 623        return &sdev->sva;
 624
 625free_sdev:
 626        kfree(sdev);
 627free_svm:
 628        if (list_empty(&svm->devs)) {
 629                if (svm->notifier.ops)
 630                        mmu_notifier_unregister(&svm->notifier, mm);
 631                pasid_private_remove(mm->pasid);
 632                kfree(svm);
 633        }
 634
 635        return ERR_PTR(ret);
 636}
 637
 638/* Caller must hold pasid_mutex */
 639static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
 640{
 641        struct intel_svm_dev *sdev;
 642        struct intel_iommu *iommu;
 643        struct intel_svm *svm;
 644        struct mm_struct *mm;
 645        int ret = -EINVAL;
 646
 647        iommu = device_to_iommu(dev, NULL, NULL);
 648        if (!iommu)
 649                goto out;
 650
 651        ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
 652        if (ret)
 653                goto out;
 654        mm = svm->mm;
 655
 656        if (sdev) {
 657                sdev->users--;
 658                if (!sdev->users) {
 659                        list_del_rcu(&sdev->list);
 660                        /* Flush the PASID cache and IOTLB for this device.
 661                         * Note that we do depend on the hardware *not* using
 662                         * the PASID any more. Just as we depend on other
 663                         * devices never using PASIDs that they have no right
 664                         * to use. We have a *shared* PASID table, because it's
 665                         * large and has to be physically contiguous. So it's
 666                         * hard to be as defensive as we might like. */
 667                        intel_pasid_tear_down_entry(iommu, dev,
 668                                                    svm->pasid, false);
 669                        intel_svm_drain_prq(dev, svm->pasid);
 670                        kfree_rcu(sdev, rcu);
 671
 672                        if (list_empty(&svm->devs)) {
 673                                if (svm->notifier.ops) {
 674                                        mmu_notifier_unregister(&svm->notifier, mm);
 675                                        /* Clear mm's pasid. */
 676                                        load_pasid(mm, PASID_DISABLED);
 677                                }
 678                                pasid_private_remove(svm->pasid);
 679                                /* We mandate that no page faults may be outstanding
 680                                 * for the PASID when intel_svm_unbind_mm() is called.
 681                                 * If that is not obeyed, subtle errors will happen.
 682                                 * Let's make them less subtle... */
 683                                memset(svm, 0x6b, sizeof(*svm));
 684                                kfree(svm);
 685                        }
 686                }
 687                /* Drop a PASID reference and free it if no reference. */
 688                intel_svm_free_pasid(mm);
 689        }
 690out:
 691        return ret;
 692}
 693
 694/* Page request queue descriptor */
 695struct page_req_dsc {
 696        union {
 697                struct {
 698                        u64 type:8;
 699                        u64 pasid_present:1;
 700                        u64 priv_data_present:1;
 701                        u64 rsvd:6;
 702                        u64 rid:16;
 703                        u64 pasid:20;
 704                        u64 exe_req:1;
 705                        u64 pm_req:1;
 706                        u64 rsvd2:10;
 707                };
 708                u64 qw_0;
 709        };
 710        union {
 711                struct {
 712                        u64 rd_req:1;
 713                        u64 wr_req:1;
 714                        u64 lpig:1;
 715                        u64 prg_index:9;
 716                        u64 addr:52;
 717                };
 718                u64 qw_1;
 719        };
 720        u64 priv_data[2];
 721};
 722
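/*
 * A virtual address is canonical if sign-extending it from the topmost
 * implemented bit reproduces the original value; the shift pair below
 * performs exactly that check.
 */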
 723static bool is_canonical_address(u64 addr)
 724{
 725        int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
 726        long saddr = (long) addr;
 727
 728        return (((saddr << shift) >> shift) == saddr);
 729}
 730
 731/**
 732 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 733 * @dev: target device
 734 * @pasid: pasid for draining
 735 *
 736 * Drain all pending page requests and responses related to @pasid in both
 737 * software and hardware. This is supposed to be called after the device
 738 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 739 * and DevTLB have been invalidated.
 740 *
 741 * It waits until all pending page requests for @pasid in the page fault
  742 * queue are completed by the prq handling thread. It then follows the
  743 * steps described in VT-d spec CH7.10 to drain all page requests and
  744 * page responses pending in the hardware.
 745 */
 746static void intel_svm_drain_prq(struct device *dev, u32 pasid)
 747{
 748        struct device_domain_info *info;
 749        struct dmar_domain *domain;
 750        struct intel_iommu *iommu;
 751        struct qi_desc desc[3];
 752        struct pci_dev *pdev;
 753        int head, tail;
 754        u16 sid, did;
 755        int qdep;
 756
 757        info = get_domain_info(dev);
 758        if (WARN_ON(!info || !dev_is_pci(dev)))
 759                return;
 760
 761        if (!info->pri_enabled)
 762                return;
 763
 764        iommu = info->iommu;
 765        domain = info->domain;
 766        pdev = to_pci_dev(dev);
 767        sid = PCI_DEVID(info->bus, info->devfn);
 768        did = domain->iommu_did[iommu->seq_id];
 769        qdep = pci_ats_queue_depth(pdev);
 770
 771        /*
 772         * Check and wait until all pending page requests in the queue are
 773         * handled by the prq handling thread.
 774         */
 775prq_retry:
 776        reinit_completion(&iommu->prq_complete);
 777        tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
 778        head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
 779        while (head != tail) {
 780                struct page_req_dsc *req;
 781
 782                req = &iommu->prq[head / sizeof(*req)];
 783                if (!req->pasid_present || req->pasid != pasid) {
 784                        head = (head + sizeof(*req)) & PRQ_RING_MASK;
 785                        continue;
 786                }
 787
 788                wait_for_completion(&iommu->prq_complete);
 789                goto prq_retry;
 790        }
 791
 792        /*
  793         * A work item in the IO page fault workqueue may try to lock
  794         * pasid_mutex now. Holding pasid_mutex while waiting in
  795         * iopf_queue_flush_dev() for all work items to finish may deadlock.
  796         *
  797         * It's unnecessary to hold pasid_mutex in iopf_queue_flush_dev().
  798         * Unlock it to allow the work items to be handled while waiting
  799         * for them to finish.
 800         */
 801        lockdep_assert_held(&pasid_mutex);
 802        mutex_unlock(&pasid_mutex);
 803        iopf_queue_flush_dev(dev);
 804        mutex_lock(&pasid_mutex);
 805
 806        /*
 807         * Perform steps described in VT-d spec CH7.10 to drain page
 808         * requests and responses in hardware.
 809         */
 810        memset(desc, 0, sizeof(desc));
 811        desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
 812                        QI_IWD_FENCE |
 813                        QI_IWD_TYPE;
 814        desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
 815                        QI_EIOTLB_DID(did) |
 816                        QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
 817                        QI_EIOTLB_TYPE;
 818        desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
 819                        QI_DEV_EIOTLB_SID(sid) |
 820                        QI_DEV_EIOTLB_QDEP(qdep) |
 821                        QI_DEIOTLB_TYPE |
 822                        QI_DEV_IOTLB_PFSID(info->pfsid);
 823qi_retry:
 824        reinit_completion(&iommu->prq_complete);
 825        qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
 826        if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
 827                wait_for_completion(&iommu->prq_complete);
 828                goto qi_retry;
 829        }
 830}
 831
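/*
 * Translate the access bits of a page request descriptor into the generic
 * IOMMU_FAULT_PERM_* flags used by the fault reporting framework.
 */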
 832static int prq_to_iommu_prot(struct page_req_dsc *req)
 833{
 834        int prot = 0;
 835
 836        if (req->rd_req)
 837                prot |= IOMMU_FAULT_PERM_READ;
 838        if (req->wr_req)
 839                prot |= IOMMU_FAULT_PERM_WRITE;
 840        if (req->exe_req)
 841                prot |= IOMMU_FAULT_PERM_EXEC;
 842        if (req->pm_req)
 843                prot |= IOMMU_FAULT_PERM_PRIV;
 844
 845        return prot;
 846}
 847
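/*
 * Convert a hardware page request descriptor into an iommu_fault_event and
 * forward it to the generic fault reporting framework, which delivers it to
 * the registered handler (e.g. the IOPF workqueue).
 */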
 848static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev,
 849                                struct page_req_dsc *desc)
 850{
 851        struct iommu_fault_event event;
 852
 853        if (!dev || !dev_is_pci(dev))
 854                return -ENODEV;
 855
 856        /* Fill in event data for device specific processing */
 857        memset(&event, 0, sizeof(struct iommu_fault_event));
 858        event.fault.type = IOMMU_FAULT_PAGE_REQ;
 859        event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT;
 860        event.fault.prm.pasid = desc->pasid;
 861        event.fault.prm.grpid = desc->prg_index;
 862        event.fault.prm.perm = prq_to_iommu_prot(desc);
 863
 864        if (desc->lpig)
 865                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
 866        if (desc->pasid_present) {
 867                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
 868                event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
 869        }
 870        if (desc->priv_data_present) {
 871                /*
  872                 * Set the last-page-in-group bit if private data is present;
  873                 * a page response is then required, just as it is for LPIG.
  874                 * iommu_report_device_fault() doesn't understand this vendor-
  875                 * specific requirement, thus we set last_page as a workaround.
 876                 */
 877                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
 878                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
 879                event.fault.prm.private_data[0] = desc->priv_data[0];
 880                event.fault.prm.private_data[1] = desc->priv_data[1];
 881        } else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) {
 882                /*
  883                 * If the private data fields are not used by hardware, use them
  884                 * to monitor the PRQ handling latency.
 885                 */
 886                event.fault.prm.private_data[0] = ktime_to_ns(ktime_get());
 887        }
 888
 889        return iommu_report_device_fault(dev, &event);
 890}
 891
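/*
 * Log a malformed page request and, when the descriptor demands a response
 * (LPIG set or private data present), send a page group response carrying
 * the given result code.
 */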
 892static void handle_bad_prq_event(struct intel_iommu *iommu,
 893                                 struct page_req_dsc *req, int result)
 894{
 895        struct qi_desc desc;
 896
 897        pr_err("%s: Invalid page request: %08llx %08llx\n",
 898               iommu->name, ((unsigned long long *)req)[0],
 899               ((unsigned long long *)req)[1]);
 900
 901        /*
 902         * Per VT-d spec. v3.0 ch7.7, system software must
 903         * respond with page group response if private data
 904         * is present (PDP) or last page in group (LPIG) bit
 905         * is set. This is an additional VT-d feature beyond
 906         * PCI ATS spec.
 907         */
 908        if (!req->lpig && !req->priv_data_present)
 909                return;
 910
 911        desc.qw0 = QI_PGRP_PASID(req->pasid) |
 912                        QI_PGRP_DID(req->rid) |
 913                        QI_PGRP_PASID_P(req->pasid_present) |
 914                        QI_PGRP_PDP(req->priv_data_present) |
 915                        QI_PGRP_RESP_CODE(result) |
 916                        QI_PGRP_RESP_TYPE;
 917        desc.qw1 = QI_PGRP_IDX(req->prg_index) |
 918                        QI_PGRP_LPIG(req->lpig);
 919
 920        if (req->priv_data_present) {
 921                desc.qw2 = req->priv_data[0];
 922                desc.qw3 = req->priv_data[1];
 923        } else {
 924                desc.qw2 = 0;
 925                desc.qw3 = 0;
 926        }
 927
 928        qi_submit_sync(iommu, &desc, 1, 0);
 929}
 930
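/*
 * Threaded interrupt handler for the page request queue: walk the
 * descriptors between head and tail, validate each request, report it
 * through the I/O page fault framework, then advance the head pointer and
 * handle any queue overflow condition.
 */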
 931static irqreturn_t prq_event_thread(int irq, void *d)
 932{
 933        struct intel_svm_dev *sdev = NULL;
 934        struct intel_iommu *iommu = d;
 935        struct intel_svm *svm = NULL;
 936        struct page_req_dsc *req;
 937        int head, tail, handled;
 938        u64 address;
 939
 940        /*
 941         * Clear PPR bit before reading head/tail registers, to ensure that
 942         * we get a new interrupt if needed.
 943         */
 944        writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
 945
 946        tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
 947        head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
 948        handled = (head != tail);
 949        while (head != tail) {
 950                req = &iommu->prq[head / sizeof(*req)];
 951                address = (u64)req->addr << VTD_PAGE_SHIFT;
 952
 953                if (unlikely(!req->pasid_present)) {
 954                        pr_err("IOMMU: %s: Page request without PASID\n",
 955                               iommu->name);
 956bad_req:
 957                        svm = NULL;
 958                        sdev = NULL;
 959                        handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
 960                        goto prq_advance;
 961                }
 962
 963                if (unlikely(!is_canonical_address(address))) {
 964                        pr_err("IOMMU: %s: Address is not canonical\n",
 965                               iommu->name);
 966                        goto bad_req;
 967                }
 968
 969                if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) {
 970                        pr_err("IOMMU: %s: Page request in Privilege Mode\n",
 971                               iommu->name);
 972                        goto bad_req;
 973                }
 974
 975                if (unlikely(req->exe_req && req->rd_req)) {
 976                        pr_err("IOMMU: %s: Execution request not supported\n",
 977                               iommu->name);
 978                        goto bad_req;
 979                }
 980
 981                if (!svm || svm->pasid != req->pasid) {
 982                        /*
 983                         * It can't go away, because the driver is not permitted
 984                         * to unbind the mm while any page faults are outstanding.
 985                         */
 986                        svm = pasid_private_find(req->pasid);
 987                        if (IS_ERR_OR_NULL(svm) || (svm->flags & SVM_FLAG_SUPERVISOR_MODE))
 988                                goto bad_req;
 989                }
 990
 991                if (!sdev || sdev->sid != req->rid) {
 992                        sdev = svm_lookup_device_by_sid(svm, req->rid);
 993                        if (!sdev)
 994                                goto bad_req;
 995                }
 996
 997                sdev->prq_seq_number++;
 998
 999                /*
 1000                 * If the prq is to be handled outside the iommu driver by the
 1001                 * receiver of the fault notification, we skip the page response here.
1002                 */
1003                if (intel_svm_prq_report(iommu, sdev->dev, req))
1004                        handle_bad_prq_event(iommu, req, QI_RESP_INVALID);
1005
1006                trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1,
1007                                 req->priv_data[0], req->priv_data[1],
1008                                 sdev->prq_seq_number);
1009prq_advance:
1010                head = (head + sizeof(*req)) & PRQ_RING_MASK;
1011        }
1012
1013        dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
1014
1015        /*
1016         * Clear the page request overflow bit and wake up all threads that
1017         * are waiting for the completion of this handling.
1018         */
1019        if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
1020                pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n",
1021                                    iommu->name);
1022                head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
1023                tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
1024                if (head == tail) {
1025                        iopf_queue_discard_partial(iommu->iopf_queue);
1026                        writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
1027                        pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared",
1028                                            iommu->name);
1029                }
1030        }
1031
1032        if (!completion_done(&iommu->prq_complete))
1033                complete(&iommu->prq_complete);
1034
1035        return IRQ_RETVAL(handled);
1036}
1037
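/*
 * SVA bind entry point exposed through the iommu_ops. Allocates a PASID for
 * @mm (init_mm is used for SVM_FLAG_SUPERVISOR_MODE) and binds it to @dev
 * under pasid_mutex.
 */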
1038struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
1039{
1040        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
1041        unsigned int flags = 0;
1042        struct iommu_sva *sva;
1043        int ret;
1044
1045        if (drvdata)
1046                flags = *(unsigned int *)drvdata;
1047
1048        if (flags & SVM_FLAG_SUPERVISOR_MODE) {
1049                if (!ecap_srs(iommu->ecap)) {
1050                        dev_err(dev, "%s: Supervisor PASID not supported\n",
1051                                iommu->name);
1052                        return ERR_PTR(-EOPNOTSUPP);
1053                }
1054
1055                if (mm) {
1056                        dev_err(dev, "%s: Supervisor PASID with user provided mm\n",
1057                                iommu->name);
1058                        return ERR_PTR(-EINVAL);
1059                }
1060
1061                mm = &init_mm;
1062        }
1063
1064        mutex_lock(&pasid_mutex);
1065        ret = intel_svm_alloc_pasid(dev, mm, flags);
1066        if (ret) {
1067                mutex_unlock(&pasid_mutex);
1068                return ERR_PTR(ret);
1069        }
1070
1071        sva = intel_svm_bind_mm(iommu, dev, mm, flags);
1072        if (IS_ERR_OR_NULL(sva))
1073                intel_svm_free_pasid(mm);
1074        mutex_unlock(&pasid_mutex);
1075
1076        return sva;
1077}
1078
1079void intel_svm_unbind(struct iommu_sva *sva)
1080{
1081        struct intel_svm_dev *sdev = to_intel_svm_dev(sva);
1082
1083        mutex_lock(&pasid_mutex);
1084        intel_svm_unbind_mm(sdev->dev, sdev->pasid);
1085        mutex_unlock(&pasid_mutex);
1086}
1087
1088u32 intel_svm_get_pasid(struct iommu_sva *sva)
1089{
1090        struct intel_svm_dev *sdev;
1091        u32 pasid;
1092
1093        mutex_lock(&pasid_mutex);
1094        sdev = to_intel_svm_dev(sva);
1095        pasid = sdev->pasid;
1096        mutex_unlock(&pasid_mutex);
1097
1098        return pasid;
1099}
1100
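/*
 * Page response callback: validate the response from the fault handler
 * against the original page request and, when the request carried LPIG or
 * private data, send the page group response descriptor to hardware.
 */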
1101int intel_svm_page_response(struct device *dev,
1102                            struct iommu_fault_event *evt,
1103                            struct iommu_page_response *msg)
1104{
1105        struct iommu_fault_page_request *prm;
1106        struct intel_svm_dev *sdev = NULL;
1107        struct intel_svm *svm = NULL;
1108        struct intel_iommu *iommu;
1109        bool private_present;
1110        bool pasid_present;
1111        bool last_page;
1112        u8 bus, devfn;
1113        int ret = 0;
1114        u16 sid;
1115
1116        if (!dev || !dev_is_pci(dev))
1117                return -ENODEV;
1118
1119        iommu = device_to_iommu(dev, &bus, &devfn);
1120        if (!iommu)
1121                return -ENODEV;
1122
1123        if (!msg || !evt)
1124                return -EINVAL;
1125
1126        mutex_lock(&pasid_mutex);
1127
1128        prm = &evt->fault.prm;
1129        sid = PCI_DEVID(bus, devfn);
1130        pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
1131        private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
1132        last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
1133
1134        if (!pasid_present) {
1135                ret = -EINVAL;
1136                goto out;
1137        }
1138
1139        if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
1140                ret = -EINVAL;
1141                goto out;
1142        }
1143
1144        ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
1145        if (ret || !sdev) {
1146                ret = -ENODEV;
1147                goto out;
1148        }
1149
1150        /*
 1151         * For responses from userspace, we need to make sure that the
 1152         * pasid has been bound to its mm.
1153         */
1154        if (svm->flags & SVM_FLAG_GUEST_MODE) {
1155                struct mm_struct *mm;
1156
1157                mm = get_task_mm(current);
1158                if (!mm) {
1159                        ret = -EINVAL;
1160                        goto out;
1161                }
1162
1163                if (mm != svm->mm) {
1164                        ret = -ENODEV;
1165                        mmput(mm);
1166                        goto out;
1167                }
1168
1169                mmput(mm);
1170        }
1171
1172        /*
1173         * Per VT-d spec. v3.0 ch7.7, system software must respond
1174         * with page group response if private data is present (PDP)
1175         * or last page in group (LPIG) bit is set. This is an
1176         * additional VT-d requirement beyond PCI ATS spec.
1177         */
1178        if (last_page || private_present) {
1179                struct qi_desc desc;
1180
1181                desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
1182                                QI_PGRP_PASID_P(pasid_present) |
1183                                QI_PGRP_PDP(private_present) |
1184                                QI_PGRP_RESP_CODE(msg->code) |
1185                                QI_PGRP_RESP_TYPE;
1186                desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
1187                desc.qw2 = 0;
1188                desc.qw3 = 0;
1189
1190                if (private_present) {
1191                        desc.qw2 = prm->private_data[0];
1192                        desc.qw3 = prm->private_data[1];
1193                } else if (prm->private_data[0]) {
1194                        dmar_latency_update(iommu, DMAR_LATENCY_PRQ,
1195                                ktime_to_ns(ktime_get()) - prm->private_data[0]);
1196                }
1197
1198                qi_submit_sync(iommu, &desc, 1, 0);
1199        }
1200out:
1201        mutex_unlock(&pasid_mutex);
1202        return ret;
1203}
1204