linux/drivers/iommu/intel/svm.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2015 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>
   6 */
   7
   8#include <linux/intel-iommu.h>
   9#include <linux/mmu_notifier.h>
  10#include <linux/sched.h>
  11#include <linux/sched/mm.h>
  12#include <linux/slab.h>
  13#include <linux/intel-svm.h>
  14#include <linux/rculist.h>
  15#include <linux/pci.h>
  16#include <linux/pci-ats.h>
  17#include <linux/dmar.h>
  18#include <linux/interrupt.h>
  19#include <linux/mm_types.h>
  20#include <linux/ioasid.h>
  21#include <asm/page.h>
  22
  23#include "pasid.h"
  24
  25static irqreturn_t prq_event_thread(int irq, void *d);
  26static void intel_svm_drain_prq(struct device *dev, int pasid);
  27
  28#define PRQ_ORDER 0
  29
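/*
 * Allocate the page request queue for this IOMMU, request the threaded
 * IRQ that services it, and program the queue base and size into the
 * PQA register (with head and tail reset to zero).
 */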
  30int intel_svm_enable_prq(struct intel_iommu *iommu)
  31{
  32        struct page *pages;
  33        int irq, ret;
  34
  35        pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
  36        if (!pages) {
  37                pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
  38                        iommu->name);
  39                return -ENOMEM;
  40        }
  41        iommu->prq = page_address(pages);
  42
  43        irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
  44        if (irq <= 0) {
  45                pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
  46                       iommu->name);
  47                ret = -EINVAL;
  48        err:
  49                free_pages((unsigned long)iommu->prq, PRQ_ORDER);
  50                iommu->prq = NULL;
  51                return ret;
  52        }
  53        iommu->pr_irq = irq;
  54
  55        snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
  56
  57        ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
  58                                   iommu->prq_name, iommu);
  59        if (ret) {
  60                pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
  61                       iommu->name);
  62                dmar_free_hwirq(irq);
  63                iommu->pr_irq = 0;
  64                goto err;
  65        }
  66        dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
  67        dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
  68        dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
  69
  70        init_completion(&iommu->prq_complete);
  71
  72        return 0;
  73}
  74
  75int intel_svm_finish_prq(struct intel_iommu *iommu)
  76{
  77        dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
  78        dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
  79        dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
  80
  81        if (iommu->pr_irq) {
  82                free_irq(iommu->pr_irq, iommu);
  83                dmar_free_hwirq(iommu->pr_irq);
  84                iommu->pr_irq = 0;
  85        }
  86
  87        free_pages((unsigned long)iommu->prq, PRQ_ORDER);
  88        iommu->prq = NULL;
  89
  90        return 0;
  91}
  92
  93static inline bool intel_svm_capable(struct intel_iommu *iommu)
  94{
  95        return iommu->flags & VTD_FLAG_SVM_CAPABLE;
  96}
  97
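/*
 * Mark the IOMMU as SVM-capable only if it supports PASIDs and its
 * first-level translation capabilities (1GB pages, 5-level paging) match
 * what the CPU may hand it through shared page tables.
 */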
  98void intel_svm_check(struct intel_iommu *iommu)
  99{
 100        if (!pasid_supported(iommu))
 101                return;
 102
 103        if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
 104            !cap_fl1gp_support(iommu->cap)) {
 105                pr_err("%s SVM disabled, incompatible 1GB page capability\n",
 106                       iommu->name);
 107                return;
 108        }
 109
 110        if (cpu_feature_enabled(X86_FEATURE_LA57) &&
 111            !cap_5lp_support(iommu->cap)) {
 112                pr_err("%s SVM disabled, incompatible paging mode\n",
 113                       iommu->name);
 114                return;
 115        }
 116
 117        iommu->flags |= VTD_FLAG_SVM_CAPABLE;
 118}
 119
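/*
 * Invalidate the IOTLB, and the device TLB when ATS is enabled, for a range
 * of @pages pages starting at @address in @svm's address space on @sdev.
 * pages == -1 means flush everything tagged with the PASID.
 */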
  120static void intel_flush_svm_range_dev(struct intel_svm *svm, struct intel_svm_dev *sdev,
 121                                unsigned long address, unsigned long pages, int ih)
 122{
 123        struct qi_desc desc;
 124
 125        if (pages == -1) {
 126                desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
 127                        QI_EIOTLB_DID(sdev->did) |
 128                        QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
 129                        QI_EIOTLB_TYPE;
 130                desc.qw1 = 0;
 131        } else {
 132                int mask = ilog2(__roundup_pow_of_two(pages));
 133
 134                desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
 135                                QI_EIOTLB_DID(sdev->did) |
 136                                QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
 137                                QI_EIOTLB_TYPE;
 138                desc.qw1 = QI_EIOTLB_ADDR(address) |
 139                                QI_EIOTLB_IH(ih) |
 140                                QI_EIOTLB_AM(mask);
 141        }
 142        desc.qw2 = 0;
 143        desc.qw3 = 0;
 144        qi_submit_sync(svm->iommu, &desc, 1, 0);
 145
 146        if (sdev->dev_iotlb) {
 147                desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
 148                                QI_DEV_EIOTLB_SID(sdev->sid) |
 149                                QI_DEV_EIOTLB_QDEP(sdev->qdep) |
 150                                QI_DEIOTLB_TYPE;
 151                if (pages == -1) {
 152                        desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
 153                                        QI_DEV_EIOTLB_SIZE;
 154                } else if (pages > 1) {
 155                        /* The least significant zero bit indicates the size. So,
 156                         * for example, an "address" value of 0x12345f000 will
 157                         * flush from 0x123440000 to 0x12347ffff (256KiB). */
 158                        unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
 159                        unsigned long mask = __rounddown_pow_of_two(address ^ last);
 160
 161                        desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
 162                                        (mask - 1)) | QI_DEV_EIOTLB_SIZE;
 163                } else {
 164                        desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
 165                }
 166                desc.qw2 = 0;
 167                desc.qw3 = 0;
 168                qi_submit_sync(svm->iommu, &desc, 1, 0);
 169        }
 170}
 171
 172static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
 173                                unsigned long pages, int ih)
 174{
 175        struct intel_svm_dev *sdev;
 176
 177        rcu_read_lock();
 178        list_for_each_entry_rcu(sdev, &svm->devs, list)
 179                intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
 180        rcu_read_unlock();
 181}
 182
 183/* Pages have been freed at this point */
 184static void intel_invalidate_range(struct mmu_notifier *mn,
 185                                   struct mm_struct *mm,
 186                                   unsigned long start, unsigned long end)
 187{
 188        struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
 189
 190        intel_flush_svm_range(svm, start,
 191                              (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
 192}
 193
 194static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 195{
 196        struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
 197        struct intel_svm_dev *sdev;
 198
 199        /* This might end up being called from exit_mmap(), *before* the page
 200         * tables are cleared. And __mmu_notifier_release() will delete us from
 201         * the list of notifiers so that our invalidate_range() callback doesn't
 202         * get called when the page tables are cleared. So we need to protect
 203         * against hardware accessing those page tables.
 204         *
 205         * We do it by clearing the entry in the PASID table and then flushing
 206         * the IOTLB and the PASID table caches. This might upset hardware;
 207         * perhaps we'll want to point the PASID to a dummy PGD (like the zero
 208         * page) so that we end up taking a fault that the hardware really
 209         * *has* to handle gracefully without affecting other processes.
 210         */
 211        rcu_read_lock();
 212        list_for_each_entry_rcu(sdev, &svm->devs, list)
 213                intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
 214                                            svm->pasid, true);
 215        rcu_read_unlock();
 216
 217}
 218
 219static const struct mmu_notifier_ops intel_mmuops = {
 220        .release = intel_mm_release,
 221        .invalidate_range = intel_invalidate_range,
 222};
 223
 224static DEFINE_MUTEX(pasid_mutex);
 225static LIST_HEAD(global_svm_list);
 226
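/*
 * Iterate over svm->devs, executing the loop body only for the entry
 * whose ->dev matches @d.
 */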
 227#define for_each_svm_dev(sdev, svm, d)                  \
 228        list_for_each_entry((sdev), &(svm)->devs, list) \
 229                if ((d) != (sdev)->dev) {} else
 230
 231static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
 232                             struct intel_svm **rsvm,
 233                             struct intel_svm_dev **rsdev)
 234{
 235        struct intel_svm_dev *d, *sdev = NULL;
 236        struct intel_svm *svm;
 237
 238        /* The caller should hold the pasid_mutex lock */
 239        if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
 240                return -EINVAL;
 241
 242        if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
 243                return -EINVAL;
 244
 245        svm = ioasid_find(NULL, pasid, NULL);
 246        if (IS_ERR(svm))
 247                return PTR_ERR(svm);
 248
 249        if (!svm)
 250                goto out;
 251
 252        /*
  253         * If we found an svm for the PASID, there must be at least one
  254         * bound device.
 255         */
 256        if (WARN_ON(list_empty(&svm->devs)))
 257                return -EINVAL;
 258
 259        rcu_read_lock();
 260        list_for_each_entry_rcu(d, &svm->devs, list) {
 261                if (d->dev == dev) {
 262                        sdev = d;
 263                        break;
 264                }
 265        }
 266        rcu_read_unlock();
 267
 268out:
 269        *rsvm = svm;
 270        *rsdev = sdev;
 271
 272        return 0;
 273}
 274
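/*
 * Bind a guest PASID to @dev in nested mode: the first-level page table
 * (data->gpgd) is owned by the guest, while the second level comes from
 * @domain. The PASID itself was allocated by the caller (e.g. VFIO)
 * before this call.
 */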
 275int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
 276                          struct iommu_gpasid_bind_data *data)
 277{
 278        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
 279        struct intel_svm_dev *sdev = NULL;
 280        struct dmar_domain *dmar_domain;
 281        struct intel_svm *svm = NULL;
 282        int ret = 0;
 283
 284        if (WARN_ON(!iommu) || !data)
 285                return -EINVAL;
 286
 287        if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
 288            data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
 289                return -EINVAL;
 290
 291        if (!dev_is_pci(dev))
 292                return -ENOTSUPP;
 293
  294        /* VT-d supports devices with the full 20-bit PASID width only */
 295        if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
 296                return -EINVAL;
 297
 298        /*
  299         * We only check the host PASID range; we have no way to check
  300         * the guest PASID range.
 301         */
 302        if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
 303                return -EINVAL;
 304
 305        dmar_domain = to_dmar_domain(domain);
 306
 307        mutex_lock(&pasid_mutex);
 308        ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
 309        if (ret)
 310                goto out;
 311
 312        if (sdev) {
 313                /*
  314                 * Do not allow multiple bindings of the same device-PASID pair,
  315                 * since there is only one set of SL page tables per PASID. We may
  316                 * revisit this once sharing a PGD across domains is supported.
 317                 */
 318                dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
 319                                     svm->pasid);
 320                ret = -EBUSY;
 321                goto out;
 322        }
 323
 324        if (!svm) {
  325                /* We come here when the PASID has never been bound to a device. */
 326                svm = kzalloc(sizeof(*svm), GFP_KERNEL);
 327                if (!svm) {
 328                        ret = -ENOMEM;
 329                        goto out;
 330                }
  331                /* REVISIT: the upper layer/VFIO can track the host process that
  332                 * binds the PASID. ioasid_set = mm might be sufficient for VFIO
  333                 * to check PASID VMM ownership. We can drop the following line
  334                 * once the VFIO and IOASID set check is in place.
  335                 */
 336                svm->mm = get_task_mm(current);
 337                svm->pasid = data->hpasid;
 338                if (data->flags & IOMMU_SVA_GPASID_VAL) {
 339                        svm->gpasid = data->gpasid;
 340                        svm->flags |= SVM_FLAG_GUEST_PASID;
 341                }
 342                ioasid_set_data(data->hpasid, svm);
 343                INIT_LIST_HEAD_RCU(&svm->devs);
 344                mmput(svm->mm);
 345        }
 346        sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
 347        if (!sdev) {
 348                ret = -ENOMEM;
 349                goto out;
 350        }
 351        sdev->dev = dev;
 352
 353        /* Only count users if device has aux domains */
 354        if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
 355                sdev->users = 1;
 356
 357        /* Set up device context entry for PASID if not enabled already */
 358        ret = intel_iommu_enable_pasid(iommu, sdev->dev);
 359        if (ret) {
 360                dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
 361                kfree(sdev);
 362                goto out;
 363        }
 364
 365        /*
  366         * The PASID table is per device for better security. Therefore, for
  367         * each bind of a new device, even with an existing PASID, we need to
  368         * call the nested mode setup function here.
 369         */
 370        spin_lock(&iommu->lock);
 371        ret = intel_pasid_setup_nested(iommu, dev,
 372                                       (pgd_t *)(uintptr_t)data->gpgd,
 373                                       data->hpasid, &data->vtd, dmar_domain,
 374                                       data->addr_width);
 375        spin_unlock(&iommu->lock);
 376        if (ret) {
 377                dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
 378                                    data->hpasid, ret);
 379                /*
  380                 * The PASID entry should be in the cleared state if nested
  381                 * mode setup failed, so we only need to clear the IOASID
  382                 * tracking data so that the free call will succeed.
 383                 */
 384                kfree(sdev);
 385                goto out;
 386        }
 387
 388        svm->flags |= SVM_FLAG_GUEST_MODE;
 389
 390        init_rcu_head(&sdev->rcu);
 391        list_add_rcu(&sdev->list, &svm->devs);
 392 out:
 393        if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
 394                ioasid_set_data(data->hpasid, NULL);
 395                kfree(svm);
 396        }
 397
 398        mutex_unlock(&pasid_mutex);
 399        return ret;
 400}
 401
 402int intel_svm_unbind_gpasid(struct device *dev, int pasid)
 403{
 404        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
 405        struct intel_svm_dev *sdev;
 406        struct intel_svm *svm;
 407        int ret;
 408
 409        if (WARN_ON(!iommu))
 410                return -EINVAL;
 411
 412        mutex_lock(&pasid_mutex);
 413        ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
 414        if (ret)
 415                goto out;
 416
 417        if (sdev) {
 418                if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
 419                        sdev->users--;
 420                if (!sdev->users) {
 421                        list_del_rcu(&sdev->list);
 422                        intel_pasid_tear_down_entry(iommu, dev,
 423                                                    svm->pasid, false);
 424                        intel_svm_drain_prq(dev, svm->pasid);
 425                        kfree_rcu(sdev, rcu);
 426
 427                        if (list_empty(&svm->devs)) {
 428                                /*
  429                                 * We do not free the IOASID here because
  430                                 * the IOMMU driver did not allocate it.
  431                                 * Unlike native SVM, the IOASID for guest
  432                                 * use was allocated prior to the bind call.
  433                                 * In any case, if the free call comes before
  434                                 * the unbind, the IOMMU driver will be
  435                                 * notified and perform cleanup.
 436                                 */
 437                                ioasid_set_data(pasid, NULL);
 438                                kfree(svm);
 439                        }
 440                }
 441        }
 442out:
 443        mutex_unlock(&pasid_mutex);
 444        return ret;
 445}
 446
 447/* Caller must hold pasid_mutex, mm reference */
 448static int
 449intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops,
 450                  struct mm_struct *mm, struct intel_svm_dev **sd)
 451{
 452        struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
 453        struct device_domain_info *info;
 454        struct intel_svm_dev *sdev;
 455        struct intel_svm *svm = NULL;
 456        int pasid_max;
 457        int ret;
 458
 459        if (!iommu || dmar_disabled)
 460                return -EINVAL;
 461
 462        if (!intel_svm_capable(iommu))
 463                return -ENOTSUPP;
 464
 465        if (dev_is_pci(dev)) {
 466                pasid_max = pci_max_pasids(to_pci_dev(dev));
 467                if (pasid_max < 0)
 468                        return -EINVAL;
 469        } else
 470                pasid_max = 1 << 20;
 471
  472        /* Binding a supervisor PASID should have mm == NULL */
 473        if (flags & SVM_FLAG_SUPERVISOR_MODE) {
 474                if (!ecap_srs(iommu->ecap) || mm) {
 475                        pr_err("Supervisor PASID with user provided mm.\n");
 476                        return -EINVAL;
 477                }
 478        }
 479
 480        if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
 481                struct intel_svm *t;
 482
 483                list_for_each_entry(t, &global_svm_list, list) {
 484                        if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
 485                                continue;
 486
 487                        svm = t;
 488                        if (svm->pasid >= pasid_max) {
 489                                dev_warn(dev,
 490                                         "Limited PASID width. Cannot use existing PASID %d\n",
 491                                         svm->pasid);
 492                                ret = -ENOSPC;
 493                                goto out;
 494                        }
 495
 496                        /* Find the matching device in svm list */
 497                        for_each_svm_dev(sdev, svm, dev) {
 498                                if (sdev->ops != ops) {
 499                                        ret = -EBUSY;
 500                                        goto out;
 501                                }
 502                                sdev->users++;
 503                                goto success;
 504                        }
 505
 506                        break;
 507                }
 508        }
 509
 510        sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
 511        if (!sdev) {
 512                ret = -ENOMEM;
 513                goto out;
 514        }
 515        sdev->dev = dev;
 516
 517        ret = intel_iommu_enable_pasid(iommu, dev);
 518        if (ret) {
 519                kfree(sdev);
 520                goto out;
 521        }
 522
 523        info = get_domain_info(dev);
 524        sdev->did = FLPT_DEFAULT_DID;
 525        sdev->sid = PCI_DEVID(info->bus, info->devfn);
 526        if (info->ats_enabled) {
 527                sdev->dev_iotlb = 1;
 528                sdev->qdep = info->ats_qdep;
 529                if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
 530                        sdev->qdep = 0;
 531        }
 532
  533        /* Finish the setup now that we know we're keeping it */
 534        sdev->users = 1;
 535        sdev->ops = ops;
 536        init_rcu_head(&sdev->rcu);
 537
 538        if (!svm) {
 539                svm = kzalloc(sizeof(*svm), GFP_KERNEL);
 540                if (!svm) {
 541                        ret = -ENOMEM;
 542                        kfree(sdev);
 543                        goto out;
 544                }
 545                svm->iommu = iommu;
 546
 547                if (pasid_max > intel_pasid_max_id)
 548                        pasid_max = intel_pasid_max_id;
 549
 550                /* Do not use PASID 0, reserved for RID to PASID */
 551                svm->pasid = ioasid_alloc(NULL, PASID_MIN,
 552                                          pasid_max - 1, svm);
 553                if (svm->pasid == INVALID_IOASID) {
 554                        kfree(svm);
 555                        kfree(sdev);
 556                        ret = -ENOSPC;
 557                        goto out;
 558                }
 559                svm->notifier.ops = &intel_mmuops;
 560                svm->mm = mm;
 561                svm->flags = flags;
 562                INIT_LIST_HEAD_RCU(&svm->devs);
 563                INIT_LIST_HEAD(&svm->list);
 564                ret = -ENOMEM;
 565                if (mm) {
 566                        ret = mmu_notifier_register(&svm->notifier, mm);
 567                        if (ret) {
 568                                ioasid_free(svm->pasid);
 569                                kfree(svm);
 570                                kfree(sdev);
 571                                goto out;
 572                        }
 573                }
 574
 575                spin_lock(&iommu->lock);
 576                ret = intel_pasid_setup_first_level(iommu, dev,
 577                                mm ? mm->pgd : init_mm.pgd,
 578                                svm->pasid, FLPT_DEFAULT_DID,
 579                                (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
 580                                (cpu_feature_enabled(X86_FEATURE_LA57) ?
 581                                 PASID_FLAG_FL5LP : 0));
 582                spin_unlock(&iommu->lock);
 583                if (ret) {
 584                        if (mm)
 585                                mmu_notifier_unregister(&svm->notifier, mm);
 586                        ioasid_free(svm->pasid);
 587                        kfree(svm);
 588                        kfree(sdev);
 589                        goto out;
 590                }
 591
 592                list_add_tail(&svm->list, &global_svm_list);
 593        } else {
 594                /*
  595                 * Binding a new device to an existing PASID; we need to set
  596                 * up the PASID entry.
 597                 */
 598                spin_lock(&iommu->lock);
 599                ret = intel_pasid_setup_first_level(iommu, dev,
 600                                                mm ? mm->pgd : init_mm.pgd,
 601                                                svm->pasid, FLPT_DEFAULT_DID,
 602                                                (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
 603                                                (cpu_feature_enabled(X86_FEATURE_LA57) ?
 604                                                PASID_FLAG_FL5LP : 0));
 605                spin_unlock(&iommu->lock);
 606                if (ret) {
 607                        kfree(sdev);
 608                        goto out;
 609                }
 610        }
 611        list_add_rcu(&sdev->list, &svm->devs);
 612success:
 613        sdev->pasid = svm->pasid;
 614        sdev->sva.dev = dev;
 615        if (sd)
 616                *sd = sdev;
 617        ret = 0;
 618out:
 619        return ret;
 620}
 621
 622/* Caller must hold pasid_mutex */
 623static int intel_svm_unbind_mm(struct device *dev, int pasid)
 624{
 625        struct intel_svm_dev *sdev;
 626        struct intel_iommu *iommu;
 627        struct intel_svm *svm;
 628        int ret = -EINVAL;
 629
 630        iommu = device_to_iommu(dev, NULL, NULL);
 631        if (!iommu)
 632                goto out;
 633
 634        ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
 635        if (ret)
 636                goto out;
 637
 638        if (sdev) {
 639                sdev->users--;
 640                if (!sdev->users) {
 641                        list_del_rcu(&sdev->list);
 642                        /* Flush the PASID cache and IOTLB for this device.
 643                         * Note that we do depend on the hardware *not* using
 644                         * the PASID any more. Just as we depend on other
 645                         * devices never using PASIDs that they have no right
 646                         * to use. We have a *shared* PASID table, because it's
 647                         * large and has to be physically contiguous. So it's
 648                         * hard to be as defensive as we might like. */
 649                        intel_pasid_tear_down_entry(iommu, dev,
 650                                                    svm->pasid, false);
 651                        intel_svm_drain_prq(dev, svm->pasid);
 652                        kfree_rcu(sdev, rcu);
 653
 654                        if (list_empty(&svm->devs)) {
 655                                ioasid_free(svm->pasid);
 656                                if (svm->mm)
 657                                        mmu_notifier_unregister(&svm->notifier, svm->mm);
 658                                list_del(&svm->list);
 659                                /* We mandate that no page faults may be outstanding
 660                                 * for the PASID when intel_svm_unbind_mm() is called.
 661                                 * If that is not obeyed, subtle errors will happen.
 662                                 * Let's make them less subtle... */
 663                                memset(svm, 0x6b, sizeof(*svm));
 664                                kfree(svm);
 665                        }
 666                }
 667        }
 668out:
 669        return ret;
 670}
 671
 672/* Page request queue descriptor */
 673struct page_req_dsc {
 674        union {
 675                struct {
 676                        u64 type:8;
 677                        u64 pasid_present:1;
 678                        u64 priv_data_present:1;
 679                        u64 rsvd:6;
 680                        u64 rid:16;
 681                        u64 pasid:20;
 682                        u64 exe_req:1;
 683                        u64 pm_req:1;
 684                        u64 rsvd2:10;
 685                };
 686                u64 qw_0;
 687        };
 688        union {
 689                struct {
 690                        u64 rd_req:1;
 691                        u64 wr_req:1;
 692                        u64 lpig:1;
 693                        u64 prg_index:9;
 694                        u64 addr:52;
 695                };
 696                u64 qw_1;
 697        };
 698        u64 priv_data[2];
 699};
 700
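/*
 * Head and tail offsets in the page request queue are always multiples of
 * the 32-byte descriptor size, so masking with (queue size - 0x20) wraps
 * them around the ring.
 */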
 701#define PRQ_RING_MASK   ((0x1000 << PRQ_ORDER) - 0x20)
 702
 703static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
 704{
 705        unsigned long requested = 0;
 706
 707        if (req->exe_req)
 708                requested |= VM_EXEC;
 709
 710        if (req->rd_req)
 711                requested |= VM_READ;
 712
 713        if (req->wr_req)
 714                requested |= VM_WRITE;
 715
 716        return (requested & ~vma->vm_flags) != 0;
 717}
 718
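/*
 * A canonical address sign-extends cleanly through __VIRTUAL_MASK_SHIFT:
 * all bits above the implemented virtual address width equal the topmost
 * implemented bit.
 */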
 719static bool is_canonical_address(u64 addr)
 720{
 721        int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
 722        long saddr = (long) addr;
 723
 724        return (((saddr << shift) >> shift) == saddr);
 725}
 726
 727/**
 728 * intel_svm_drain_prq - Drain page requests and responses for a pasid
 729 * @dev: target device
 730 * @pasid: pasid for draining
 731 *
 732 * Drain all pending page requests and responses related to @pasid in both
 733 * software and hardware. This is supposed to be called after the device
 734 * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
 735 * and DevTLB have been invalidated.
 736 *
 737 * It waits until all pending page requests for @pasid in the page fault
 738 * queue are completed by the prq handling thread. Then follow the steps
 739 * described in VT-d spec CH7.10 to drain all page requests and page
 740 * responses pending in the hardware.
 741 */
 742static void intel_svm_drain_prq(struct device *dev, int pasid)
 743{
 744        struct device_domain_info *info;
 745        struct dmar_domain *domain;
 746        struct intel_iommu *iommu;
 747        struct qi_desc desc[3];
 748        struct pci_dev *pdev;
 749        int head, tail;
 750        u16 sid, did;
 751        int qdep;
 752
 753        info = get_domain_info(dev);
 754        if (WARN_ON(!info || !dev_is_pci(dev)))
 755                return;
 756
 757        if (!info->pri_enabled)
 758                return;
 759
 760        iommu = info->iommu;
 761        domain = info->domain;
 762        pdev = to_pci_dev(dev);
 763        sid = PCI_DEVID(info->bus, info->devfn);
 764        did = domain->iommu_did[iommu->seq_id];
 765        qdep = pci_ats_queue_depth(pdev);
 766
 767        /*
 768         * Check and wait until all pending page requests in the queue are
 769         * handled by the prq handling thread.
 770         */
 771prq_retry:
 772        reinit_completion(&iommu->prq_complete);
 773        tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
 774        head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
 775        while (head != tail) {
 776                struct page_req_dsc *req;
 777
 778                req = &iommu->prq[head / sizeof(*req)];
 779                if (!req->pasid_present || req->pasid != pasid) {
 780                        head = (head + sizeof(*req)) & PRQ_RING_MASK;
 781                        continue;
 782                }
 783
 784                wait_for_completion(&iommu->prq_complete);
 785                goto prq_retry;
 786        }
 787
 788        /*
 789         * Perform steps described in VT-d spec CH7.10 to drain page
 790         * requests and responses in hardware.
 791         */
 792        memset(desc, 0, sizeof(desc));
 793        desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
 794                        QI_IWD_FENCE |
 795                        QI_IWD_TYPE;
 796        desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
 797                        QI_EIOTLB_DID(did) |
 798                        QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
 799                        QI_EIOTLB_TYPE;
 800        desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
 801                        QI_DEV_EIOTLB_SID(sid) |
 802                        QI_DEV_EIOTLB_QDEP(qdep) |
 803                        QI_DEIOTLB_TYPE |
 804                        QI_DEV_IOTLB_PFSID(info->pfsid);
 805qi_retry:
 806        reinit_completion(&iommu->prq_complete);
 807        qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
 808        if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
 809                wait_for_completion(&iommu->prq_complete);
 810                goto qi_retry;
 811        }
 812}
 813
 814static int prq_to_iommu_prot(struct page_req_dsc *req)
 815{
 816        int prot = 0;
 817
 818        if (req->rd_req)
 819                prot |= IOMMU_FAULT_PERM_READ;
 820        if (req->wr_req)
 821                prot |= IOMMU_FAULT_PERM_WRITE;
 822        if (req->exe_req)
 823                prot |= IOMMU_FAULT_PERM_EXEC;
 824        if (req->pm_req)
 825                prot |= IOMMU_FAULT_PERM_PRIV;
 826
 827        return prot;
 828}
 829
 830static int
 831intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
 832{
 833        struct iommu_fault_event event;
 834
 835        if (!dev || !dev_is_pci(dev))
 836                return -ENODEV;
 837
 838        /* Fill in event data for device specific processing */
 839        memset(&event, 0, sizeof(struct iommu_fault_event));
 840        event.fault.type = IOMMU_FAULT_PAGE_REQ;
 841        event.fault.prm.addr = desc->addr;
 842        event.fault.prm.pasid = desc->pasid;
 843        event.fault.prm.grpid = desc->prg_index;
 844        event.fault.prm.perm = prq_to_iommu_prot(desc);
 845
 846        if (desc->lpig)
 847                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
 848        if (desc->pasid_present) {
 849                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
 850                event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
 851        }
 852        if (desc->priv_data_present) {
 853                /*
  854                 * Set the last-page-in-group bit if private data is present;
  855                 * a page response is then required, just as it is for LPIG.
  856                 * iommu_report_device_fault() doesn't understand this vendor-
  857                 * specific requirement, so we set last_page as a workaround.
 858                 */
 859                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
 860                event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
 861                memcpy(event.fault.prm.private_data, desc->priv_data,
 862                       sizeof(desc->priv_data));
 863        }
 864
 865        return iommu_report_device_fault(dev, &event);
 866}
 867
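/*
 * Threaded IRQ handler for the page request queue: walk the descriptors
 * between head and tail, resolve each fault against the bound mm (or
 * report it to the fault consumer for PASIDs in guest mode), and post a
 * page group response whenever the descriptor requires one.
 */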
 868static irqreturn_t prq_event_thread(int irq, void *d)
 869{
 870        struct intel_svm_dev *sdev = NULL;
 871        struct intel_iommu *iommu = d;
 872        struct intel_svm *svm = NULL;
 873        int head, tail, handled = 0;
 874
 875        /* Clear PPR bit before reading head/tail registers, to
 876         * ensure that we get a new interrupt if needed. */
 877        writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
 878
 879        tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
 880        head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
 881        while (head != tail) {
 882                struct vm_area_struct *vma;
 883                struct page_req_dsc *req;
 884                struct qi_desc resp;
 885                int result;
 886                vm_fault_t ret;
 887                u64 address;
 888
 889                handled = 1;
 890
 891                req = &iommu->prq[head / sizeof(*req)];
 892
 893                result = QI_RESP_FAILURE;
 894                address = (u64)req->addr << VTD_PAGE_SHIFT;
 895                if (!req->pasid_present) {
 896                        pr_err("%s: Page request without PASID: %08llx %08llx\n",
 897                               iommu->name, ((unsigned long long *)req)[0],
 898                               ((unsigned long long *)req)[1]);
 899                        goto no_pasid;
 900                }
 901
 902                if (!svm || svm->pasid != req->pasid) {
 903                        rcu_read_lock();
 904                        svm = ioasid_find(NULL, req->pasid, NULL);
 905                        /* It *can't* go away, because the driver is not permitted
 906                         * to unbind the mm while any page faults are outstanding.
  907                         * So we only need RCU to protect the internal ioasid lookup. */
 908                        rcu_read_unlock();
 909                        if (IS_ERR_OR_NULL(svm)) {
 910                                pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
 911                                       iommu->name, req->pasid, ((unsigned long long *)req)[0],
 912                                       ((unsigned long long *)req)[1]);
 913                                goto no_pasid;
 914                        }
 915                }
 916
 917                if (!sdev || sdev->sid != req->rid) {
 918                        struct intel_svm_dev *t;
 919
 920                        sdev = NULL;
 921                        rcu_read_lock();
 922                        list_for_each_entry_rcu(t, &svm->devs, list) {
 923                                if (t->sid == req->rid) {
 924                                        sdev = t;
 925                                        break;
 926                                }
 927                        }
 928                        rcu_read_unlock();
 929                }
 930
 931                result = QI_RESP_INVALID;
 932                /* Since we're using init_mm.pgd directly, we should never take
 933                 * any faults on kernel addresses. */
 934                if (!svm->mm)
 935                        goto bad_req;
 936
 937                /* If address is not canonical, return invalid response */
 938                if (!is_canonical_address(address))
 939                        goto bad_req;
 940
 941                /*
 942                 * If prq is to be handled outside iommu driver via receiver of
 943                 * the fault notifiers, we skip the page response here.
 944                 */
 945                if (svm->flags & SVM_FLAG_GUEST_MODE) {
 946                        if (sdev && !intel_svm_prq_report(sdev->dev, req))
 947                                goto prq_advance;
 948                        else
 949                                goto bad_req;
 950                }
 951
 952                /* If the mm is already defunct, don't handle faults. */
 953                if (!mmget_not_zero(svm->mm))
 954                        goto bad_req;
 955
 956                mmap_read_lock(svm->mm);
 957                vma = find_extend_vma(svm->mm, address);
 958                if (!vma || address < vma->vm_start)
 959                        goto invalid;
 960
 961                if (access_error(vma, req))
 962                        goto invalid;
 963
 964                ret = handle_mm_fault(vma, address,
 965                                      req->wr_req ? FAULT_FLAG_WRITE : 0,
 966                                      NULL);
 967                if (ret & VM_FAULT_ERROR)
 968                        goto invalid;
 969
 970                result = QI_RESP_SUCCESS;
 971invalid:
 972                mmap_read_unlock(svm->mm);
 973                mmput(svm->mm);
 974bad_req:
 975                WARN_ON(!sdev);
 976                if (sdev && sdev->ops && sdev->ops->fault_cb) {
 977                        int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
 978                                (req->exe_req << 1) | (req->pm_req);
 979                        sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
 980                                            req->priv_data, rwxp, result);
 981                }
 982                /* We get here in the error case where the PASID lookup failed,
 983                   and these can be NULL. Do not use them below this point! */
 984                sdev = NULL;
 985                svm = NULL;
 986no_pasid:
 987                if (req->lpig || req->priv_data_present) {
 988                        /*
 989                         * Per VT-d spec. v3.0 ch7.7, system software must
 990                         * respond with page group response if private data
 991                         * is present (PDP) or last page in group (LPIG) bit
 992                         * is set. This is an additional VT-d feature beyond
 993                         * PCI ATS spec.
 994                         */
 995                        resp.qw0 = QI_PGRP_PASID(req->pasid) |
 996                                QI_PGRP_DID(req->rid) |
 997                                QI_PGRP_PASID_P(req->pasid_present) |
 998                                QI_PGRP_PDP(req->pasid_present) |
 999                                QI_PGRP_RESP_CODE(result) |
1000                                QI_PGRP_RESP_TYPE;
1001                        resp.qw1 = QI_PGRP_IDX(req->prg_index) |
1002                                QI_PGRP_LPIG(req->lpig);
1003
 1004                        resp.qw2 = 0;
 1005                        resp.qw3 = 0;
 1006                        if (req->priv_data_present)
 1007                                memcpy(&resp.qw2, req->priv_data,
 1008                                       sizeof(req->priv_data));
1009                        qi_submit_sync(iommu, &resp, 1, 0);
1010                }
1011prq_advance:
1012                head = (head + sizeof(*req)) & PRQ_RING_MASK;
1013        }
1014
1015        dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
1016
1017        /*
1018         * Clear the page request overflow bit and wake up all threads that
1019         * are waiting for the completion of this handling.
1020         */
1021        if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
1022                writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
1023
1024        if (!completion_done(&iommu->prq_complete))
1025                complete(&iommu->prq_complete);
1026
1027        return IRQ_RETVAL(handled);
1028}
1029
1030#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
1031struct iommu_sva *
1032intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
1033{
1034        struct iommu_sva *sva = ERR_PTR(-EINVAL);
1035        struct intel_svm_dev *sdev = NULL;
1036        int flags = 0;
1037        int ret;
1038
1039        /*
1040         * TODO: Consolidate with generic iommu-sva bind after it is merged.
1041         * It will require shared SVM data structures, i.e. combine io_mm
1042         * and intel_svm etc.
1043         */
1044        if (drvdata)
1045                flags = *(int *)drvdata;
1046        mutex_lock(&pasid_mutex);
1047        ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
1048        if (ret)
1049                sva = ERR_PTR(ret);
1050        else if (sdev)
1051                sva = &sdev->sva;
1052        else
1053                WARN(!sdev, "SVM bind succeeded with no sdev!\n");
1054
1055        mutex_unlock(&pasid_mutex);
1056
1057        return sva;
1058}
1059
1060void intel_svm_unbind(struct iommu_sva *sva)
1061{
1062        struct intel_svm_dev *sdev;
1063
1064        mutex_lock(&pasid_mutex);
1065        sdev = to_intel_svm_dev(sva);
1066        intel_svm_unbind_mm(sdev->dev, sdev->pasid);
1067        mutex_unlock(&pasid_mutex);
1068}
1069
1070int intel_svm_get_pasid(struct iommu_sva *sva)
1071{
1072        struct intel_svm_dev *sdev;
1073        int pasid;
1074
1075        mutex_lock(&pasid_mutex);
1076        sdev = to_intel_svm_dev(sva);
1077        pasid = sdev->pasid;
1078        mutex_unlock(&pasid_mutex);
1079
1080        return pasid;
1081}
1082
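/*
 * Send a page group response for a previously reported page request,
 * typically once the consumer of the fault (e.g. a guest, via VFIO) has
 * resolved it.
 */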
1083int intel_svm_page_response(struct device *dev,
1084                            struct iommu_fault_event *evt,
1085                            struct iommu_page_response *msg)
1086{
1087        struct iommu_fault_page_request *prm;
1088        struct intel_svm_dev *sdev = NULL;
1089        struct intel_svm *svm = NULL;
1090        struct intel_iommu *iommu;
1091        bool private_present;
1092        bool pasid_present;
1093        bool last_page;
1094        u8 bus, devfn;
1095        int ret = 0;
1096        u16 sid;
1097
1098        if (!dev || !dev_is_pci(dev))
1099                return -ENODEV;
1100
1101        iommu = device_to_iommu(dev, &bus, &devfn);
1102        if (!iommu)
1103                return -ENODEV;
1104
1105        if (!msg || !evt)
1106                return -EINVAL;
1107
1108        mutex_lock(&pasid_mutex);
1109
1110        prm = &evt->fault.prm;
1111        sid = PCI_DEVID(bus, devfn);
1112        pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
1113        private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
1114        last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
1115
1116        if (!pasid_present) {
1117                ret = -EINVAL;
1118                goto out;
1119        }
1120
1121        if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
1122                ret = -EINVAL;
1123                goto out;
1124        }
1125
1126        ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
1127        if (ret || !sdev) {
1128                ret = -ENODEV;
1129                goto out;
1130        }
1131
1132        /*
 1133         * For responses from userspace, we need to make sure that the
 1134         * PASID has been bound to its mm.
1135         */
1136        if (svm->flags & SVM_FLAG_GUEST_MODE) {
1137                struct mm_struct *mm;
1138
1139                mm = get_task_mm(current);
1140                if (!mm) {
1141                        ret = -EINVAL;
1142                        goto out;
1143                }
1144
1145                if (mm != svm->mm) {
1146                        ret = -ENODEV;
1147                        mmput(mm);
1148                        goto out;
1149                }
1150
1151                mmput(mm);
1152        }
1153
1154        /*
1155         * Per VT-d spec. v3.0 ch7.7, system software must respond
1156         * with page group response if private data is present (PDP)
1157         * or last page in group (LPIG) bit is set. This is an
1158         * additional VT-d requirement beyond PCI ATS spec.
1159         */
1160        if (last_page || private_present) {
1161                struct qi_desc desc;
1162
1163                desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
1164                                QI_PGRP_PASID_P(pasid_present) |
1165                                QI_PGRP_PDP(private_present) |
1166                                QI_PGRP_RESP_CODE(msg->code) |
1167                                QI_PGRP_RESP_TYPE;
1168                desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
1169                desc.qw2 = 0;
1170                desc.qw3 = 0;
1171                if (private_present)
1172                        memcpy(&desc.qw2, prm->private_data,
1173                               sizeof(prm->private_data));
1174
1175                qi_submit_sync(iommu, &desc, 1, 0);
1176        }
1177out:
1178        mutex_unlock(&pasid_mutex);
1179        return ret;
1180}
1181