linux/drivers/iommu/intel/iommu.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/init.h>
  17#include <linux/bitmap.h>
  18#include <linux/debugfs.h>
  19#include <linux/export.h>
  20#include <linux/slab.h>
  21#include <linux/irq.h>
  22#include <linux/interrupt.h>
  23#include <linux/spinlock.h>
  24#include <linux/pci.h>
  25#include <linux/dmar.h>
  26#include <linux/dma-map-ops.h>
  27#include <linux/mempool.h>
  28#include <linux/memory.h>
  29#include <linux/cpu.h>
  30#include <linux/timer.h>
  31#include <linux/io.h>
  32#include <linux/iova.h>
  33#include <linux/iommu.h>
  34#include <linux/dma-iommu.h>
  35#include <linux/intel-iommu.h>
  36#include <linux/intel-svm.h>
  37#include <linux/syscore_ops.h>
  38#include <linux/tboot.h>
  39#include <linux/dmi.h>
  40#include <linux/pci-ats.h>
  41#include <linux/memblock.h>
  42#include <linux/dma-direct.h>
  43#include <linux/crash_dump.h>
  44#include <linux/numa.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#include "../irq_remapping.h"
  50#include "../iommu-sva-lib.h"
  51#include "pasid.h"
  52#include "cap_audit.h"
  53
  54#define ROOT_SIZE               VTD_PAGE_SIZE
  55#define CONTEXT_SIZE            VTD_PAGE_SIZE
  56
  57#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  58#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  59#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  60#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  61
  62#define IOAPIC_RANGE_START      (0xfee00000)
  63#define IOAPIC_RANGE_END        (0xfeefffff)
  64#define IOVA_START_ADDR         (0x1000)
  65
  66#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  67
  68#define MAX_AGAW_WIDTH 64
  69#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  70
  71#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
  72#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
  73
  74/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  75   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  76#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  77                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  78#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
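
     /*
      * For example, with the default 57-bit address width, DOMAIN_MAX_PFN()
      * evaluates to (1ULL << 45) - 1 on 64-bit builds, i.e. the highest
      * 4KiB VT-d page frame a domain can address.
      */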
  79
  80/* IO virtual address start page frame number */
  81#define IOVA_START_PFN          (1)
  82
  83#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  84
  85/* page table handling */
  86#define LEVEL_STRIDE            (9)
  87#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  88
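     /*
      * AGAW encodes the adjusted guest address width in 9-bit steps above a
      * 30-bit base: agaw 0..4 maps to 30/39/48/57/64-bit widths (the last
      * capped at MAX_AGAW_WIDTH) and to 2..6 page-table levels. For example,
      * width_to_agaw(48) == 2 and agaw_to_level(2) == 4.
      */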
  89static inline int agaw_to_level(int agaw)
  90{
  91        return agaw + 2;
  92}
  93
  94static inline int agaw_to_width(int agaw)
  95{
  96        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
  97}
  98
  99static inline int width_to_agaw(int width)
 100{
 101        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 102}
 103
 104static inline unsigned int level_to_offset_bits(int level)
 105{
 106        return (level - 1) * LEVEL_STRIDE;
 107}
 108
 109static inline int pfn_level_offset(u64 pfn, int level)
 110{
 111        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 112}
 113
 114static inline u64 level_mask(int level)
 115{
 116        return -1ULL << level_to_offset_bits(level);
 117}
 118
 119static inline u64 level_size(int level)
 120{
 121        return 1ULL << level_to_offset_bits(level);
 122}
 123
 124static inline u64 align_to_level(u64 pfn, int level)
 125{
 126        return (pfn + level_size(level) - 1) & level_mask(level);
 127}
 128
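     /*
      * Each paging level covers 9 address bits: a level-1 PTE maps a 4KiB
      * page, a level-2 superpage maps 2MiB and a level-3 superpage maps 1GiB.
      * For example, lvl_to_nr_pages(2) == 512 VT-d pages == 2MiB.
      */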
 129static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 130{
 131        return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 132}
 133
  134/* VT-d pages must never be _larger_ than MM pages. Otherwise things
 135   are never going to work. */
 136static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 137{
 138        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 139}
 140
 141static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 142{
 143        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 144}
 145static inline unsigned long page_to_dma_pfn(struct page *pg)
 146{
 147        return mm_to_dma_pfn(page_to_pfn(pg));
 148}
 149static inline unsigned long virt_to_dma_pfn(void *p)
 150{
 151        return page_to_dma_pfn(virt_to_page(p));
 152}
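
     /*
      * With 4KiB CPU pages (as on x86) PAGE_SHIFT == VTD_PAGE_SHIFT, so the
      * conversions above are no-ops; they only matter when the CPU page size
      * is larger than the 4KiB VT-d page size.
      */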
 153
 154/* global iommu list, set NULL for ignored DMAR units */
 155static struct intel_iommu **g_iommus;
 156
 157static void __init check_tylersburg_isoch(void);
 158static int rwbf_quirk;
 159
 160/*
  161 * Set to 1 to panic the kernel if VT-d can't be successfully enabled
  162 * (used when the kernel is launched with TXT).
 163 */
 164static int force_on = 0;
 165static int intel_iommu_tboot_noforce;
 166static int no_platform_optin;
 167
 168#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 169
 170/*
 171 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 172 * if marked present.
 173 */
 174static phys_addr_t root_entry_lctp(struct root_entry *re)
 175{
 176        if (!(re->lo & 1))
 177                return 0;
 178
 179        return re->lo & VTD_PAGE_MASK;
 180}
 181
 182/*
 183 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 184 * if marked present.
 185 */
 186static phys_addr_t root_entry_uctp(struct root_entry *re)
 187{
 188        if (!(re->hi & 1))
 189                return 0;
 190
 191        return re->hi & VTD_PAGE_MASK;
 192}
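
     /*
      * In scalable mode a root entry is split in two: the lower half (LCTP)
      * points to the context table for devfn 0x00-0x7f and the upper half
      * (UCTP) to the one for devfn 0x80-0xff; see iommu_context_addr().
      */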
 193
 194static inline void context_clear_pasid_enable(struct context_entry *context)
 195{
 196        context->lo &= ~(1ULL << 11);
 197}
 198
 199static inline bool context_pasid_enabled(struct context_entry *context)
 200{
 201        return !!(context->lo & (1ULL << 11));
 202}
 203
 204static inline void context_set_copied(struct context_entry *context)
 205{
 206        context->hi |= (1ull << 3);
 207}
 208
 209static inline bool context_copied(struct context_entry *context)
 210{
 211        return !!(context->hi & (1ULL << 3));
 212}
 213
 214static inline bool __context_present(struct context_entry *context)
 215{
 216        return (context->lo & 1);
 217}
 218
 219bool context_present(struct context_entry *context)
 220{
 221        return context_pasid_enabled(context) ?
 222             __context_present(context) :
 223             __context_present(context) && !context_copied(context);
 224}
 225
 226static inline void context_set_present(struct context_entry *context)
 227{
 228        context->lo |= 1;
 229}
 230
 231static inline void context_set_fault_enable(struct context_entry *context)
 232{
 233        context->lo &= (((u64)-1) << 2) | 1;
 234}
 235
 236static inline void context_set_translation_type(struct context_entry *context,
 237                                                unsigned long value)
 238{
 239        context->lo &= (((u64)-1) << 4) | 3;
 240        context->lo |= (value & 3) << 2;
 241}
 242
 243static inline void context_set_address_root(struct context_entry *context,
 244                                            unsigned long value)
 245{
 246        context->lo &= ~VTD_PAGE_MASK;
 247        context->lo |= value & VTD_PAGE_MASK;
 248}
 249
 250static inline void context_set_address_width(struct context_entry *context,
 251                                             unsigned long value)
 252{
 253        context->hi |= value & 7;
 254}
 255
 256static inline void context_set_domain_id(struct context_entry *context,
 257                                         unsigned long value)
 258{
 259        context->hi |= (value & ((1 << 16) - 1)) << 8;
 260}
 261
 262static inline int context_domain_id(struct context_entry *c)
 263{
  264        return (c->hi >> 8) & 0xffff;
 265}
 266
 267static inline void context_clear_entry(struct context_entry *context)
 268{
 269        context->lo = 0;
 270        context->hi = 0;
 271}
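
     /*
      * The helpers above encode a legacy-mode context entry: lo[0] is the
      * present bit, lo[3:2] the translation type and lo[63:12] the
      * second-level page-table root; hi[2:0] holds the address width (AGAW)
      * and hi[23:8] the domain id.
      */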
 272
 273/*
  274 * This domain is a static identity mapping domain.
  275 *      1. This domain creates a static 1:1 mapping to all usable memory.
  276 *      2. It maps to each iommu if successful.
  277 *      3. Each iommu maps to this domain if successful.
 278 */
 279static struct dmar_domain *si_domain;
 280static int hw_pass_through = 1;
 281
 282#define for_each_domain_iommu(idx, domain)                      \
 283        for (idx = 0; idx < g_num_of_iommus; idx++)             \
 284                if (domain->iommu_refcnt[idx])
 285
 286struct dmar_rmrr_unit {
 287        struct list_head list;          /* list of rmrr units   */
 288        struct acpi_dmar_header *hdr;   /* ACPI header          */
 289        u64     base_address;           /* reserved base address*/
 290        u64     end_address;            /* reserved end address */
 291        struct dmar_dev_scope *devices; /* target devices */
 292        int     devices_cnt;            /* target device count */
 293};
 294
 295struct dmar_atsr_unit {
 296        struct list_head list;          /* list of ATSR units */
 297        struct acpi_dmar_header *hdr;   /* ACPI header */
 298        struct dmar_dev_scope *devices; /* target devices */
 299        int devices_cnt;                /* target device count */
 300        u8 include_all:1;               /* include all ports */
 301};
 302
 303struct dmar_satc_unit {
 304        struct list_head list;          /* list of SATC units */
 305        struct acpi_dmar_header *hdr;   /* ACPI header */
 306        struct dmar_dev_scope *devices; /* target devices */
 307        struct intel_iommu *iommu;      /* the corresponding iommu */
 308        int devices_cnt;                /* target device count */
 309        u8 atc_required:1;              /* ATS is required */
 310};
 311
 312static LIST_HEAD(dmar_atsr_units);
 313static LIST_HEAD(dmar_rmrr_units);
 314static LIST_HEAD(dmar_satc_units);
 315
 316#define for_each_rmrr_units(rmrr) \
 317        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 318
  319/* number of registered IOMMUs, used to size the g_iommus array */
 320static int g_num_of_iommus;
 321
 322static void domain_exit(struct dmar_domain *domain);
 323static void domain_remove_dev_info(struct dmar_domain *domain);
 324static void dmar_remove_one_dev_info(struct device *dev);
 325static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 326static int intel_iommu_attach_device(struct iommu_domain *domain,
 327                                     struct device *dev);
 328static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 329                                            dma_addr_t iova);
 330
 331int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
 332int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
 333
 334int intel_iommu_enabled = 0;
 335EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 336
 337static int dmar_map_gfx = 1;
 338static int intel_iommu_superpage = 1;
 339static int iommu_identity_mapping;
 340static int iommu_skip_te_disable;
 341
 342#define IDENTMAP_GFX            2
 343#define IDENTMAP_AZALIA         4
 344
 345int intel_iommu_gfx_mapped;
 346EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 347
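     /*
      * Sentinel stored in the device's IOMMU private data while domain
      * attachment is deferred; see attach_deferred() and get_domain_info().
      */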
 348#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
 349struct device_domain_info *get_domain_info(struct device *dev)
 350{
 351        struct device_domain_info *info;
 352
 353        if (!dev)
 354                return NULL;
 355
 356        info = dev_iommu_priv_get(dev);
 357        if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
 358                return NULL;
 359
 360        return info;
 361}
 362
 363DEFINE_SPINLOCK(device_domain_lock);
 364static LIST_HEAD(device_domain_list);
 365
 366/*
 367 * Iterate over elements in device_domain_list and call the specified
 368 * callback @fn against each element.
 369 */
 370int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 371                                     void *data), void *data)
 372{
 373        int ret = 0;
 374        unsigned long flags;
 375        struct device_domain_info *info;
 376
 377        spin_lock_irqsave(&device_domain_lock, flags);
 378        list_for_each_entry(info, &device_domain_list, global) {
 379                ret = fn(info, data);
 380                if (ret) {
 381                        spin_unlock_irqrestore(&device_domain_lock, flags);
 382                        return ret;
 383                }
 384        }
 385        spin_unlock_irqrestore(&device_domain_lock, flags);
 386
 387        return 0;
 388}
 389
 390const struct iommu_ops intel_iommu_ops;
 391
 392static bool translation_pre_enabled(struct intel_iommu *iommu)
 393{
 394        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 395}
 396
 397static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 398{
 399        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 400}
 401
 402static void init_translation_status(struct intel_iommu *iommu)
 403{
 404        u32 gsts;
 405
 406        gsts = readl(iommu->reg + DMAR_GSTS_REG);
 407        if (gsts & DMA_GSTS_TES)
 408                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 409}
 410
 411static int __init intel_iommu_setup(char *str)
 412{
 413        if (!str)
 414                return -EINVAL;
 415        while (*str) {
 416                if (!strncmp(str, "on", 2)) {
 417                        dmar_disabled = 0;
 418                        pr_info("IOMMU enabled\n");
 419                } else if (!strncmp(str, "off", 3)) {
 420                        dmar_disabled = 1;
 421                        no_platform_optin = 1;
 422                        pr_info("IOMMU disabled\n");
 423                } else if (!strncmp(str, "igfx_off", 8)) {
 424                        dmar_map_gfx = 0;
 425                        pr_info("Disable GFX device mapping\n");
 426                } else if (!strncmp(str, "forcedac", 8)) {
 427                        pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
 428                        iommu_dma_forcedac = true;
 429                } else if (!strncmp(str, "strict", 6)) {
 430                        pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
 431                        iommu_set_dma_strict();
 432                } else if (!strncmp(str, "sp_off", 6)) {
 433                        pr_info("Disable supported super page\n");
 434                        intel_iommu_superpage = 0;
 435                } else if (!strncmp(str, "sm_on", 5)) {
 436                        pr_info("Enable scalable mode if hardware supports\n");
 437                        intel_iommu_sm = 1;
 438                } else if (!strncmp(str, "sm_off", 6)) {
 439                        pr_info("Scalable mode is disallowed\n");
 440                        intel_iommu_sm = 0;
 441                } else if (!strncmp(str, "tboot_noforce", 13)) {
 442                        pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 443                        intel_iommu_tboot_noforce = 1;
 444                }
 445
 446                str += strcspn(str, ",");
 447                while (*str == ',')
 448                        str++;
 449        }
 450        return 0;
 451}
 452__setup("intel_iommu=", intel_iommu_setup);
 453
 454static struct kmem_cache *iommu_domain_cache;
 455static struct kmem_cache *iommu_devinfo_cache;
 456
 457static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 458{
 459        struct dmar_domain **domains;
 460        int idx = did >> 8;
 461
 462        domains = iommu->domains[idx];
 463        if (!domains)
 464                return NULL;
 465
 466        return domains[did & 0xff];
 467}
 468
 469static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 470                             struct dmar_domain *domain)
 471{
 472        struct dmar_domain **domains;
 473        int idx = did >> 8;
 474
 475        if (!iommu->domains[idx]) {
 476                size_t size = 256 * sizeof(struct dmar_domain *);
 477                iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 478        }
 479
 480        domains = iommu->domains[idx];
  481        if (WARN_ON(!domains))
  482                return;
  483
  484        domains[did & 0xff] = domain;
 485}
 486
 487void *alloc_pgtable_page(int node)
 488{
 489        struct page *page;
 490        void *vaddr = NULL;
 491
 492        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 493        if (page)
 494                vaddr = page_address(page);
 495        return vaddr;
 496}
 497
 498void free_pgtable_page(void *vaddr)
 499{
 500        free_page((unsigned long)vaddr);
 501}
 502
 503static inline void *alloc_domain_mem(void)
 504{
 505        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 506}
 507
 508static void free_domain_mem(void *vaddr)
 509{
 510        kmem_cache_free(iommu_domain_cache, vaddr);
 511}
 512
  513static inline void *alloc_devinfo_mem(void)
 514{
 515        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 516}
 517
 518static inline void free_devinfo_mem(void *vaddr)
 519{
 520        kmem_cache_free(iommu_devinfo_cache, vaddr);
 521}
 522
 523static inline int domain_type_is_si(struct dmar_domain *domain)
 524{
 525        return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 526}
 527
 528static inline bool domain_use_first_level(struct dmar_domain *domain)
 529{
 530        return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
 531}
 532
 533static inline int domain_pfn_supported(struct dmar_domain *domain,
 534                                       unsigned long pfn)
 535{
 536        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 537
 538        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 539}
 540
 541static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 542{
 543        unsigned long sagaw;
 544        int agaw;
 545
 546        sagaw = cap_sagaw(iommu->cap);
 547        for (agaw = width_to_agaw(max_gaw);
 548             agaw >= 0; agaw--) {
 549                if (test_bit(agaw, &sagaw))
 550                        break;
 551        }
 552
 553        return agaw;
 554}
 555
 556/*
 557 * Calculate max SAGAW for each iommu.
 558 */
 559int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 560{
 561        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 562}
 563
 564/*
  565 * Calculate the agaw for each iommu.
  566 * "SAGAW" may differ across iommus; use a default agaw, and fall back
  567 * to a smaller supported agaw for iommus that don't support the default.
 568 */
 569int iommu_calculate_agaw(struct intel_iommu *iommu)
 570{
 571        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 572}
 573
  574/* This function only returns a single iommu in a domain */
 575struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 576{
 577        int iommu_id;
 578
 579        /* si_domain and vm domain should not get here. */
 580        if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
 581                return NULL;
 582
 583        for_each_domain_iommu(iommu_id, domain)
 584                break;
 585
 586        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 587                return NULL;
 588
 589        return g_iommus[iommu_id];
 590}
 591
 592static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 593{
 594        return sm_supported(iommu) ?
 595                        ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 596}
 597
 598static void domain_update_iommu_coherency(struct dmar_domain *domain)
 599{
 600        struct dmar_drhd_unit *drhd;
 601        struct intel_iommu *iommu;
 602        bool found = false;
 603        int i;
 604
 605        domain->iommu_coherency = true;
 606
 607        for_each_domain_iommu(i, domain) {
 608                found = true;
 609                if (!iommu_paging_structure_coherency(g_iommus[i])) {
 610                        domain->iommu_coherency = false;
 611                        break;
 612                }
 613        }
 614        if (found)
 615                return;
 616
 617        /* No hardware attached; use lowest common denominator */
 618        rcu_read_lock();
 619        for_each_active_iommu(iommu, drhd) {
 620                if (!iommu_paging_structure_coherency(iommu)) {
 621                        domain->iommu_coherency = false;
 622                        break;
 623                }
 624        }
 625        rcu_read_unlock();
 626}
 627
 628static bool domain_update_iommu_snooping(struct intel_iommu *skip)
 629{
 630        struct dmar_drhd_unit *drhd;
 631        struct intel_iommu *iommu;
 632        bool ret = true;
 633
 634        rcu_read_lock();
 635        for_each_active_iommu(iommu, drhd) {
 636                if (iommu != skip) {
 637                        /*
 638                         * If the hardware is operating in the scalable mode,
 639                         * the snooping control is always supported since we
 640                         * always set PASID-table-entry.PGSNP bit if the domain
 641                         * is managed outside (UNMANAGED).
 642                         */
 643                        if (!sm_supported(iommu) &&
 644                            !ecap_sc_support(iommu->ecap)) {
 645                                ret = false;
 646                                break;
 647                        }
 648                }
 649        }
 650        rcu_read_unlock();
 651
 652        return ret;
 653}
 654
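     /*
      * Return the number of superpage levels usable across all active iommus
      * (excluding @skip): 0 = none, 1 = 2MiB, 2 = 2MiB and 1GiB, derived from
      * each unit's capability register.
      */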
 655static int domain_update_iommu_superpage(struct dmar_domain *domain,
 656                                         struct intel_iommu *skip)
 657{
 658        struct dmar_drhd_unit *drhd;
 659        struct intel_iommu *iommu;
 660        int mask = 0x3;
 661
 662        if (!intel_iommu_superpage)
 663                return 0;
 664
 665        /* set iommu_superpage to the smallest common denominator */
 666        rcu_read_lock();
 667        for_each_active_iommu(iommu, drhd) {
 668                if (iommu != skip) {
 669                        if (domain && domain_use_first_level(domain)) {
 670                                if (!cap_fl1gp_support(iommu->cap))
 671                                        mask = 0x1;
 672                        } else {
 673                                mask &= cap_super_page_val(iommu->cap);
 674                        }
 675
 676                        if (!mask)
 677                                break;
 678                }
 679        }
 680        rcu_read_unlock();
 681
 682        return fls(mask);
 683}
 684
 685static int domain_update_device_node(struct dmar_domain *domain)
 686{
 687        struct device_domain_info *info;
 688        int nid = NUMA_NO_NODE;
 689
 690        assert_spin_locked(&device_domain_lock);
 691
 692        if (list_empty(&domain->devices))
 693                return NUMA_NO_NODE;
 694
 695        list_for_each_entry(info, &domain->devices, link) {
 696                if (!info->dev)
 697                        continue;
 698
 699                /*
  700                 * There may be multiple device NUMA nodes, as devices within
  701                 * the same domain may sit behind different IOMMUs. There is no
  702                 * perfect answer in such a situation, so we use a first-come,
  703                 * first-served policy.
 704                 */
 705                nid = dev_to_node(info->dev);
 706                if (nid != NUMA_NO_NODE)
 707                        break;
 708        }
 709
 710        return nid;
 711}
 712
 713static void domain_update_iotlb(struct dmar_domain *domain);
 714
 715/* Return the super pagesize bitmap if supported. */
 716static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
 717{
 718        unsigned long bitmap = 0;
 719
 720        /*
 721         * 1-level super page supports page size of 2MiB, 2-level super page
 722         * supports page size of both 2MiB and 1GiB.
 723         */
 724        if (domain->iommu_superpage == 1)
 725                bitmap |= SZ_2M;
 726        else if (domain->iommu_superpage == 2)
 727                bitmap |= SZ_2M | SZ_1G;
 728
 729        return bitmap;
 730}
 731
 732/* Some capabilities may be different across iommus */
 733static void domain_update_iommu_cap(struct dmar_domain *domain)
 734{
 735        domain_update_iommu_coherency(domain);
 736        domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 737        domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 738
 739        /*
 740         * If RHSA is missing, we should default to the device numa domain
 741         * as fall back.
 742         */
 743        if (domain->nid == NUMA_NO_NODE)
 744                domain->nid = domain_update_device_node(domain);
 745
 746        /*
 747         * First-level translation restricts the input-address to a
 748         * canonical address (i.e., address bits 63:N have the same
 749         * value as address bit [N-1], where N is 48-bits with 4-level
 750         * paging and 57-bits with 5-level paging). Hence, skip bit
 751         * [N-1].
 752         */
 753        if (domain_use_first_level(domain))
 754                domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
 755        else
 756                domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
 757
 758        domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
 759        domain_update_iotlb(domain);
 760}
 761
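     /*
      * Return the context entry for (bus, devfn), optionally allocating the
      * context table. In legacy mode a root entry points to one table of 256
      * context entries; in scalable mode each root entry half covers 128
      * devfns and context entries are twice the legacy size, hence the
      * "devfn *= 2" indexing below.
      */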
 762struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 763                                         u8 devfn, int alloc)
 764{
 765        struct root_entry *root = &iommu->root_entry[bus];
 766        struct context_entry *context;
 767        u64 *entry;
 768
 769        entry = &root->lo;
 770        if (sm_supported(iommu)) {
 771                if (devfn >= 0x80) {
 772                        devfn -= 0x80;
 773                        entry = &root->hi;
 774                }
 775                devfn *= 2;
 776        }
 777        if (*entry & 1)
 778                context = phys_to_virt(*entry & VTD_PAGE_MASK);
 779        else {
 780                unsigned long phy_addr;
 781                if (!alloc)
 782                        return NULL;
 783
 784                context = alloc_pgtable_page(iommu->node);
 785                if (!context)
 786                        return NULL;
 787
 788                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 789                phy_addr = virt_to_phys((void *)context);
 790                *entry = phy_addr | 1;
 791                __iommu_flush_cache(iommu, entry, sizeof(*entry));
 792        }
 793        return &context[devfn];
 794}
 795
 796static bool attach_deferred(struct device *dev)
 797{
 798        return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
 799}
 800
 801/**
 802 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 803 *                               sub-hierarchy of a candidate PCI-PCI bridge
 804 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 805 * @bridge: the candidate PCI-PCI bridge
 806 *
 807 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 808 */
 809static bool
 810is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 811{
 812        struct pci_dev *pdev, *pbridge;
 813
 814        if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 815                return false;
 816
 817        pdev = to_pci_dev(dev);
 818        pbridge = to_pci_dev(bridge);
 819
 820        if (pbridge->subordinate &&
 821            pbridge->subordinate->number <= pdev->bus->number &&
 822            pbridge->subordinate->busn_res.end >= pdev->bus->number)
 823                return true;
 824
 825        return false;
 826}
 827
 828static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 829{
 830        struct dmar_drhd_unit *drhd;
 831        u32 vtbar;
 832        int rc;
 833
 834        /* We know that this device on this chipset has its own IOMMU.
 835         * If we find it under a different IOMMU, then the BIOS is lying
 836         * to us. Hope that the IOMMU for this device is actually
 837         * disabled, and it needs no translation...
 838         */
 839        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 840        if (rc) {
 841                /* "can't" happen */
 842                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 843                return false;
 844        }
 845        vtbar &= 0xffff0000;
 846
  847        /* we know that this iommu should be at offset 0xa000 from vtbar */
 848        drhd = dmar_find_matched_drhd_unit(pdev);
 849        if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 850                pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 851                add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 852                return true;
 853        }
 854
 855        return false;
 856}
 857
 858static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 859{
 860        if (!iommu || iommu->drhd->ignored)
 861                return true;
 862
 863        if (dev_is_pci(dev)) {
 864                struct pci_dev *pdev = to_pci_dev(dev);
 865
 866                if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 867                    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 868                    quirk_ioat_snb_local_iommu(pdev))
 869                        return true;
 870        }
 871
 872        return false;
 873}
 874
 875struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 876{
 877        struct dmar_drhd_unit *drhd = NULL;
 878        struct pci_dev *pdev = NULL;
 879        struct intel_iommu *iommu;
 880        struct device *tmp;
 881        u16 segment = 0;
 882        int i;
 883
 884        if (!dev)
 885                return NULL;
 886
 887        if (dev_is_pci(dev)) {
 888                struct pci_dev *pf_pdev;
 889
 890                pdev = pci_real_dma_dev(to_pci_dev(dev));
 891
 892                /* VFs aren't listed in scope tables; we need to look up
 893                 * the PF instead to find the IOMMU. */
 894                pf_pdev = pci_physfn(pdev);
 895                dev = &pf_pdev->dev;
 896                segment = pci_domain_nr(pdev->bus);
 897        } else if (has_acpi_companion(dev))
 898                dev = &ACPI_COMPANION(dev)->dev;
 899
 900        rcu_read_lock();
 901        for_each_iommu(iommu, drhd) {
 902                if (pdev && segment != drhd->segment)
 903                        continue;
 904
 905                for_each_active_dev_scope(drhd->devices,
 906                                          drhd->devices_cnt, i, tmp) {
 907                        if (tmp == dev) {
 908                                /* For a VF use its original BDF# not that of the PF
 909                                 * which we used for the IOMMU lookup. Strictly speaking
 910                                 * we could do this for all PCI devices; we only need to
 911                                 * get the BDF# from the scope table for ACPI matches. */
 912                                if (pdev && pdev->is_virtfn)
 913                                        goto got_pdev;
 914
 915                                if (bus && devfn) {
 916                                        *bus = drhd->devices[i].bus;
 917                                        *devfn = drhd->devices[i].devfn;
 918                                }
 919                                goto out;
 920                        }
 921
 922                        if (is_downstream_to_pci_bridge(dev, tmp))
 923                                goto got_pdev;
 924                }
 925
 926                if (pdev && drhd->include_all) {
 927                got_pdev:
 928                        if (bus && devfn) {
 929                                *bus = pdev->bus->number;
 930                                *devfn = pdev->devfn;
 931                        }
 932                        goto out;
 933                }
 934        }
 935        iommu = NULL;
 936 out:
 937        if (iommu_is_dummy(iommu, dev))
 938                iommu = NULL;
 939
 940        rcu_read_unlock();
 941
 942        return iommu;
 943}
 944
 945static void domain_flush_cache(struct dmar_domain *domain,
 946                               void *addr, int size)
 947{
 948        if (!domain->iommu_coherency)
 949                clflush_cache_range(addr, size);
 950}
 951
 952static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 953{
 954        struct context_entry *context;
 955        int ret = 0;
 956        unsigned long flags;
 957
 958        spin_lock_irqsave(&iommu->lock, flags);
 959        context = iommu_context_addr(iommu, bus, devfn, 0);
 960        if (context)
 961                ret = context_present(context);
 962        spin_unlock_irqrestore(&iommu->lock, flags);
 963        return ret;
 964}
 965
 966static void free_context_table(struct intel_iommu *iommu)
 967{
 968        int i;
 969        unsigned long flags;
 970        struct context_entry *context;
 971
 972        spin_lock_irqsave(&iommu->lock, flags);
  973        if (!iommu->root_entry)
  974                goto out;
  975
 976        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 977                context = iommu_context_addr(iommu, i, 0, 0);
 978                if (context)
 979                        free_pgtable_page(context);
 980
 981                if (!sm_supported(iommu))
 982                        continue;
 983
 984                context = iommu_context_addr(iommu, i, 0x80, 0);
 985                if (context)
 986                        free_pgtable_page(context);
 987
 988        }
 989        free_pgtable_page(iommu->root_entry);
 990        iommu->root_entry = NULL;
 991out:
 992        spin_unlock_irqrestore(&iommu->lock, flags);
 993}
 994
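     /*
      * Walk (and, if needed, build) the page table down to @pfn. A non-zero
      * *target_level selects the level to stop at (e.g. 2 when installing a
      * 2MiB superpage); 0 stops at the first superpage or non-present entry
      * and updates *target_level to the level actually reached.
      */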
 995static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 996                                      unsigned long pfn, int *target_level)
 997{
 998        struct dma_pte *parent, *pte;
 999        int level = agaw_to_level(domain->agaw);
1000        int offset;
1001
1002        BUG_ON(!domain->pgd);
1003
1004        if (!domain_pfn_supported(domain, pfn))
1005                /* Address beyond IOMMU's addressing capabilities. */
1006                return NULL;
1007
1008        parent = domain->pgd;
1009
1010        while (1) {
1011                void *tmp_page;
1012
1013                offset = pfn_level_offset(pfn, level);
1014                pte = &parent[offset];
1015                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1016                        break;
1017                if (level == *target_level)
1018                        break;
1019
1020                if (!dma_pte_present(pte)) {
1021                        uint64_t pteval;
1022
1023                        tmp_page = alloc_pgtable_page(domain->nid);
1024
1025                        if (!tmp_page)
1026                                return NULL;
1027
1028                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1029                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1030                        if (domain_use_first_level(domain)) {
1031                                pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1032                                if (iommu_is_dma_domain(&domain->domain))
1033                                        pteval |= DMA_FL_PTE_ACCESS;
1034                        }
1035                        if (cmpxchg64(&pte->val, 0ULL, pteval))
1036                                /* Someone else set it while we were thinking; use theirs. */
1037                                free_pgtable_page(tmp_page);
1038                        else
1039                                domain_flush_cache(domain, pte, sizeof(*pte));
1040                }
1041                if (level == 1)
1042                        break;
1043
1044                parent = phys_to_virt(dma_pte_addr(pte));
1045                level--;
1046        }
1047
1048        if (!*target_level)
1049                *target_level = level;
1050
1051        return pte;
1052}
1053
1054/* return address's pte at specific level */
1055static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1056                                         unsigned long pfn,
1057                                         int level, int *large_page)
1058{
1059        struct dma_pte *parent, *pte;
1060        int total = agaw_to_level(domain->agaw);
1061        int offset;
1062
1063        parent = domain->pgd;
1064        while (level <= total) {
1065                offset = pfn_level_offset(pfn, total);
1066                pte = &parent[offset];
1067                if (level == total)
1068                        return pte;
1069
1070                if (!dma_pte_present(pte)) {
1071                        *large_page = total;
1072                        break;
1073                }
1074
1075                if (dma_pte_superpage(pte)) {
1076                        *large_page = total;
1077                        return pte;
1078                }
1079
1080                parent = phys_to_virt(dma_pte_addr(pte));
1081                total--;
1082        }
1083        return NULL;
1084}
1085
1086/* clear last level pte, a tlb flush should be followed */
1087static void dma_pte_clear_range(struct dmar_domain *domain,
1088                                unsigned long start_pfn,
1089                                unsigned long last_pfn)
1090{
1091        unsigned int large_page;
1092        struct dma_pte *first_pte, *pte;
1093
1094        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1095        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1096        BUG_ON(start_pfn > last_pfn);
1097
1098        /* we don't need lock here; nobody else touches the iova range */
1099        do {
1100                large_page = 1;
1101                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1102                if (!pte) {
1103                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1104                        continue;
1105                }
1106                do {
1107                        dma_clear_pte(pte);
1108                        start_pfn += lvl_to_nr_pages(large_page);
1109                        pte++;
1110                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1111
1112                domain_flush_cache(domain, first_pte,
1113                                   (void *)pte - (void *)first_pte);
1114
1115        } while (start_pfn && start_pfn <= last_pfn);
1116}
1117
1118static void dma_pte_free_level(struct dmar_domain *domain, int level,
1119                               int retain_level, struct dma_pte *pte,
1120                               unsigned long pfn, unsigned long start_pfn,
1121                               unsigned long last_pfn)
1122{
1123        pfn = max(start_pfn, pfn);
1124        pte = &pte[pfn_level_offset(pfn, level)];
1125
1126        do {
1127                unsigned long level_pfn;
1128                struct dma_pte *level_pte;
1129
1130                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1131                        goto next;
1132
1133                level_pfn = pfn & level_mask(level);
1134                level_pte = phys_to_virt(dma_pte_addr(pte));
1135
1136                if (level > 2) {
1137                        dma_pte_free_level(domain, level - 1, retain_level,
1138                                           level_pte, level_pfn, start_pfn,
1139                                           last_pfn);
1140                }
1141
1142                /*
1143                 * Free the page table if we're below the level we want to
1144                 * retain and the range covers the entire table.
1145                 */
1146                if (level < retain_level && !(start_pfn > level_pfn ||
1147                      last_pfn < level_pfn + level_size(level) - 1)) {
1148                        dma_clear_pte(pte);
1149                        domain_flush_cache(domain, pte, sizeof(*pte));
1150                        free_pgtable_page(level_pte);
1151                }
1152next:
1153                pfn += level_size(level);
1154        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1155}
1156
1157/*
1158 * clear last level (leaf) ptes and free page table pages below the
1159 * level we wish to keep intact.
1160 */
1161static void dma_pte_free_pagetable(struct dmar_domain *domain,
1162                                   unsigned long start_pfn,
1163                                   unsigned long last_pfn,
1164                                   int retain_level)
1165{
1166        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1167        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1168        BUG_ON(start_pfn > last_pfn);
1169
1170        dma_pte_clear_range(domain, start_pfn, last_pfn);
1171
1172        /* We don't need lock here; nobody else touches the iova range */
1173        dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1174                           domain->pgd, 0, start_pfn, last_pfn);
1175
1176        /* free pgd */
1177        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178                free_pgtable_page(domain->pgd);
1179                domain->pgd = NULL;
1180        }
1181}
1182
1183/* When a page at a given level is being unlinked from its parent, we don't
1184   need to *modify* it at all. All we need to do is make a list of all the
1185   pages which can be freed just as soon as we've flushed the IOTLB and we
1186   know the hardware page-walk will no longer touch them.
1187   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1188   be freed. */
1189static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1190                                            int level, struct dma_pte *pte,
1191                                            struct page *freelist)
1192{
1193        struct page *pg;
1194
1195        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1196        pg->freelist = freelist;
1197        freelist = pg;
1198
1199        if (level == 1)
1200                return freelist;
1201
1202        pte = page_address(pg);
1203        do {
1204                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1205                        freelist = dma_pte_list_pagetables(domain, level - 1,
1206                                                           pte, freelist);
1207                pte++;
1208        } while (!first_pte_in_page(pte));
1209
1210        return freelist;
1211}
1212
1213static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1214                                        struct dma_pte *pte, unsigned long pfn,
1215                                        unsigned long start_pfn,
1216                                        unsigned long last_pfn,
1217                                        struct page *freelist)
1218{
1219        struct dma_pte *first_pte = NULL, *last_pte = NULL;
1220
1221        pfn = max(start_pfn, pfn);
1222        pte = &pte[pfn_level_offset(pfn, level)];
1223
1224        do {
1225                unsigned long level_pfn;
1226
1227                if (!dma_pte_present(pte))
1228                        goto next;
1229
1230                level_pfn = pfn & level_mask(level);
1231
1232                /* If range covers entire pagetable, free it */
1233                if (start_pfn <= level_pfn &&
1234                    last_pfn >= level_pfn + level_size(level) - 1) {
 1235                        /* These subordinate page tables are going away entirely. Don't
1236                           bother to clear them; we're just going to *free* them. */
1237                        if (level > 1 && !dma_pte_superpage(pte))
1238                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1239
1240                        dma_clear_pte(pte);
1241                        if (!first_pte)
1242                                first_pte = pte;
1243                        last_pte = pte;
1244                } else if (level > 1) {
1245                        /* Recurse down into a level that isn't *entirely* obsolete */
1246                        freelist = dma_pte_clear_level(domain, level - 1,
1247                                                       phys_to_virt(dma_pte_addr(pte)),
1248                                                       level_pfn, start_pfn, last_pfn,
1249                                                       freelist);
1250                }
1251next:
1252                pfn += level_size(level);
1253        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1254
1255        if (first_pte)
1256                domain_flush_cache(domain, first_pte,
1257                                   (void *)++last_pte - (void *)first_pte);
1258
1259        return freelist;
1260}
1261
1262/* We can't just free the pages because the IOMMU may still be walking
1263   the page tables, and may have cached the intermediate levels. The
1264   pages can only be freed after the IOTLB flush has been done. */
1265static struct page *domain_unmap(struct dmar_domain *domain,
1266                                 unsigned long start_pfn,
1267                                 unsigned long last_pfn,
1268                                 struct page *freelist)
1269{
1270        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1271        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1272        BUG_ON(start_pfn > last_pfn);
1273
1274        /* we don't need lock here; nobody else touches the iova range */
1275        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1276                                       domain->pgd, 0, start_pfn, last_pfn,
1277                                       freelist);
1278
1279        /* free pgd */
1280        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1281                struct page *pgd_page = virt_to_page(domain->pgd);
1282                pgd_page->freelist = freelist;
1283                freelist = pgd_page;
1284
1285                domain->pgd = NULL;
1286        }
1287
1288        return freelist;
1289}
1290
1291static void dma_free_pagelist(struct page *freelist)
1292{
1293        struct page *pg;
1294
1295        while ((pg = freelist)) {
1296                freelist = pg->freelist;
1297                free_pgtable_page(page_address(pg));
1298        }
1299}
1300
1301/* iommu handling */
1302static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1303{
1304        struct root_entry *root;
1305        unsigned long flags;
1306
1307        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1308        if (!root) {
1309                pr_err("Allocating root entry for %s failed\n",
1310                        iommu->name);
1311                return -ENOMEM;
1312        }
1313
1314        __iommu_flush_cache(iommu, root, ROOT_SIZE);
1315
1316        spin_lock_irqsave(&iommu->lock, flags);
1317        iommu->root_entry = root;
1318        spin_unlock_irqrestore(&iommu->lock, flags);
1319
1320        return 0;
1321}
1322
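     /*
      * Program the root table address, issue a Set Root Table Pointer
      * operation, and then invalidate the context cache (plus the PASID
      * cache in scalable mode) and the IOTLB so no stale translations
      * remain.
      */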
1323static void iommu_set_root_entry(struct intel_iommu *iommu)
1324{
1325        u64 addr;
1326        u32 sts;
1327        unsigned long flag;
1328
1329        addr = virt_to_phys(iommu->root_entry);
1330        if (sm_supported(iommu))
1331                addr |= DMA_RTADDR_SMT;
1332
1333        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1334        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1335
1336        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1337
1338        /* Make sure hardware complete it */
1339        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1340                      readl, (sts & DMA_GSTS_RTPS), sts);
1341
1342        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1343
1344        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1345        if (sm_supported(iommu))
1346                qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1347        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1348}
1349
1350void iommu_flush_write_buffer(struct intel_iommu *iommu)
1351{
1352        u32 val;
1353        unsigned long flag;
1354
1355        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1356                return;
1357
1358        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1359        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1360
1361        /* Make sure hardware complete it */
1362        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1363                      readl, (!(val & DMA_GSTS_WBFS)), val);
1364
1365        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1366}
1367
 1368/* return value determines if we need a write buffer flush */
1369static void __iommu_flush_context(struct intel_iommu *iommu,
1370                                  u16 did, u16 source_id, u8 function_mask,
1371                                  u64 type)
1372{
1373        u64 val = 0;
1374        unsigned long flag;
1375
1376        switch (type) {
1377        case DMA_CCMD_GLOBAL_INVL:
1378                val = DMA_CCMD_GLOBAL_INVL;
1379                break;
1380        case DMA_CCMD_DOMAIN_INVL:
1381                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1382                break;
1383        case DMA_CCMD_DEVICE_INVL:
1384                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1385                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1386                break;
1387        default:
1388                BUG();
1389        }
1390        val |= DMA_CCMD_ICC;
1391
1392        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1393        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1394
1395        /* Make sure hardware complete it */
1396        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1397                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1398
1399        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1400}
1401
 1402/* return value determines if we need a write buffer flush */
1403static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1404                                u64 addr, unsigned int size_order, u64 type)
1405{
1406        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1407        u64 val = 0, val_iva = 0;
1408        unsigned long flag;
1409
1410        switch (type) {
1411        case DMA_TLB_GLOBAL_FLUSH:
 1412                /* a global flush doesn't need to set IVA_REG */
1413                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1414                break;
1415        case DMA_TLB_DSI_FLUSH:
1416                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1417                break;
1418        case DMA_TLB_PSI_FLUSH:
1419                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1420                /* IH bit is passed in as part of address */
1421                val_iva = size_order | addr;
1422                break;
1423        default:
1424                BUG();
1425        }
1426        /* Note: set drain read/write */
1427#if 0
1428        /*
 1429         * This is probably meant to be extra safe. It looks like we can
1430         * ignore it without any impact.
1431         */
1432        if (cap_read_drain(iommu->cap))
1433                val |= DMA_TLB_READ_DRAIN;
1434#endif
1435        if (cap_write_drain(iommu->cap))
1436                val |= DMA_TLB_WRITE_DRAIN;
1437
1438        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1439        /* Note: Only uses first TLB reg currently */
1440        if (val_iva)
1441                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1442        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1443
1444        /* Make sure hardware complete it */
1445        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1446                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1447
1448        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1449
1450        /* check IOTLB invalidation granularity */
1451        if (DMA_TLB_IAIG(val) == 0)
1452                pr_err("Flush IOTLB failed\n");
1453        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1454                pr_debug("TLB flush request %Lx, actual %Lx\n",
1455                        (unsigned long long)DMA_TLB_IIRG(type),
1456                        (unsigned long long)DMA_TLB_IAIG(val));
1457}
1458
1459static struct device_domain_info *
 1460iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1461                         u8 bus, u8 devfn)
1462{
1463        struct device_domain_info *info;
1464
1465        assert_spin_locked(&device_domain_lock);
1466
1467        if (!iommu->qi)
1468                return NULL;
1469
1470        list_for_each_entry(info, &domain->devices, link)
1471                if (info->iommu == iommu && info->bus == bus &&
1472                    info->devfn == devfn) {
1473                        if (info->ats_supported && info->dev)
1474                                return info;
1475                        break;
1476                }
1477
1478        return NULL;
1479}
1480
1481static void domain_update_iotlb(struct dmar_domain *domain)
1482{
1483        struct device_domain_info *info;
1484        bool has_iotlb_device = false;
1485
1486        assert_spin_locked(&device_domain_lock);
1487
1488        list_for_each_entry(info, &domain->devices, link)
1489                if (info->ats_enabled) {
1490                        has_iotlb_device = true;
1491                        break;
1492                }
1493
1494        if (!has_iotlb_device) {
1495                struct subdev_domain_info *sinfo;
1496
1497                list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1498                        info = get_domain_info(sinfo->pdev);
1499                        if (info && info->ats_enabled) {
1500                                has_iotlb_device = true;
1501                                break;
1502                        }
1503                }
1504        }
1505
1506        domain->has_iotlb_device = has_iotlb_device;
1507}
1508
1509static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1510{
1511        struct pci_dev *pdev;
1512
1513        assert_spin_locked(&device_domain_lock);
1514
1515        if (!info || !dev_is_pci(info->dev))
1516                return;
1517
1518        pdev = to_pci_dev(info->dev);
 1519        /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
 1520         * the PFSID to the invalidation descriptor of a VF so that the IOMMU HW
 1521         * can gauge queue depth at the PF level. If DIT is not set, PFSID is
 1522         * treated as reserved and should be set to 0.
1523         */
1524        if (!ecap_dit(info->iommu->ecap))
1525                info->pfsid = 0;
1526        else {
1527                struct pci_dev *pf_pdev;
1528
 1529                /* pci_physfn() returns pdev itself if the device is not a VF */
1530                pf_pdev = pci_physfn(pdev);
1531                info->pfsid = pci_dev_id(pf_pdev);
1532        }
1533
1534#ifdef CONFIG_INTEL_IOMMU_SVM
1535        /* The PCIe spec, in its wisdom, declares that the behaviour of
1536           the device if you enable PASID support after ATS support is
1537           undefined. So always enable PASID support on devices which
1538           have it, even if we can't yet know if we're ever going to
1539           use it. */
1540        if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1541                info->pasid_enabled = 1;
1542
1543        if (info->pri_supported &&
1544            (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1545            !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1546                info->pri_enabled = 1;
1547#endif
1548        if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1549            !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1550                info->ats_enabled = 1;
1551                domain_update_iotlb(info->domain);
1552                info->ats_qdep = pci_ats_queue_depth(pdev);
1553        }
1554}
1555
1556static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1557{
1558        struct pci_dev *pdev;
1559
1560        assert_spin_locked(&device_domain_lock);
1561
1562        if (!dev_is_pci(info->dev))
1563                return;
1564
1565        pdev = to_pci_dev(info->dev);
1566
1567        if (info->ats_enabled) {
1568                pci_disable_ats(pdev);
1569                info->ats_enabled = 0;
1570                domain_update_iotlb(info->domain);
1571        }
1572#ifdef CONFIG_INTEL_IOMMU_SVM
1573        if (info->pri_enabled) {
1574                pci_disable_pri(pdev);
1575                info->pri_enabled = 0;
1576        }
1577        if (info->pasid_enabled) {
1578                pci_disable_pasid(pdev);
1579                info->pasid_enabled = 0;
1580        }
1581#endif
1582}
1583
1584static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1585                                    u64 addr, unsigned int mask)
1586{
1587        u16 sid, qdep;
1588
1589        if (!info || !info->ats_enabled)
1590                return;
1591
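            /*
             * The source-id below is bus:devfn; e.g. (illustrative) bus 0x3a
             * and devfn 0x10 encode to sid 0x3a10.
             */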
1592        sid = info->bus << 8 | info->devfn;
1593        qdep = info->ats_qdep;
1594        qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1595                           qdep, addr, mask);
1596}
1597
1598static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1599                                  u64 addr, unsigned mask)
1600{
1601        unsigned long flags;
1602        struct device_domain_info *info;
1603        struct subdev_domain_info *sinfo;
1604
1605        if (!domain->has_iotlb_device)
1606                return;
1607
1608        spin_lock_irqsave(&device_domain_lock, flags);
1609        list_for_each_entry(info, &domain->devices, link)
1610                __iommu_flush_dev_iotlb(info, addr, mask);
1611
1612        list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1613                info = get_domain_info(sinfo->pdev);
1614                __iommu_flush_dev_iotlb(info, addr, mask);
1615        }
1616        spin_unlock_irqrestore(&device_domain_lock, flags);
1617}
1618
1619static void domain_flush_piotlb(struct intel_iommu *iommu,
1620                                struct dmar_domain *domain,
1621                                u64 addr, unsigned long npages, bool ih)
1622{
1623        u16 did = domain->iommu_did[iommu->seq_id];
1624
1625        if (domain->default_pasid)
1626                qi_flush_piotlb(iommu, did, domain->default_pasid,
1627                                addr, npages, ih);
1628
1629        if (!list_empty(&domain->devices))
1630                qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1631}
1632
1633static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1634                                  struct dmar_domain *domain,
1635                                  unsigned long pfn, unsigned int pages,
1636                                  int ih, int map)
1637{
1638        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1639        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1640        u16 did = domain->iommu_did[iommu->seq_id];
1641
1642        BUG_ON(pages == 0);
1643
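            /*
             * A non-zero ih is turned into the invalidation-hint bit (bit 6
             * of the IOTLB invalidation address) so that "addr | ih" below
             * carries it.
             */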
1644        if (ih)
1645                ih = 1 << 6;
1646
1647        if (domain_use_first_level(domain)) {
1648                domain_flush_piotlb(iommu, domain, addr, pages, ih);
1649        } else {
1650                /*
1651                 * Fall back to a domain-selective flush if there is no PSI
1652                 * support or the size is too big. PSI requires the page count
1653                 * to be 2^x and the base address naturally aligned to that size.
1654                 */
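                    /*
                     * For example (illustrative): pages == 5 rounds up to 8,
                     * so mask == 3 and the PSI covers 2^3 = 8 VT-d pages.
                     */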
1655                if (!cap_pgsel_inv(iommu->cap) ||
1656                    mask > cap_max_amask_val(iommu->cap))
1657                        iommu->flush.flush_iotlb(iommu, did, 0, 0,
1658                                                        DMA_TLB_DSI_FLUSH);
1659                else
1660                        iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1661                                                        DMA_TLB_PSI_FLUSH);
1662        }
1663
1664        /*
1665         * In caching mode, changes of pages from non-present to present require
1666         * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1667         */
1668        if (!cap_caching_mode(iommu->cap) || !map)
1669                iommu_flush_dev_iotlb(domain, addr, mask);
1670}
1671
1672/* Notification for newly created mappings */
1673static inline void __mapping_notify_one(struct intel_iommu *iommu,
1674                                        struct dmar_domain *domain,
1675                                        unsigned long pfn, unsigned int pages)
1676{
1677        /*
1678         * It's a non-present to present mapping. Only flush if caching mode
1679         * and second level.
1680         */
1681        if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1682                iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1683        else
1684                iommu_flush_write_buffer(iommu);
1685}
1686
1687static void intel_flush_iotlb_all(struct iommu_domain *domain)
1688{
1689        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1690        int idx;
1691
1692        for_each_domain_iommu(idx, dmar_domain) {
1693                struct intel_iommu *iommu = g_iommus[idx];
1694                u16 did = dmar_domain->iommu_did[iommu->seq_id];
1695
1696                if (domain_use_first_level(dmar_domain))
1697                        domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1698                else
1699                        iommu->flush.flush_iotlb(iommu, did, 0, 0,
1700                                                 DMA_TLB_DSI_FLUSH);
1701
1702                if (!cap_caching_mode(iommu->cap))
1703                        iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1704                                              0, MAX_AGAW_PFN_WIDTH);
1705        }
1706}
1707
1708static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1709{
1710        u32 pmen;
1711        unsigned long flags;
1712
1713        if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1714                return;
1715
1716        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1717        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1718        pmen &= ~DMA_PMEN_EPM;
1719        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1720
1721        /* wait for the protected region status bit to clear */
1722        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1723                readl, !(pmen & DMA_PMEN_PRS), pmen);
1724
1725        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1726}
1727
1728static void iommu_enable_translation(struct intel_iommu *iommu)
1729{
1730        u32 sts;
1731        unsigned long flags;
1732
1733        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1734        iommu->gcmd |= DMA_GCMD_TE;
1735        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1736
1737        /* Make sure the hardware has completed it */
1738        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1739                      readl, (sts & DMA_GSTS_TES), sts);
1740
1741        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1742}
1743
1744static void iommu_disable_translation(struct intel_iommu *iommu)
1745{
1746        u32 sts;
1747        unsigned long flag;
1748
1749        if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1750            (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1751                return;
1752
1753        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1754        iommu->gcmd &= ~DMA_GCMD_TE;
1755        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1756
1757        /* Make sure the hardware has completed it */
1758        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1759                      readl, (!(sts & DMA_GSTS_TES)), sts);
1760
1761        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1762}
1763
1764static int iommu_init_domains(struct intel_iommu *iommu)
1765{
1766        u32 ndomains, nlongs;
1767        size_t size;
1768
1769        ndomains = cap_ndoms(iommu->cap);
1770        pr_debug("%s: Number of Domains supported <%d>\n",
1771                 iommu->name, ndomains);
1772        nlongs = BITS_TO_LONGS(ndomains);
1773
1774        spin_lock_init(&iommu->lock);
1775
1776        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1777        if (!iommu->domain_ids)
1778                return -ENOMEM;
1779
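            /*
             * iommu->domains is a two-level array: top-level slots of 256
             * domain pointers each, allocated on demand. E.g. (illustrative)
             * cap_ndoms() == 65536 gives 256 slots here, with only slot 0
             * populated eagerly below.
             */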
1780        size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1781        iommu->domains = kzalloc(size, GFP_KERNEL);
1782
1783        if (iommu->domains) {
1784                size = 256 * sizeof(struct dmar_domain *);
1785                iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1786        }
1787
1788        if (!iommu->domains || !iommu->domains[0]) {
1789                pr_err("%s: Allocating domain array failed\n",
1790                       iommu->name);
1791                kfree(iommu->domain_ids);
1792                kfree(iommu->domains);
1793                iommu->domain_ids = NULL;
1794                iommu->domains    = NULL;
1795                return -ENOMEM;
1796        }
1797
1798        /*
1799         * If Caching mode is set, then invalid translations are tagged
1800         * with domain-id 0, hence we need to pre-allocate it. We also
1801         * use domain-id 0 as a marker for non-allocated domain-id, so
1802         * make sure it is not used for a real domain.
1803         */
1804        set_bit(0, iommu->domain_ids);
1805
1806        /*
1807         * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1808         * entry for first-level or pass-through translation modes should
1809         * be programmed with a domain id different from those used for
1810         * second-level or nested translation. We reserve a domain id for
1811         * this purpose.
1812         */
1813        if (sm_supported(iommu))
1814                set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1815
1816        return 0;
1817}
1818
1819static void disable_dmar_iommu(struct intel_iommu *iommu)
1820{
1821        struct device_domain_info *info, *tmp;
1822        unsigned long flags;
1823
1824        if (!iommu->domains || !iommu->domain_ids)
1825                return;
1826
1827        spin_lock_irqsave(&device_domain_lock, flags);
1828        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1829                if (info->iommu != iommu)
1830                        continue;
1831
1832                if (!info->dev || !info->domain)
1833                        continue;
1834
1835                __dmar_remove_one_dev_info(info);
1836        }
1837        spin_unlock_irqrestore(&device_domain_lock, flags);
1838
1839        if (iommu->gcmd & DMA_GCMD_TE)
1840                iommu_disable_translation(iommu);
1841}
1842
1843static void free_dmar_iommu(struct intel_iommu *iommu)
1844{
1845        if ((iommu->domains) && (iommu->domain_ids)) {
1846                int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1847                int i;
1848
1849                for (i = 0; i < elems; i++)
1850                        kfree(iommu->domains[i]);
1851                kfree(iommu->domains);
1852                kfree(iommu->domain_ids);
1853                iommu->domains = NULL;
1854                iommu->domain_ids = NULL;
1855        }
1856
1857        g_iommus[iommu->seq_id] = NULL;
1858
1859        /* free context mapping */
1860        free_context_table(iommu);
1861
1862#ifdef CONFIG_INTEL_IOMMU_SVM
1863        if (pasid_supported(iommu)) {
1864                if (ecap_prs(iommu->ecap))
1865                        intel_svm_finish_prq(iommu);
1866        }
1867        if (vccap_pasid(iommu->vccap))
1868                ioasid_unregister_allocator(&iommu->pasid_allocator);
1869
1870#endif
1871}
1872
1873/*
1874 * Check and return whether first level is used by default for
1875 * DMA translation.
1876 */
1877static bool first_level_by_default(void)
1878{
1879        return scalable_mode_support() && intel_cap_flts_sanity();
1880}
1881
1882static struct dmar_domain *alloc_domain(int flags)
1883{
1884        struct dmar_domain *domain;
1885
1886        domain = alloc_domain_mem();
1887        if (!domain)
1888                return NULL;
1889
1890        memset(domain, 0, sizeof(*domain));
1891        domain->nid = NUMA_NO_NODE;
1892        domain->flags = flags;
1893        if (first_level_by_default())
1894                domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1895        domain->has_iotlb_device = false;
1896        INIT_LIST_HEAD(&domain->devices);
1897        INIT_LIST_HEAD(&domain->subdevices);
1898
1899        return domain;
1900}
1901
1902/* Must be called with device_domain_lock and iommu->lock held */
1903static int domain_attach_iommu(struct dmar_domain *domain,
1904                               struct intel_iommu *iommu)
1905{
1906        unsigned long ndomains;
1907        int num;
1908
1909        assert_spin_locked(&device_domain_lock);
1910        assert_spin_locked(&iommu->lock);
1911
1912        domain->iommu_refcnt[iommu->seq_id] += 1;
1913        if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1914                ndomains = cap_ndoms(iommu->cap);
1915                num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1916
1917                if (num >= ndomains) {
1918                        pr_err("%s: No free domain ids\n", iommu->name);
1919                        domain->iommu_refcnt[iommu->seq_id] -= 1;
1920                        return -ENOSPC;
1921                }
1922
1923                set_bit(num, iommu->domain_ids);
1924                set_iommu_domain(iommu, num, domain);
1925
1926                domain->iommu_did[iommu->seq_id] = num;
1927                domain->nid                      = iommu->node;
1928
1929                domain_update_iommu_cap(domain);
1930        }
1931
1932        return 0;
1933}
1934
1935static void domain_detach_iommu(struct dmar_domain *domain,
1936                                struct intel_iommu *iommu)
1937{
1938        int num;
1939
1940        assert_spin_locked(&device_domain_lock);
1941        assert_spin_locked(&iommu->lock);
1942
1943        domain->iommu_refcnt[iommu->seq_id] -= 1;
1944        if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1945                num = domain->iommu_did[iommu->seq_id];
1946                clear_bit(num, iommu->domain_ids);
1947                set_iommu_domain(iommu, num, NULL);
1948
1949                domain_update_iommu_cap(domain);
1950                domain->iommu_did[iommu->seq_id] = 0;
1951        }
1952}
1953
1954static inline int guestwidth_to_adjustwidth(int gaw)
1955{
1956        int agaw;
1957        int r = (gaw - 12) % 9;
1958
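            /*
             * Round the guest address width up to the next AGAW-compatible
             * width, e.g. (illustrative) gaw 48 stays 48 while gaw 50
             * becomes 57, capped at 64.
             */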
1959        if (r == 0)
1960                agaw = gaw;
1961        else
1962                agaw = gaw + 9 - r;
1963        if (agaw > 64)
1964                agaw = 64;
1965        return agaw;
1966}
1967
1968static void domain_exit(struct dmar_domain *domain)
1969{
1970
1971        /* Remove associated devices and clear attached or cached domains */
1972        domain_remove_dev_info(domain);
1973
1974        if (domain->pgd) {
1975                struct page *freelist;
1976
1977                freelist = domain_unmap(domain, 0,
1978                                        DOMAIN_MAX_PFN(domain->gaw), NULL);
1979                dma_free_pagelist(freelist);
1980        }
1981
1982        free_domain_mem(domain);
1983}
1984
1985/*
1986 * Get the PASID directory size for scalable mode context entry.
1987 * Value of X in the PDTS field of a scalable mode context entry
1988 * indicates PASID directory with 2^(X + 7) entries.
1989 */
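    /*
     * For example (illustrative): with PASID_PDE_SHIFT == 6 and a max_pasid
     * of 2^20, max_pde is 2^14, find_first_bit() returns 14 and pds is 7,
     * i.e. a 2^(7 + 7) = 16384-entry PASID directory.
     */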
1990static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1991{
1992        int pds, max_pde;
1993
1994        max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1995        pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1996        if (pds < 7)
1997                return 0;
1998
1999        return pds - 7;
2000}
2001
2002/*
2003 * Set the RID_PASID field of a scalable mode context entry. The
2004 * IOMMU hardware will use the PASID value set in this field to
2005 * translate DMA requests that carry no PASID.
2006 */
2007static inline void
2008context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2009{
2010        context->hi |= pasid & ((1 << 20) - 1);
2011}
2012
2013/*
2014 * Set the DTE (Device-TLB Enable) field of a scalable mode context
2015 * entry.
2016 */
2017static inline void context_set_sm_dte(struct context_entry *context)
2018{
2019        context->lo |= (1 << 2);
2020}
2021
2022/*
2023 * Set the PRE (Page Request Enable) field of a scalable mode context
2024 * entry.
2025 */
2026static inline void context_set_sm_pre(struct context_entry *context)
2027{
2028        context->lo |= (1 << 4);
2029}
2030
2031/* Convert value to context PASID directory size field coding. */
2032#define context_pdts(pds)       (((pds) & 0x7) << 9)
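    /*
     * E.g. (illustrative) pds == 3 placed in bits 11:9 of the low qword
     * encodes a 2^(3 + 7) = 1024-entry PASID directory.
     */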
2033
2034static int domain_context_mapping_one(struct dmar_domain *domain,
2035                                      struct intel_iommu *iommu,
2036                                      struct pasid_table *table,
2037                                      u8 bus, u8 devfn)
2038{
2039        u16 did = domain->iommu_did[iommu->seq_id];
2040        int translation = CONTEXT_TT_MULTI_LEVEL;
2041        struct device_domain_info *info = NULL;
2042        struct context_entry *context;
2043        unsigned long flags;
2044        int ret;
2045
2046        WARN_ON(did == 0);
2047
2048        if (hw_pass_through && domain_type_is_si(domain))
2049                translation = CONTEXT_TT_PASS_THROUGH;
2050
2051        pr_debug("Set context mapping for %02x:%02x.%d\n",
2052                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2053
2054        BUG_ON(!domain->pgd);
2055
2056        spin_lock_irqsave(&device_domain_lock, flags);
2057        spin_lock(&iommu->lock);
2058
2059        ret = -ENOMEM;
2060        context = iommu_context_addr(iommu, bus, devfn, 1);
2061        if (!context)
2062                goto out_unlock;
2063
2064        ret = 0;
2065        if (context_present(context))
2066                goto out_unlock;
2067
2068        /*
2069         * For kdump cases, old valid entries may be cached due to the
2070         * in-flight DMA and copied pgtable, but there is no unmapping
2071         * behaviour for them, thus we need an explicit cache flush for
2072         * the newly-mapped device. For kdump, at this point, the device
2073         * is supposed to finish reset at its driver probe stage, so no
2074         * is supposed to have finished resetting at its driver probe
2075         * stage, so no in-flight DMA will exist and we don't need to
2076         * worry about it hereafter.
2077        if (context_copied(context)) {
2078                u16 did_old = context_domain_id(context);
2079
2080                if (did_old < cap_ndoms(iommu->cap)) {
2081                        iommu->flush.flush_context(iommu, did_old,
2082                                                   (((u16)bus) << 8) | devfn,
2083                                                   DMA_CCMD_MASK_NOBIT,
2084                                                   DMA_CCMD_DEVICE_INVL);
2085                        iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2086                                                 DMA_TLB_DSI_FLUSH);
2087                }
2088        }
2089
2090        context_clear_entry(context);
2091
2092        if (sm_supported(iommu)) {
2093                unsigned long pds;
2094
2095                WARN_ON(!table);
2096
2097                /* Setup the PASID DIR pointer: */
2098                pds = context_get_sm_pds(table);
2099                context->lo = (u64)virt_to_phys(table->table) |
2100                                context_pdts(pds);
2101
2102                /* Setup the RID_PASID field: */
2103                context_set_sm_rid2pasid(context, PASID_RID2PASID);
2104
2105                /*
2106                 * Setup the Device-TLB enable bit and Page request
2107                 * Enable bit:
2108                 */
2109                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2110                if (info && info->ats_supported)
2111                        context_set_sm_dte(context);
2112                if (info && info->pri_supported)
2113                        context_set_sm_pre(context);
2114        } else {
2115                struct dma_pte *pgd = domain->pgd;
2116                int agaw;
2117
2118                context_set_domain_id(context, did);
2119
2120                if (translation != CONTEXT_TT_PASS_THROUGH) {
2121                        /*
2122                         * Skip top levels of page tables for an IOMMU whose agaw
2123                         * is smaller than the domain's. Unnecessary for PT mode.
2124                         */
2125                        for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2126                                ret = -ENOMEM;
2127                                pgd = phys_to_virt(dma_pte_addr(pgd));
2128                                if (!dma_pte_present(pgd))
2129                                        goto out_unlock;
2130                        }
2131
2132                        info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2133                        if (info && info->ats_supported)
2134                                translation = CONTEXT_TT_DEV_IOTLB;
2135                        else
2136                                translation = CONTEXT_TT_MULTI_LEVEL;
2137
2138                        context_set_address_root(context, virt_to_phys(pgd));
2139                        context_set_address_width(context, agaw);
2140                } else {
2141                        /*
2142                         * In pass through mode, AW must be programmed to
2143                         * indicate the largest AGAW value supported by
2144                         * hardware. And ASR is ignored by hardware.
2145                         */
2146                        context_set_address_width(context, iommu->msagaw);
2147                }
2148
2149                context_set_translation_type(context, translation);
2150        }
2151
2152        context_set_fault_enable(context);
2153        context_set_present(context);
2154        if (!ecap_coherent(iommu->ecap))
2155                clflush_cache_range(context, sizeof(*context));
2156
2157        /*
2158         * It's a non-present to present mapping. If the hardware doesn't cache
2159         * non-present entries we only need to flush the write-buffer. If it
2160         * _does_ cache non-present entries, then it does so in the special
2161         * domain #0, which we have to flush:
2162         */
2163        if (cap_caching_mode(iommu->cap)) {
2164                iommu->flush.flush_context(iommu, 0,
2165                                           (((u16)bus) << 8) | devfn,
2166                                           DMA_CCMD_MASK_NOBIT,
2167                                           DMA_CCMD_DEVICE_INVL);
2168                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2169        } else {
2170                iommu_flush_write_buffer(iommu);
2171        }
2172        iommu_enable_dev_iotlb(info);
2173
2174        ret = 0;
2175
2176out_unlock:
2177        spin_unlock(&iommu->lock);
2178        spin_unlock_irqrestore(&device_domain_lock, flags);
2179
2180        return ret;
2181}
2182
2183struct domain_context_mapping_data {
2184        struct dmar_domain *domain;
2185        struct intel_iommu *iommu;
2186        struct pasid_table *table;
2187};
2188
2189static int domain_context_mapping_cb(struct pci_dev *pdev,
2190                                     u16 alias, void *opaque)
2191{
2192        struct domain_context_mapping_data *data = opaque;
2193
2194        return domain_context_mapping_one(data->domain, data->iommu,
2195                                          data->table, PCI_BUS_NUM(alias),
2196                                          alias & 0xff);
2197}
2198
2199static int
2200domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2201{
2202        struct domain_context_mapping_data data;
2203        struct pasid_table *table;
2204        struct intel_iommu *iommu;
2205        u8 bus, devfn;
2206
2207        iommu = device_to_iommu(dev, &bus, &devfn);
2208        if (!iommu)
2209                return -ENODEV;
2210
2211        table = intel_pasid_get_table(dev);
2212
2213        if (!dev_is_pci(dev))
2214                return domain_context_mapping_one(domain, iommu, table,
2215                                                  bus, devfn);
2216
2217        data.domain = domain;
2218        data.iommu = iommu;
2219        data.table = table;
2220
2221        return pci_for_each_dma_alias(to_pci_dev(dev),
2222                                      &domain_context_mapping_cb, &data);
2223}
2224
2225static int domain_context_mapped_cb(struct pci_dev *pdev,
2226                                    u16 alias, void *opaque)
2227{
2228        struct intel_iommu *iommu = opaque;
2229
2230        return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2231}
2232
2233static int domain_context_mapped(struct device *dev)
2234{
2235        struct intel_iommu *iommu;
2236        u8 bus, devfn;
2237
2238        iommu = device_to_iommu(dev, &bus, &devfn);
2239        if (!iommu)
2240                return -ENODEV;
2241
2242        if (!dev_is_pci(dev))
2243                return device_context_mapped(iommu, bus, devfn);
2244
2245        return !pci_for_each_dma_alias(to_pci_dev(dev),
2246                                       domain_context_mapped_cb, iommu);
2247}
2248
2249/* Returns the number of VT-d pages, but aligned to the MM page size */
2250static inline unsigned long aligned_nrpages(unsigned long host_addr,
2251                                            size_t size)
2252{
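            /*
             * E.g. (illustrative) with 4KiB pages, a 0x2000-byte buffer
             * starting at page offset 0x234 spans PAGE_ALIGN(0x2234) >> 12
             * = 3 VT-d pages.
             */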
2253        host_addr &= ~PAGE_MASK;
2254        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2255}
2256
2257/* Return largest possible superpage level for a given mapping */
2258static inline int hardware_largepage_caps(struct dmar_domain *domain,
2259                                          unsigned long iov_pfn,
2260                                          unsigned long phy_pfn,
2261                                          unsigned long pages)
2262{
2263        int support, level = 1;
2264        unsigned long pfnmerge;
2265
2266        support = domain->iommu_superpage;
2267
2268        /* To use a large page, the virtual *and* physical addresses
2269           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2270           of them will mean we have to use smaller pages. So just
2271           merge them and check both at once. */
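            /*
             * E.g. (illustrative) iov_pfn 0x200, phy_pfn 0x400 and pages
             * 0x400 with one level of superpage support: pfnmerge is 0x600,
             * its low 9 bits are clear and 0x400 >> 9 is non-zero, so a
             * level-2 (2MiB) mapping can be used.
             */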
2272        pfnmerge = iov_pfn | phy_pfn;
2273
2274        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2275                pages >>= VTD_STRIDE_SHIFT;
2276                if (!pages)
2277                        break;
2278                pfnmerge >>= VTD_STRIDE_SHIFT;
2279                level++;
2280                support--;
2281        }
2282        return level;
2283}
2284
2285/*
2286 * Ensure that old small page tables are removed to make room for superpage(s).
2287 * We're going to add new large pages, so make sure we don't remove their parent
2288 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2289 */
2290static void switch_to_super_page(struct dmar_domain *domain,
2291                                 unsigned long start_pfn,
2292                                 unsigned long end_pfn, int level)
2293{
2294        unsigned long lvl_pages = lvl_to_nr_pages(level);
2295        struct dma_pte *pte = NULL;
2296        int i;
2297
2298        while (start_pfn <= end_pfn) {
2299                if (!pte)
2300                        pte = pfn_to_dma_pte(domain, start_pfn, &level);
2301
2302                if (dma_pte_present(pte)) {
2303                        dma_pte_free_pagetable(domain, start_pfn,
2304                                               start_pfn + lvl_pages - 1,
2305                                               level + 1);
2306
2307                        for_each_domain_iommu(i, domain)
2308                                iommu_flush_iotlb_psi(g_iommus[i], domain,
2309                                                      start_pfn, lvl_pages,
2310                                                      0, 0);
2311                }
2312
2313                pte++;
2314                start_pfn += lvl_pages;
2315                if (first_pte_in_page(pte))
2316                        pte = NULL;
2317        }
2318}
2319
2320static int
2321__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2322                 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2323{
2324        struct dma_pte *first_pte = NULL, *pte = NULL;
2325        unsigned int largepage_lvl = 0;
2326        unsigned long lvl_pages = 0;
2327        phys_addr_t pteval;
2328        u64 attr;
2329
2330        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2331
2332        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2333                return -EINVAL;
2334
2335        attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2336        attr |= DMA_FL_PTE_PRESENT;
2337        if (domain_use_first_level(domain)) {
2338                attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2339                if (prot & DMA_PTE_WRITE)
2340                        attr |= DMA_FL_PTE_DIRTY;
2341        }
2342
2343        pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2344
2345        while (nr_pages > 0) {
2346                uint64_t tmp;
2347
2348                if (!pte) {
2349                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2350                                        phys_pfn, nr_pages);
2351
2352                        pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2353                        if (!pte)
2354                                return -ENOMEM;
2355                        first_pte = pte;
2356
2357                        /* It is a large page */
2358                        if (largepage_lvl > 1) {
2359                                unsigned long end_pfn;
2360
2361                                pteval |= DMA_PTE_LARGE_PAGE;
2362                                end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
2363                                switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2364                        } else {
2365                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2366                        }
2367
2368                }
2369                /* We don't need a lock here; nobody else
2370                 * touches this IOVA range.
2371                 */
2372                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2373                if (tmp) {
2374                        static int dumps = 5;
2375                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2376                                iov_pfn, tmp, (unsigned long long)pteval);
2377                        if (dumps) {
2378                                dumps--;
2379                                debug_dma_dump_mappings(NULL);
2380                        }
2381                        WARN_ON(1);
2382                }
2383
2384                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2385
2386                BUG_ON(nr_pages < lvl_pages);
2387
2388                nr_pages -= lvl_pages;
2389                iov_pfn += lvl_pages;
2390                phys_pfn += lvl_pages;
2391                pteval += lvl_pages * VTD_PAGE_SIZE;
2392
2393                /* If the next PTE would be the first in a new page, then we
2394                 * need to flush the cache on the entries we've just written.
2395                 * And then we'll need to recalculate 'pte', so clear it and
2396                 * let it get set again in the if (!pte) block above.
2397                 *
2398                 * If we're done (!nr_pages) we need to flush the cache too.
2399                 *
2400                 * Also if we've been setting superpages, we may need to
2401                 * recalculate 'pte' and switch back to smaller pages for the
2402                 * end of the mapping, if the trailing size is not enough to
2403                 * use another superpage (i.e. nr_pages < lvl_pages).
2404                 */
2405                pte++;
2406                if (!nr_pages || first_pte_in_page(pte) ||
2407                    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2408                        domain_flush_cache(domain, first_pte,
2409                                           (void *)pte - (void *)first_pte);
2410                        pte = NULL;
2411                }
2412        }
2413
2414        return 0;
2415}
2416
2417static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2418{
2419        struct intel_iommu *iommu = info->iommu;
2420        struct context_entry *context;
2421        unsigned long flags;
2422        u16 did_old;
2423
2424        if (!iommu)
2425                return;
2426
2427        spin_lock_irqsave(&iommu->lock, flags);
2428        context = iommu_context_addr(iommu, bus, devfn, 0);
2429        if (!context) {
2430                spin_unlock_irqrestore(&iommu->lock, flags);
2431                return;
2432        }
2433
2434        if (sm_supported(iommu)) {
2435                if (hw_pass_through && domain_type_is_si(info->domain))
2436                        did_old = FLPT_DEFAULT_DID;
2437                else
2438                        did_old = info->domain->iommu_did[iommu->seq_id];
2439        } else {
2440                did_old = context_domain_id(context);
2441        }
2442
2443        context_clear_entry(context);
2444        __iommu_flush_cache(iommu, context, sizeof(*context));
2445        spin_unlock_irqrestore(&iommu->lock, flags);
2446        iommu->flush.flush_context(iommu,
2447                                   did_old,
2448                                   (((u16)bus) << 8) | devfn,
2449                                   DMA_CCMD_MASK_NOBIT,
2450                                   DMA_CCMD_DEVICE_INVL);
2451
2452        if (sm_supported(iommu))
2453                qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2454
2455        iommu->flush.flush_iotlb(iommu,
2456                                 did_old,
2457                                 0,
2458                                 0,
2459                                 DMA_TLB_DSI_FLUSH);
2460
2461        __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2462}
2463
2464static inline void unlink_domain_info(struct device_domain_info *info)
2465{
2466        assert_spin_locked(&device_domain_lock);
2467        list_del(&info->link);
2468        list_del(&info->global);
2469        if (info->dev)
2470                dev_iommu_priv_set(info->dev, NULL);
2471}
2472
2473static void domain_remove_dev_info(struct dmar_domain *domain)
2474{
2475        struct device_domain_info *info, *tmp;
2476        unsigned long flags;
2477
2478        spin_lock_irqsave(&device_domain_lock, flags);
2479        list_for_each_entry_safe(info, tmp, &domain->devices, link)
2480                __dmar_remove_one_dev_info(info);
2481        spin_unlock_irqrestore(&device_domain_lock, flags);
2482}
2483
2484struct dmar_domain *find_domain(struct device *dev)
2485{
2486        struct device_domain_info *info;
2487
2488        if (unlikely(!dev || !dev->iommu))
2489                return NULL;
2490
2491        if (unlikely(attach_deferred(dev)))
2492                return NULL;
2493
2494        /* No lock here, assumes no domain exit in normal case */
2495        info = get_domain_info(dev);
2496        if (likely(info))
2497                return info->domain;
2498
2499        return NULL;
2500}
2501
2502static inline struct device_domain_info *
2503dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2504{
2505        struct device_domain_info *info;
2506
2507        list_for_each_entry(info, &device_domain_list, global)
2508                if (info->segment == segment && info->bus == bus &&
2509                    info->devfn == devfn)
2510                        return info;
2511
2512        return NULL;
2513}
2514
2515static int domain_setup_first_level(struct intel_iommu *iommu,
2516                                    struct dmar_domain *domain,
2517                                    struct device *dev,
2518                                    u32 pasid)
2519{
2520        struct dma_pte *pgd = domain->pgd;
2521        int agaw, level;
2522        int flags = 0;
2523
2524        /*
2525         * Skip top levels of page tables for an IOMMU whose agaw
2526         * is smaller than the domain's. Unnecessary for PT mode.
2527         */
2528        for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2529                pgd = phys_to_virt(dma_pte_addr(pgd));
2530                if (!dma_pte_present(pgd))
2531                        return -ENOMEM;
2532        }
2533
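            /*
             * E.g. (illustrative) agaw 2 gives a 4-level (48-bit) table and
             * agaw 3 a 5-level (57-bit) one; only those two are valid here.
             */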
2534        level = agaw_to_level(agaw);
2535        if (level != 4 && level != 5)
2536                return -EINVAL;
2537
2538        if (pasid != PASID_RID2PASID)
2539                flags |= PASID_FLAG_SUPERVISOR_MODE;
2540        if (level == 5)
2541                flags |= PASID_FLAG_FL5LP;
2542
2543        if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2544                flags |= PASID_FLAG_PAGE_SNOOP;
2545
2546        return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2547                                             domain->iommu_did[iommu->seq_id],
2548                                             flags);
2549}
2550
2551static bool dev_is_real_dma_subdevice(struct device *dev)
2552{
2553        return dev && dev_is_pci(dev) &&
2554               pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2555}
2556
2557static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2558                                                    int bus, int devfn,
2559                                                    struct device *dev,
2560                                                    struct dmar_domain *domain)
2561{
2562        struct dmar_domain *found = NULL;
2563        struct device_domain_info *info;
2564        unsigned long flags;
2565        int ret;
2566
2567        info = alloc_devinfo_mem();
2568        if (!info)
2569                return NULL;
2570
2571        if (!dev_is_real_dma_subdevice(dev)) {
2572                info->bus = bus;
2573                info->devfn = devfn;
2574                info->segment = iommu->segment;
2575        } else {
2576                struct pci_dev *pdev = to_pci_dev(dev);
2577
2578                info->bus = pdev->bus->number;
2579                info->devfn = pdev->devfn;
2580                info->segment = pci_domain_nr(pdev->bus);
2581        }
2582
2583        info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2584        info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2585        info->ats_qdep = 0;
2586        info->dev = dev;
2587        info->domain = domain;
2588        info->iommu = iommu;
2589        info->pasid_table = NULL;
2590        info->auxd_enabled = 0;
2591        INIT_LIST_HEAD(&info->subdevices);
2592
2593        if (dev && dev_is_pci(dev)) {
2594                struct pci_dev *pdev = to_pci_dev(info->dev);
2595
2596                if (ecap_dev_iotlb_support(iommu->ecap) &&
2597                    pci_ats_supported(pdev) &&
2598                    dmar_find_matched_atsr_unit(pdev))
2599                        info->ats_supported = 1;
2600
2601                if (sm_supported(iommu)) {
2602                        if (pasid_supported(iommu)) {
2603                                int features = pci_pasid_features(pdev);
2604                                if (features >= 0)
2605                                        info->pasid_supported = features | 1;
2606                        }
2607
2608                        if (info->ats_supported && ecap_prs(iommu->ecap) &&
2609                            pci_pri_supported(pdev))
2610                                info->pri_supported = 1;
2611                }
2612        }
2613
2614        spin_lock_irqsave(&device_domain_lock, flags);
2615        if (dev)
2616                found = find_domain(dev);
2617
2618        if (!found) {
2619                struct device_domain_info *info2;
2620                info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2621                                                       info->devfn);
2622                if (info2) {
2623                        found      = info2->domain;
2624                        info2->dev = dev;
2625                }
2626        }
2627
2628        if (found) {
2629                spin_unlock_irqrestore(&device_domain_lock, flags);
2630                free_devinfo_mem(info);
2631                /* Caller must free the original domain */
2632                return found;
2633        }
2634
2635        spin_lock(&iommu->lock);
2636        ret = domain_attach_iommu(domain, iommu);
2637        spin_unlock(&iommu->lock);
2638
2639        if (ret) {
2640                spin_unlock_irqrestore(&device_domain_lock, flags);
2641                free_devinfo_mem(info);
2642                return NULL;
2643        }
2644
2645        list_add(&info->link, &domain->devices);
2646        list_add(&info->global, &device_domain_list);
2647        if (dev)
2648                dev_iommu_priv_set(dev, info);
2649        spin_unlock_irqrestore(&device_domain_lock, flags);
2650
2651        /* PASID table is mandatory for a PCI device in scalable mode. */
2652        if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2653                ret = intel_pasid_alloc_table(dev);
2654                if (ret) {
2655                        dev_err(dev, "PASID table allocation failed\n");
2656                        dmar_remove_one_dev_info(dev);
2657                        return NULL;
2658                }
2659
2660                /* Setup the PASID entry for requests without PASID: */
2661                spin_lock_irqsave(&iommu->lock, flags);
2662                if (hw_pass_through && domain_type_is_si(domain))
2663                        ret = intel_pasid_setup_pass_through(iommu, domain,
2664                                        dev, PASID_RID2PASID);
2665                else if (domain_use_first_level(domain))
2666                        ret = domain_setup_first_level(iommu, domain, dev,
2667                                        PASID_RID2PASID);
2668                else
2669                        ret = intel_pasid_setup_second_level(iommu, domain,
2670                                        dev, PASID_RID2PASID);
2671                spin_unlock_irqrestore(&iommu->lock, flags);
2672                if (ret) {
2673                        dev_err(dev, "Setup RID2PASID failed\n");
2674                        dmar_remove_one_dev_info(dev);
2675                        return NULL;
2676                }
2677        }
2678
2679        if (dev && domain_context_mapping(domain, dev)) {
2680                dev_err(dev, "Domain context map failed\n");
2681                dmar_remove_one_dev_info(dev);
2682                return NULL;
2683        }
2684
2685        return domain;
2686}
2687
2688static int iommu_domain_identity_map(struct dmar_domain *domain,
2689                                     unsigned long first_vpfn,
2690                                     unsigned long last_vpfn)
2691{
2692        /*
2693         * The RMRR range might overlap a physical memory range,
2694         * so clear it first.
2695         */
2696        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2697
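            /*
             * E.g. (illustrative) first_vpfn 0x10000 and last_vpfn 0x1ffff
             * map the 256MiB range at 0x10000000 with IOVA == PA.
             */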
2698        return __domain_mapping(domain, first_vpfn,
2699                                first_vpfn, last_vpfn - first_vpfn + 1,
2700                                DMA_PTE_READ|DMA_PTE_WRITE);
2701}
2702
2703static int md_domain_init(struct dmar_domain *domain, int guest_width);
2704
2705static int __init si_domain_init(int hw)
2706{
2707        struct dmar_rmrr_unit *rmrr;
2708        struct device *dev;
2709        int i, nid, ret;
2710
2711        si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2712        if (!si_domain)
2713                return -EFAULT;
2714
2715        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2716                domain_exit(si_domain);
2717                return -EFAULT;
2718        }
2719
2720        if (hw)
2721                return 0;
2722
2723        for_each_online_node(nid) {
2724                unsigned long start_pfn, end_pfn;
2725                int i;
2726
2727                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2728                        ret = iommu_domain_identity_map(si_domain,
2729                                        mm_to_dma_pfn(start_pfn),
2730                                        mm_to_dma_pfn(end_pfn));
2731                        if (ret)
2732                                return ret;
2733                }
2734        }
2735
2736        /*
2737         * Identity map the RMRRs so that devices with RMRRs can also use
2738         * the si_domain.
2739         */
2740        for_each_rmrr_units(rmrr) {
2741                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2742                                          i, dev) {
2743                        unsigned long long start = rmrr->base_address;
2744                        unsigned long long end = rmrr->end_address;
2745
2746                        if (WARN_ON(end < start ||
2747                                    end >> agaw_to_width(si_domain->agaw)))
2748                                continue;
2749
2750                        ret = iommu_domain_identity_map(si_domain,
2751                                        mm_to_dma_pfn(start >> PAGE_SHIFT),
2752                                        mm_to_dma_pfn(end >> PAGE_SHIFT));
2753                        if (ret)
2754                                return ret;
2755                }
2756        }
2757
2758        return 0;
2759}
2760
2761static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2762{
2763        struct dmar_domain *ndomain;
2764        struct intel_iommu *iommu;
2765        u8 bus, devfn;
2766
2767        iommu = device_to_iommu(dev, &bus, &devfn);
2768        if (!iommu)
2769                return -ENODEV;
2770
2771        ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2772        if (ndomain != domain)
2773                return -EBUSY;
2774
2775        return 0;
2776}
2777
2778static bool device_has_rmrr(struct device *dev)
2779{
2780        struct dmar_rmrr_unit *rmrr;
2781        struct device *tmp;
2782        int i;
2783
2784        rcu_read_lock();
2785        for_each_rmrr_units(rmrr) {
2786                /*
2787                 * Return TRUE if this RMRR contains the device that
2788                 * is passed in.
2789                 */
2790                for_each_active_dev_scope(rmrr->devices,
2791                                          rmrr->devices_cnt, i, tmp)
2792                        if (tmp == dev ||
2793                            is_downstream_to_pci_bridge(dev, tmp)) {
2794                                rcu_read_unlock();
2795                                return true;
2796                        }
2797        }
2798        rcu_read_unlock();
2799        return false;
2800}
2801
2802/**
2803 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2804 * is relaxable (i.e. it may be left unenforced under some conditions)
2805 * @dev: device handle
2806 *
2807 * We assume that PCI USB devices with RMRRs have them largely
2808 * for historical reasons and that the RMRR space is not actively used post
2809 * boot.  This exclusion may change if vendors begin to abuse it.
2810 *
2811 * The same exception is made for graphics devices, with the requirement that
2812 * any use of the RMRR regions will be torn down before assigning the device
2813 * to a guest.
2814 *
2815 * Return: true if the RMRR is relaxable, false otherwise
2816 */
2817static bool device_rmrr_is_relaxable(struct device *dev)
2818{
2819        struct pci_dev *pdev;
2820
2821        if (!dev_is_pci(dev))
2822                return false;
2823
2824        pdev = to_pci_dev(dev);
2825        if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2826                return true;
2827        else
2828                return false;
2829}
2830
2831/*
2832 * There are a couple cases where we need to restrict the functionality of
2833 * devices associated with RMRRs.  The first is when evaluating a device for
2834 * identity mapping because problems exist when devices are moved in and out
2835 * of domains and their respective RMRR information is lost.  This means that
2836 * a device with associated RMRRs will never be in a "passthrough" domain.
2837 * The second is use of the device through the IOMMU API.  This interface
2838 * expects to have full control of the IOVA space for the device.  We cannot
2839 * satisfy both the requirement that RMRR access is maintained and have an
2840 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2841 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2842 * We therefore prevent devices associated with an RMRR from participating in
2843 * the IOMMU API, which eliminates them from device assignment.
2844 *
2845 * In both cases, devices which have relaxable RMRRs are not concerned by this
2846 * restriction. See device_rmrr_is_relaxable comment.
2847 */
2848static bool device_is_rmrr_locked(struct device *dev)
2849{
2850        if (!device_has_rmrr(dev))
2851                return false;
2852
2853        if (device_rmrr_is_relaxable(dev))
2854                return false;
2855
2856        return true;
2857}
2858
2859/*
2860 * Return the required default domain type for a specific device.
2861 *
2862 * @dev: the device in query
2864 *
2865 * Returns:
2866 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2867 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2868 *  - 0: both identity and dynamic domains work for this device
2869 */
2870static int device_def_domain_type(struct device *dev)
2871{
2872        if (dev_is_pci(dev)) {
2873                struct pci_dev *pdev = to_pci_dev(dev);
2874
2875                if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2876                        return IOMMU_DOMAIN_IDENTITY;
2877
2878                if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2879                        return IOMMU_DOMAIN_IDENTITY;
2880        }
2881
2882        return 0;
2883}
2884
2885static void intel_iommu_init_qi(struct intel_iommu *iommu)
2886{
2887        /*
2888         * Start from a sane IOMMU hardware state.
2889         * If queued invalidation was already initialized by us
2890         * (for example, while enabling interrupt remapping), then
2891         * things are already rolling from a sane state.
2892         */
2893        if (!iommu->qi) {
2894                /*
2895                 * Clear any previous faults.
2896                 */
2897                dmar_fault(-1, iommu);
2898                /*
2899                 * Disable queued invalidation if supported and already enabled
2900                 * before OS handover.
2901                 */
2902                dmar_disable_qi(iommu);
2903        }
2904
2905        if (dmar_enable_qi(iommu)) {
2906                /*
2907                 * Queued invalidation is not enabled; use register-based invalidation
2908                 */
2909                iommu->flush.flush_context = __iommu_flush_context;
2910                iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2911                pr_info("%s: Using Register based invalidation\n",
2912                        iommu->name);
2913        } else {
2914                iommu->flush.flush_context = qi_flush_context;
2915                iommu->flush.flush_iotlb = qi_flush_iotlb;
2916                pr_info("%s: Using Queued invalidation\n", iommu->name);
2917        }
2918}
2919
2920static int copy_context_table(struct intel_iommu *iommu,
2921                              struct root_entry *old_re,
2922                              struct context_entry **tbl,
2923                              int bus, bool ext)
2924{
2925        int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2926        struct context_entry *new_ce = NULL, ce;
2927        struct context_entry *old_ce = NULL;
2928        struct root_entry re;
2929        phys_addr_t old_ce_phys;
2930
2931        tbl_idx = ext ? bus * 2 : bus;
2932        memcpy(&re, old_re, sizeof(re));
2933
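            /*
             * With extended (ext) context tables, each 4KiB page holds only
             * 128 entries, so (illustrative) devfns 0x00-0x7f come from the
             * LCTP page and 0x80-0xff from the UCTP page, filling two slots
             * per bus in tbl[].
             */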
2934        for (devfn = 0; devfn < 256; devfn++) {
2935                /* First calculate the correct index */
2936                idx = (ext ? devfn * 2 : devfn) % 256;
2937
2938                if (idx == 0) {
2939                        /* First save what we may have and clean up */
2940                        if (new_ce) {
2941                                tbl[tbl_idx] = new_ce;
2942                                __iommu_flush_cache(iommu, new_ce,
2943                                                    VTD_PAGE_SIZE);
2944                                pos = 1;
2945                        }
2946
2947                        if (old_ce)
2948                                memunmap(old_ce);
2949
2950                        ret = 0;
2951                        if (devfn < 0x80)
2952                                old_ce_phys = root_entry_lctp(&re);
2953                        else
2954                                old_ce_phys = root_entry_uctp(&re);
2955
2956                        if (!old_ce_phys) {
2957                                if (ext && devfn == 0) {
2958                                        /* No LCTP, try UCTP */
2959                                        devfn = 0x7f;
2960                                        continue;
2961                                } else {
2962                                        goto out;
2963                                }
2964                        }
2965
2966                        ret = -ENOMEM;
2967                        old_ce = memremap(old_ce_phys, PAGE_SIZE,
2968                                        MEMREMAP_WB);
2969                        if (!old_ce)
2970                                goto out;
2971
2972                        new_ce = alloc_pgtable_page(iommu->node);
2973                        if (!new_ce)
2974                                goto out_unmap;
2975
2976                        ret = 0;
2977                }
2978
2979                /* Now copy the context entry */
2980                memcpy(&ce, old_ce + idx, sizeof(ce));
2981
2982                if (!__context_present(&ce))
2983                        continue;
2984
2985                did = context_domain_id(&ce);
2986                if (did >= 0 && did < cap_ndoms(iommu->cap))
2987                        set_bit(did, iommu->domain_ids);
2988
2989                /*
2990                 * We need a marker for copied context entries. This
2991                 * marker needs to work for the old format as well as
2992                 * for extended context entries.
2993                 *
2994                 * Bit 67 of the context entry is used. In the old
2995                 * format this bit is available to software, in the
2996                 * extended format it is the PGE bit, but PGE is ignored
2997                 * by HW if PASIDs are disabled (and thus still
2998                 * available).
2999                 *
3000                 * So disable PASIDs first and then mark the entry
3001                 * copied. This means that we don't copy PASID
3002                 * translations from the old kernel, but this is fine as
3003                 * faults there are not fatal.
3004                 */
3005                context_clear_pasid_enable(&ce);
3006                context_set_copied(&ce);
3007
3008                new_ce[idx] = ce;
3009        }
3010
3011        tbl[tbl_idx + pos] = new_ce;
3012
3013        __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3014
3015out_unmap:
3016        memunmap(old_ce);
3017
3018out:
3019        return ret;
3020}
3021
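    /*
     * copy_translation_tables - copy context tables over from a kernel that
     * left translation enabled (e.g. the crashed kernel in a kdump boot).
     *
     * Bails out if the old root table cannot be located, or if the root
     * table format (RTT bit vs. ecap_ecs) would have to change, since that
     * would require disabling translation.
     */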
3022static int copy_translation_tables(struct intel_iommu *iommu)
3023{
3024        struct context_entry **ctxt_tbls;
3025        struct root_entry *old_rt;
3026        phys_addr_t old_rt_phys;
3027        int ctxt_table_entries;
3028        unsigned long flags;
3029        u64 rtaddr_reg;
3030        int bus, ret;
3031        bool new_ext, ext;
3032
3033        rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3034        ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3035        new_ext    = !!ecap_ecs(iommu->ecap);
3036
3037        /*
3038         * The RTT bit can only be changed when translation is disabled,
3039         * but disabling translation would open a window for data
3040         * corruption. So bail out and don't copy anything if we would
3041         * have to change the bit.
3042         */
3043        if (new_ext != ext)
3044                return -EINVAL;
3045
3046        old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3047        if (!old_rt_phys)
3048                return -EINVAL;
3049
3050        old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3051        if (!old_rt)
3052                return -ENOMEM;
3053
3054        /* This is too big for the stack - allocate it from slab */
3055        ctxt_table_entries = ext ? 512 : 256;
3056        ret = -ENOMEM;
3057        ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3058        if (!ctxt_tbls)
3059                goto out_unmap;
3060
3061        for (bus = 0; bus < 256; bus++) {
3062                ret = copy_context_table(iommu, &old_rt[bus],
3063                                         ctxt_tbls, bus, ext);
3064                if (ret) {
3065                        pr_err("%s: Failed to copy context table for bus %d\n",
3066                                iommu->name, bus);
3067                        continue;
3068                }
3069        }
3070
3071        spin_lock_irqsave(&iommu->lock, flags);
3072
3073        /* Context tables are copied, now write them to the root_entry table */
3074        for (bus = 0; bus < 256; bus++) {
3075                int idx = ext ? bus * 2 : bus;
3076                u64 val;
3077
3078                if (ctxt_tbls[idx]) {
3079                        val = virt_to_phys(ctxt_tbls[idx]) | 1;
3080                        iommu->root_entry[bus].lo = val;
3081                }
3082
3083                if (!ext || !ctxt_tbls[idx + 1])
3084                        continue;
3085
3086                val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3087                iommu->root_entry[bus].hi = val;
3088        }
3089
3090        spin_unlock_irqrestore(&iommu->lock, flags);
3091
3092        kfree(ctxt_tbls);
3093
3094        __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3095
3096        ret = 0;
3097
3098out_unmap:
3099        memunmap(old_rt);
3100
3101        return ret;
3102}
3103
3104#ifdef CONFIG_INTEL_IOMMU_SVM
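    /*
     * PASID allocation callback for the custom IOASID allocator: allocate a
     * PASID from the host via the VT-d virtual command interface. Requests
     * outside the PASID_MIN..intel_pasid_max_id range are rejected.
     */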
3105static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3106{
3107        struct intel_iommu *iommu = data;
3108        ioasid_t ioasid;
3109
3110        if (!iommu)
3111                return INVALID_IOASID;
3112        /*
3113         * The VT-d virtual command interface always uses the full 20-bit
3114         * PASID range. The host can partition the guest PASID range based
3115         * on policies, but that is out of the guest's control.
3116         */
3117        if (min < PASID_MIN || max > intel_pasid_max_id)
3118                return INVALID_IOASID;
3119
3120        if (vcmd_alloc_pasid(iommu, &ioasid))
3121                return INVALID_IOASID;
3122
3123        return ioasid;
3124}
3125
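    /*
     * PASID free callback for the custom IOASID allocator: refuse to free a
     * PASID that is still present in the IOASID set (still bound somewhere),
     * otherwise release it via the virtual command interface.
     */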
3126static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3127{
3128        struct intel_iommu *iommu = data;
3129
3130        if (!iommu)
3131                return;
3132        /*
3133         * The ioasid owner sanity check is done at the upper layer, e.g. VFIO.
3134         * We can only free the PASID when all the devices are unbound.
3135         */
3136        if (ioasid_find(NULL, ioasid, NULL)) {
3137                pr_alert("Cannot free active IOASID %d\n", ioasid);
3138                return;
3139        }
3140        vcmd_free_pasid(iommu, ioasid);
3141}
3142
3143static void register_pasid_allocator(struct intel_iommu *iommu)
3144{
3145        /*
3146         * If we are running in the host, there is no need for a custom
3147         * allocator because PASIDs are allocated system-wide by the host.
3148         */
3149        if (!cap_caching_mode(iommu->cap))
3150                return;
3151
3152        if (!sm_supported(iommu)) {
3153                pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3154                return;
3155        }
3156
3157        /*
3158         * Register a custom PASID allocator if we are running in a guest;
3159         * guest PASIDs must be obtained via the virtual command interface.
3160         * There can be multiple vIOMMUs in each guest but only one allocator
3161         * is active. All vIOMMU allocators will eventually call the same
3162         * host allocator.
3163         */
3164        if (!vccap_pasid(iommu->vccap))
3165                return;
3166
3167        pr_info("Register custom PASID allocator\n");
3168        iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3169        iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3170        iommu->pasid_allocator.pdata = (void *)iommu;
3171        if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3172                pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3173                /*
3174                 * Disable scalable mode on this IOMMU if there
3175                 * is no custom allocator. Mixing SM-capable vIOMMUs
3176                 * and non-SM vIOMMUs is not supported.
3177                 */
3178                intel_iommu_sm = 0;
3179        }
3180}
3181#endif
3182
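    /*
     * init_dmars - boot-time bring-up of all DMAR units: allocate per-IOMMU
     * data and root entries, copy translation tables from a pre-enabled
     * (kdump) kernel where possible, set up the static identity domain and
     * enable fault reporting on every unit that is not ignored.
     */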
3183static int __init init_dmars(void)
3184{
3185        struct dmar_drhd_unit *drhd;
3186        struct intel_iommu *iommu;
3187        int ret;
3188
3189        /*
3190         * for each drhd
3191         *    allocate root
3192         *    initialize and program root entry to not present
3193         * endfor
3194         */
3195        for_each_drhd_unit(drhd) {
3196                /*
3197                 * No lock needed as this is only incremented in the
3198                 * single-threaded kernel __init code path; all other
3199                 * accesses are read-only.
3200                 */
3201                if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3202                        g_num_of_iommus++;
3203                        continue;
3204                }
3205                pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3206        }
3207
3208        /* Preallocate enough resources for IOMMU hot-addition */
3209        if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3210                g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3211
3212        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3213                        GFP_KERNEL);
3214        if (!g_iommus) {
3215                ret = -ENOMEM;
3216                goto error;
3217        }
3218
3219        ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3220        if (ret)
3221                goto free_iommu;
3222
3223        for_each_iommu(iommu, drhd) {
3224                if (drhd->ignored) {
3225                        iommu_disable_translation(iommu);
3226                        continue;
3227                }
3228
3229                /*
3230                 * Find the max PASID size of all IOMMUs in the system.
3231                 * We need to ensure the system PASID table is no bigger
3232                 * than the smallest supported size.
3233                 */
3234                if (pasid_supported(iommu)) {
3235                        u32 temp = 2 << ecap_pss(iommu->ecap);
3236
3237                        intel_pasid_max_id = min_t(u32, temp,
3238                                                   intel_pasid_max_id);
3239                }
3240
3241                g_iommus[iommu->seq_id] = iommu;
3242
3243                intel_iommu_init_qi(iommu);
3244
3245                ret = iommu_init_domains(iommu);
3246                if (ret)
3247                        goto free_iommu;
3248
3249                init_translation_status(iommu);
3250
3251                if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3252                        iommu_disable_translation(iommu);
3253                        clear_translation_pre_enabled(iommu);
3254                        pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3255                                iommu->name);
3256                }
3257
3258                /*
3259                 * TBD:
3260                 * We could share the same root & context tables
3261                 * among all IOMMUs. This needs to be split out later.
3262                 */
3263                ret = iommu_alloc_root_entry(iommu);
3264                if (ret)
3265                        goto free_iommu;
3266
3267                if (translation_pre_enabled(iommu)) {
3268                        pr_info("Translation already enabled - trying to copy translation structures\n");
3269
3270                        ret = copy_translation_tables(iommu);
3271                        if (ret) {
3272                                /*
3273                                 * We found the IOMMU with translation
3274                                 * enabled - but failed to copy over the
3275                                 * old root-entry table. Try to proceed
3276                                 * by disabling translation now and
3277                                 * allocating a clean root-entry table.
3278                                 * This might cause DMAR faults, but
3279                                 * probably the dump will still succeed.
3280                                 */
3281                                pr_err("Failed to copy translation tables from previous kernel for %s\n",
3282                                       iommu->name);
3283                                iommu_disable_translation(iommu);
3284                                clear_translation_pre_enabled(iommu);
3285                        } else {
3286                                pr_info("Copied translation tables from previous kernel for %s\n",
3287                                        iommu->name);
3288                        }
3289                }
3290
3291                if (!ecap_pass_through(iommu->ecap))
3292                        hw_pass_through = 0;
3293                intel_svm_check(iommu);
3294        }
3295
3296        /*
3297         * Now that QI is enabled on all IOMMUs, set the root entry and flush
3298         * caches. This is required on some Intel X58 chipsets; otherwise the
3299         * flush_context function will loop forever and the boot hangs.
3300         */
3301        for_each_active_iommu(iommu, drhd) {
3302                iommu_flush_write_buffer(iommu);
3303#ifdef CONFIG_INTEL_IOMMU_SVM
3304                register_pasid_allocator(iommu);
3305#endif
3306                iommu_set_root_entry(iommu);
3307        }
3308
3309#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3310        dmar_map_gfx = 0;
3311#endif
3312
3313        if (!dmar_map_gfx)
3314                iommu_identity_mapping |= IDENTMAP_GFX;
3315
3316        check_tylersburg_isoch();
3317
3318        ret = si_domain_init(hw_pass_through);
3319        if (ret)
3320                goto free_iommu;
3321
3322        /*
3323         * for each drhd
3324         *   enable fault log
3325         *   global invalidate context cache
3326         *   global invalidate iotlb
3327         *   enable translation
3328         */
3329        for_each_iommu(iommu, drhd) {
3330                if (drhd->ignored) {
3331                        /*
3332                         * we always have to disable PMRs or DMA may fail on
3333                         * this device
3334                         */
3335                        if (force_on)
3336                                iommu_disable_protect_mem_regions(iommu);
3337                        continue;
3338                }
3339
3340                iommu_flush_write_buffer(iommu);
3341
3342#ifdef CONFIG_INTEL_IOMMU_SVM
3343                if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3344                        /*
3345                         * Calling dmar_alloc_hwirq() with dmar_global_lock held
3346                         * could cause a possible lock race condition.
3347                         */
3348                        up_write(&dmar_global_lock);
3349                        ret = intel_svm_enable_prq(iommu);
3350                        down_write(&dmar_global_lock);
3351                        if (ret)
3352                                goto free_iommu;
3353                }
3354#endif
3355                ret = dmar_set_interrupt(iommu);
3356                if (ret)
3357                        goto free_iommu;
3358        }
3359
3360        return 0;
3361
3362free_iommu:
3363        for_each_active_iommu(iommu, drhd) {
3364                disable_dmar_iommu(iommu);
3365                free_dmar_iommu(iommu);
3366        }
3367
3368        kfree(g_iommus);
3369
3370error:
3371        return ret;
3372}
3373
3374static inline int iommu_domain_cache_init(void)
3375{
3376        int ret = 0;
3377
3378        iommu_domain_cache = kmem_cache_create("iommu_domain",
3379                                         sizeof(struct dmar_domain),
3380                                         0,
3381                                         SLAB_HWCACHE_ALIGN,
3383                                         NULL);
3384        if (!iommu_domain_cache) {
3385                pr_err("Couldn't create iommu_domain cache\n");
3386                ret = -ENOMEM;
3387        }
3388
3389        return ret;
3390}
3391
3392static inline int iommu_devinfo_cache_init(void)
3393{
3394        int ret = 0;
3395
3396        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3397                                         sizeof(struct device_domain_info),
3398                                         0,
3399                                         SLAB_HWCACHE_ALIGN,
3400                                         NULL);
3401        if (!iommu_devinfo_cache) {
3402                pr_err("Couldn't create devinfo cache\n");
3403                ret = -ENOMEM;
3404        }
3405
3406        return ret;
3407}
3408
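    /*
     * Set up the IOVA cache plus the dmar_domain and device_domain_info slab
     * caches; anything already created is torn down again on failure.
     */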
3409static int __init iommu_init_mempool(void)
3410{
3411        int ret;
3412        ret = iova_cache_get();
3413        if (ret)
3414                return ret;
3415
3416        ret = iommu_domain_cache_init();
3417        if (ret)
3418                goto domain_error;
3419
3420        ret = iommu_devinfo_cache_init();
3421        if (!ret)
3422                return ret;
3423
3424        kmem_cache_destroy(iommu_domain_cache);
3425domain_error:
3426        iova_cache_put();
3427
3428        return -ENOMEM;
3429}
3430
3431static void __init iommu_exit_mempool(void)
3432{
3433        kmem_cache_destroy(iommu_devinfo_cache);
3434        kmem_cache_destroy(iommu_domain_cache);
3435        iova_cache_put();
3436}
3437
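    /*
     * Mark DRHD units that can be ignored: units whose device scope contains
     * no devices, and units that cover only graphics devices when gfx is not
     * being mapped (dmar_map_gfx == 0).
     */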
3438static void __init init_no_remapping_devices(void)
3439{
3440        struct dmar_drhd_unit *drhd;
3441        struct device *dev;
3442        int i;
3443
3444        for_each_drhd_unit(drhd) {
3445                if (!drhd->include_all) {
3446                        for_each_active_dev_scope(drhd->devices,
3447                                                  drhd->devices_cnt, i, dev)
3448                                break;
3449                        /* ignore DMAR unit if no devices exist */
3450                        if (i == drhd->devices_cnt)
3451                                drhd->ignored = 1;
3452                }
3453        }
3454
3455        for_each_active_drhd_unit(drhd) {
3456                if (drhd->include_all)
3457                        continue;
3458
3459                for_each_active_dev_scope(drhd->devices,
3460                                          drhd->devices_cnt, i, dev)
3461                        if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3462                                break;
3463                if (i < drhd->devices_cnt)
3464                        continue;
3465
3466                /* This IOMMU has *only* gfx devices. Mark it gfx-dedicated,
3467                   and bypass it entirely if gfx is not being mapped. */
3468                drhd->gfx_dedicated = 1;
3469                if (!dmar_map_gfx)
3470                        drhd->ignored = 1;
3471        }
3472}
3473
3474#ifdef CONFIG_SUSPEND
3475static int init_iommu_hw(void)
3476{
3477        struct dmar_drhd_unit *drhd;
3478        struct intel_iommu *iommu = NULL;
3479
3480        for_each_active_iommu(iommu, drhd)
3481                if (iommu->qi)
3482                        dmar_reenable_qi(iommu);
3483
3484        for_each_iommu(iommu, drhd) {
3485                if (drhd->ignored) {
3486                        /*
3487                         * we always have to disable PMRs or DMA may fail on
3488                         * this device
3489                         */
3490                        if (force_on)
3491                                iommu_disable_protect_mem_regions(iommu);
3492                        continue;
3493                }
3494
3495                iommu_flush_write_buffer(iommu);
3496                iommu_set_root_entry(iommu);
3497                iommu_enable_translation(iommu);
3498                iommu_disable_protect_mem_regions(iommu);
3499        }
3500
3501        return 0;
3502}
3503
3504static void iommu_flush_all(void)
3505{
3506        struct dmar_drhd_unit *drhd;
3507        struct intel_iommu *iommu;
3508
3509        for_each_active_iommu(iommu, drhd) {
3510                iommu->flush.flush_context(iommu, 0, 0, 0,
3511                                           DMA_CCMD_GLOBAL_INVL);
3512                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3513                                         DMA_TLB_GLOBAL_FLUSH);
3514        }
3515}
3516
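    /*
     * Save the fault-event registers of every active IOMMU and disable
     * translation before entering a system sleep state; iommu_resume()
     * restores the saved register contents.
     */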
3517static int iommu_suspend(void)
3518{
3519        struct dmar_drhd_unit *drhd;
3520        struct intel_iommu *iommu = NULL;
3521        unsigned long flag;
3522
3523        for_each_active_iommu(iommu, drhd) {
3524                iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3525                                             GFP_KERNEL);
3526                if (!iommu->iommu_state)
3527                        goto nomem;
3528        }
3529
3530        iommu_flush_all();
3531
3532        for_each_active_iommu(iommu, drhd) {
3533                iommu_disable_translation(iommu);
3534
3535                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3536
3537                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3538                        readl(iommu->reg + DMAR_FECTL_REG);
3539                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3540                        readl(iommu->reg + DMAR_FEDATA_REG);
3541                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3542                        readl(iommu->reg + DMAR_FEADDR_REG);
3543                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3544                        readl(iommu->reg + DMAR_FEUADDR_REG);
3545
3546                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3547        }
3548        return 0;
3549
3550nomem:
3551        for_each_active_iommu(iommu, drhd)
3552                kfree(iommu->iommu_state);
3553
3554        return -ENOMEM;
3555}
3556
3557static void iommu_resume(void)
3558{
3559        struct dmar_drhd_unit *drhd;
3560        struct intel_iommu *iommu = NULL;
3561        unsigned long flag;
3562
3563        if (init_iommu_hw()) {
3564                if (force_on)
3565                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3566                else
3567                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3568                return;
3569        }
3570
3571        for_each_active_iommu(iommu, drhd) {
3572
3573                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3574
3575                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3576                        iommu->reg + DMAR_FECTL_REG);
3577                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3578                        iommu->reg + DMAR_FEDATA_REG);
3579                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3580                        iommu->reg + DMAR_FEADDR_REG);
3581                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3582                        iommu->reg + DMAR_FEUADDR_REG);
3583
3584                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3585        }
3586
3587        for_each_active_iommu(iommu, drhd)
3588                kfree(iommu->iommu_state);
3589}
3590
3591static struct syscore_ops iommu_syscore_ops = {
3592        .resume         = iommu_resume,
3593        .suspend        = iommu_suspend,
3594};
3595
3596static void __init init_iommu_pm_ops(void)
3597{
3598        register_syscore_ops(&iommu_syscore_ops);
3599}
3600
3601#else
3602static inline void init_iommu_pm_ops(void) {}
3603#endif  /* CONFIG_SUSPEND */
3604
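    /*
     * An RMRR must describe a non-empty, page-aligned region; anything else
     * is a firmware bug (subject to the additional arch-specific check).
     */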
3605static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3606{
3607        if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3608            !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3609            rmrr->end_address <= rmrr->base_address ||
3610            arch_rmrr_sanity_check(rmrr))
3611                return -EINVAL;
3612
3613        return 0;
3614}
3615
3616int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3617{
3618        struct acpi_dmar_reserved_memory *rmrr;
3619        struct dmar_rmrr_unit *rmrru;
3620
3621        rmrr = (struct acpi_dmar_reserved_memory *)header;
3622        if (rmrr_sanity_check(rmrr)) {
3623                pr_warn(FW_BUG
3624                           "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3625                           "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3626                           rmrr->base_address, rmrr->end_address,
3627                           dmi_get_system_info(DMI_BIOS_VENDOR),
3628                           dmi_get_system_info(DMI_BIOS_VERSION),
3629                           dmi_get_system_info(DMI_PRODUCT_VERSION));
3630                add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3631        }
3632
3633        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3634        if (!rmrru)
3635                goto out;
3636
3637        rmrru->hdr = header;
3638
3639        rmrru->base_address = rmrr->base_address;
3640        rmrru->end_address = rmrr->end_address;
3641
3642        rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3643                                ((void *)rmrr) + rmrr->header.length,
3644                                &rmrru->devices_cnt);
3645        if (rmrru->devices_cnt && rmrru->devices == NULL)
3646                goto free_rmrru;
3647
3648        list_add(&rmrru->list, &dmar_rmrr_units);
3649
3650        return 0;
3651free_rmrru:
3652        kfree(rmrru);
3653out:
3654        return -ENOMEM;
3655}
3656
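    /*
     * Look up an already-registered ATSR unit matching @atsr by segment,
     * header length and content; returns NULL if none is found.
     */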
3657static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3658{
3659        struct dmar_atsr_unit *atsru;
3660        struct acpi_dmar_atsr *tmp;
3661
3662        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3663                                dmar_rcu_check()) {
3664                tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3665                if (atsr->segment != tmp->segment)
3666                        continue;
3667                if (atsr->header.length != tmp->header.length)
3668                        continue;
3669                if (memcmp(atsr, tmp, atsr->header.length) == 0)
3670                        return atsru;
3671        }
3672
3673        return NULL;
3674}
3675
3676int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3677{
3678        struct acpi_dmar_atsr *atsr;
3679        struct dmar_atsr_unit *atsru;
3680
3681        if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3682                return 0;
3683
3684        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3685        atsru = dmar_find_atsr(atsr);
3686        if (atsru)
3687                return 0;
3688
3689        atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3690        if (!atsru)
3691                return -ENOMEM;
3692
3693        /*
3694         * If memory is allocated from slab by the ACPI _DSM method, we need to
3695         * copy the memory content because the memory buffer will be freed
3696         * on return.
3697         */
3698        atsru->hdr = (void *)(atsru + 1);
3699        memcpy(atsru->hdr, hdr, hdr->length);
3700        atsru->include_all = atsr->flags & 0x1;
3701        if (!atsru->include_all) {
3702                atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3703                                (void *)atsr + atsr->header.length,
3704                                &atsru->devices_cnt);
3705                if (atsru->devices_cnt && atsru->devices == NULL) {
3706                        kfree(atsru);
3707                        return -ENOMEM;
3708                }
3709        }
3710
3711        list_add_rcu(&atsru->list, &dmar_atsr_units);
3712
3713        return 0;
3714}
3715
3716static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3717{
3718        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3719        kfree(atsru);
3720}
3721
3722int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3723{
3724        struct acpi_dmar_atsr *atsr;
3725        struct dmar_atsr_unit *atsru;
3726
3727        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3728        atsru = dmar_find_atsr(atsr);
3729        if (atsru) {
3730                list_del_rcu(&atsru->list);
3731                synchronize_rcu();
3732                intel_iommu_free_atsr(atsru);
3733        }
3734
3735        return 0;
3736}
3737
3738int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3739{
3740        int i;
3741        struct device *dev;
3742        struct acpi_dmar_atsr *atsr;
3743        struct dmar_atsr_unit *atsru;
3744
3745        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3746        atsru = dmar_find_atsr(atsr);
3747        if (!atsru)
3748                return 0;
3749
3750        if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3751                for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3752                                          i, dev)
3753                        return -EBUSY;
3754        }
3755
3756        return 0;
3757}
3758
3759static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3760{
3761        struct dmar_satc_unit *satcu;
3762        struct acpi_dmar_satc *tmp;
3763
3764        list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3765                                dmar_rcu_check()) {
3766                tmp = (struct acpi_dmar_satc *)satcu->hdr;
3767                if (satc->segment != tmp->segment)
3768                        continue;
3769                if (satc->header.length != tmp->header.length)
3770                        continue;
3771                if (memcmp(satc, tmp, satc->header.length) == 0)
3772                        return satcu;
3773        }
3774
3775        return NULL;
3776}
3777
3778int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3779{
3780        struct acpi_dmar_satc *satc;
3781        struct dmar_satc_unit *satcu;
3782
3783        if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3784                return 0;
3785
3786        satc = container_of(hdr, struct acpi_dmar_satc, header);
3787        satcu = dmar_find_satc(satc);
3788        if (satcu)
3789                return 0;
3790
3791        satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3792        if (!satcu)
3793                return -ENOMEM;
3794
3795        satcu->hdr = (void *)(satcu + 1);
3796        memcpy(satcu->hdr, hdr, hdr->length);
3797        satcu->atc_required = satc->flags & 0x1;
3798        satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3799                                              (void *)satc + satc->header.length,
3800                                              &satcu->devices_cnt);
3801        if (satcu->devices_cnt && !satcu->devices) {
3802                kfree(satcu);
3803                return -ENOMEM;
3804        }
3805        list_add_rcu(&satcu->list, &dmar_satc_units);
3806
3807        return 0;
3808}
3809
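    /*
     * Bring up a hot-added DMAR unit: audit its capabilities against the
     * running configuration, allocate domain IDs and a root entry, and,
     * unless the unit is ignored, enable queued invalidation, fault
     * reporting and translation.
     */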
3810static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3811{
3812        int sp, ret;
3813        struct intel_iommu *iommu = dmaru->iommu;
3814
3815        if (g_iommus[iommu->seq_id])
3816                return 0;
3817
3818        ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3819        if (ret)
3820                goto out;
3821
3822        if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3823                pr_warn("%s: Doesn't support hardware pass through.\n",
3824                        iommu->name);
3825                return -ENXIO;
3826        }
3827        if (!ecap_sc_support(iommu->ecap) &&
3828            domain_update_iommu_snooping(iommu)) {
3829                pr_warn("%s: Doesn't support snooping.\n",
3830                        iommu->name);
3831                return -ENXIO;
3832        }
3833        sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3834        if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3835                pr_warn("%s: Doesn't support large page.\n",
3836                        iommu->name);
3837                return -ENXIO;
3838        }
3839
3840        /*
3841         * Disable translation if already enabled prior to OS handover.
3842         */
3843        if (iommu->gcmd & DMA_GCMD_TE)
3844                iommu_disable_translation(iommu);
3845
3846        g_iommus[iommu->seq_id] = iommu;
3847        ret = iommu_init_domains(iommu);
3848        if (ret == 0)
3849                ret = iommu_alloc_root_entry(iommu);
3850        if (ret)
3851                goto out;
3852
3853        intel_svm_check(iommu);
3854
3855        if (dmaru->ignored) {
3856                /*
3857                 * we always have to disable PMRs or DMA may fail on this device
3858                 */
3859                if (force_on)
3860                        iommu_disable_protect_mem_regions(iommu);
3861                return 0;
3862        }
3863
3864        intel_iommu_init_qi(iommu);
3865        iommu_flush_write_buffer(iommu);
3866
3867#ifdef CONFIG_INTEL_IOMMU_SVM
3868        if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3869                ret = intel_svm_enable_prq(iommu);
3870                if (ret)
3871                        goto disable_iommu;
3872        }
3873#endif
3874        ret = dmar_set_interrupt(iommu);
3875        if (ret)
3876                goto disable_iommu;
3877
3878        iommu_set_root_entry(iommu);
3879        iommu_enable_translation(iommu);
3880
3881        iommu_disable_protect_mem_regions(iommu);
3882        return 0;
3883
3884disable_iommu:
3885        disable_dmar_iommu(iommu);
3886out:
3887        free_dmar_iommu(iommu);
3888        return ret;
3889}
3890
3891int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3892{
3893        int ret = 0;
3894        struct intel_iommu *iommu = dmaru->iommu;
3895
3896        if (!intel_iommu_enabled)
3897                return 0;
3898        if (iommu == NULL)
3899                return -EINVAL;
3900
3901        if (insert) {
3902                ret = intel_iommu_add(dmaru);
3903        } else {
3904                disable_dmar_iommu(iommu);
3905                free_dmar_iommu(iommu);
3906        }
3907
3908        return ret;
3909}
3910
3911static void intel_iommu_free_dmars(void)
3912{
3913        struct dmar_rmrr_unit *rmrru, *rmrr_n;
3914        struct dmar_atsr_unit *atsru, *atsr_n;
3915        struct dmar_satc_unit *satcu, *satc_n;
3916
3917        list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3918                list_del(&rmrru->list);
3919                dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3920                kfree(rmrru);
3921        }
3922
3923        list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3924                list_del(&atsru->list);
3925                intel_iommu_free_atsr(atsru);
3926        }
3927        list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3928                list_del(&satcu->list);
3929                dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3930                kfree(satcu);
3931        }
3932}
3933
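    /*
     * Decide whether ATS is usable for @dev: integrated (root-complex)
     * devices are allowed, non-PCIe paths are not, and a device below a
     * PCIe root port is allowed only if that port is covered by an ATSR
     * (explicitly or via include_all). Returns 1 if ATS is allowed.
     */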
3934int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3935{
3936        int i, ret = 1;
3937        struct pci_bus *bus;
3938        struct pci_dev *bridge = NULL;
3939        struct device *tmp;
3940        struct acpi_dmar_atsr *atsr;
3941        struct dmar_atsr_unit *atsru;
3942
3943        dev = pci_physfn(dev);
3944        for (bus = dev->bus; bus; bus = bus->parent) {
3945                bridge = bus->self;
3946                /* If it's an integrated device, allow ATS */
3947                if (!bridge)
3948                        return 1;
3949                /* Connected via non-PCIe: no ATS */
3950                if (!pci_is_pcie(bridge) ||
3951                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3952                        return 0;
3953                /* If we found the root port, look it up in the ATSR */
3954                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3955                        break;
3956        }
3957
3958        rcu_read_lock();
3959        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3960                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3961                if (atsr->segment != pci_domain_nr(dev->bus))
3962                        continue;
3963
3964                for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3965                        if (tmp == &bridge->dev)
3966                                goto out;
3967
3968                if (atsru->include_all)
3969                        goto out;
3970        }
3971        ret = 0;
3972out:
3973        rcu_read_unlock();
3974
3975        return ret;
3976}
3977
3978int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3979{
3980        int ret;
3981        struct dmar_rmrr_unit *rmrru;
3982        struct dmar_atsr_unit *atsru;
3983        struct dmar_satc_unit *satcu;
3984        struct acpi_dmar_atsr *atsr;
3985        struct acpi_dmar_reserved_memory *rmrr;
3986        struct acpi_dmar_satc *satc;
3987
3988        if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3989                return 0;
3990
3991        list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3992                rmrr = container_of(rmrru->hdr,
3993                                    struct acpi_dmar_reserved_memory, header);
3994                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3995                        ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3996                                ((void *)rmrr) + rmrr->header.length,
3997                                rmrr->segment, rmrru->devices,
3998                                rmrru->devices_cnt);
3999                        if (ret < 0)
4000                                return ret;
4001                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4002                        dmar_remove_dev_scope(info, rmrr->segment,
4003                                rmrru->devices, rmrru->devices_cnt);
4004                }
4005        }
4006
4007        list_for_each_entry(atsru, &dmar_atsr_units, list) {
4008                if (atsru->include_all)
4009                        continue;
4010
4011                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4012                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4013                        ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4014                                        (void *)atsr + atsr->header.length,
4015                                        atsr->segment, atsru->devices,
4016                                        atsru->devices_cnt);
4017                        if (ret > 0)
4018                                break;
4019                        else if (ret < 0)
4020                                return ret;
4021                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4022                        if (dmar_remove_dev_scope(info, atsr->segment,
4023                                        atsru->devices, atsru->devices_cnt))
4024                                break;
4025                }
4026        }
4027        list_for_each_entry(satcu, &dmar_satc_units, list) {
4028                satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4029                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4030                        ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4031                                        (void *)satc + satc->header.length,
4032                                        satc->segment, satcu->devices,
4033                                        satcu->devices_cnt);
4034                        if (ret > 0)
4035                                break;
4036                        else if (ret < 0)
4037                                return ret;
4038                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4039                        if (dmar_remove_dev_scope(info, satc->segment,
4040                                        satcu->devices, satcu->devices_cnt))
4041                                break;
4042                }
4043        }
4044
4045        return 0;
4046}
4047
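    /*
     * Memory hotplug notifier: keep the static identity (si) domain in sync.
     * Ranges going online get an identity mapping; ranges going offline are
     * unmapped and the IOTLBs of all active IOMMUs are flushed.
     */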
4048static int intel_iommu_memory_notifier(struct notifier_block *nb,
4049                                       unsigned long val, void *v)
4050{
4051        struct memory_notify *mhp = v;
4052        unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4053        unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4054                        mhp->nr_pages - 1);
4055
4056        switch (val) {
4057        case MEM_GOING_ONLINE:
4058                if (iommu_domain_identity_map(si_domain,
4059                                              start_vpfn, last_vpfn)) {
4060                        pr_warn("Failed to build identity map for [%lx-%lx]\n",
4061                                start_vpfn, last_vpfn);
4062                        return NOTIFY_BAD;
4063                }
4064                break;
4065
4066        case MEM_OFFLINE:
4067        case MEM_CANCEL_ONLINE:
4068                {
4069                        struct dmar_drhd_unit *drhd;
4070                        struct intel_iommu *iommu;
4071                        struct page *freelist;
4072
4073                        freelist = domain_unmap(si_domain,
4074                                                start_vpfn, last_vpfn,
4075                                                NULL);
4076
4077                        rcu_read_lock();
4078                        for_each_active_iommu(iommu, drhd)
4079                                iommu_flush_iotlb_psi(iommu, si_domain,
4080                                        start_vpfn, mhp->nr_pages,
4081                                        !freelist, 0);
4082                        rcu_read_unlock();
4083                        dma_free_pagelist(freelist);
4084                }
4085                break;
4086        }
4087
4088        return NOTIFY_OK;
4089}
4090
4091static struct notifier_block intel_iommu_memory_nb = {
4092        .notifier_call = intel_iommu_memory_notifier,
4093        .priority = 0
4094};
4095
4096static void intel_disable_iommus(void)
4097{
4098        struct intel_iommu *iommu = NULL;
4099        struct dmar_drhd_unit *drhd;
4100
4101        for_each_iommu(iommu, drhd)
4102                iommu_disable_translation(iommu);
4103}
4104
4105void intel_iommu_shutdown(void)
4106{
4107        struct dmar_drhd_unit *drhd;
4108        struct intel_iommu *iommu = NULL;
4109
4110        if (no_iommu || dmar_disabled)
4111                return;
4112
4113        down_write(&dmar_global_lock);
4114
4115        /* Disable PMRs explicitly here. */
4116        for_each_iommu(iommu, drhd)
4117                iommu_disable_protect_mem_regions(iommu);
4118
4119        /* Make sure the IOMMUs are switched off */
4120        intel_disable_iommus();
4121
4122        up_write(&dmar_global_lock);
4123}
4124
4125static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4126{
4127        struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4128
4129        return container_of(iommu_dev, struct intel_iommu, iommu);
4130}
4131
4132static ssize_t version_show(struct device *dev,
4133                            struct device_attribute *attr, char *buf)
4134{
4135        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4136        u32 ver = readl(iommu->reg + DMAR_VER_REG);
4137        return sprintf(buf, "%d:%d\n",
4138                       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4139}
4140static DEVICE_ATTR_RO(version);
4141
4142static ssize_t address_show(struct device *dev,
4143                            struct device_attribute *attr, char *buf)
4144{
4145        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4146        return sprintf(buf, "%llx\n", iommu->reg_phys);
4147}
4148static DEVICE_ATTR_RO(address);
4149
4150static ssize_t cap_show(struct device *dev,
4151                        struct device_attribute *attr, char *buf)
4152{
4153        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4154        return sprintf(buf, "%llx\n", iommu->cap);
4155}
4156static DEVICE_ATTR_RO(cap);
4157
4158static ssize_t ecap_show(struct device *dev,
4159                         struct device_attribute *attr, char *buf)
4160{
4161        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4162        return sprintf(buf, "%llx\n", iommu->ecap);
4163}
4164static DEVICE_ATTR_RO(ecap);
4165
4166static ssize_t domains_supported_show(struct device *dev,
4167                                      struct device_attribute *attr, char *buf)
4168{
4169        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4170        return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4171}
4172static DEVICE_ATTR_RO(domains_supported);
4173
4174static ssize_t domains_used_show(struct device *dev,
4175                                 struct device_attribute *attr, char *buf)
4176{
4177        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4178        return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4179                                                  cap_ndoms(iommu->cap)));
4180}
4181static DEVICE_ATTR_RO(domains_used);
4182
4183static struct attribute *intel_iommu_attrs[] = {
4184        &dev_attr_version.attr,
4185        &dev_attr_address.attr,
4186        &dev_attr_cap.attr,
4187        &dev_attr_ecap.attr,
4188        &dev_attr_domains_supported.attr,
4189        &dev_attr_domains_used.attr,
4190        NULL,
4191};
4192
4193static struct attribute_group intel_iommu_group = {
4194        .name = "intel-iommu",
4195        .attrs = intel_iommu_attrs,
4196};
4197
4198const struct attribute_group *intel_iommu_groups[] = {
4199        &intel_iommu_group,
4200        NULL,
4201};
4202
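    /* Return true if any PCI device in the system is marked external-facing. */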
4203static inline bool has_external_pci(void)
4204{
4205        struct pci_dev *pdev = NULL;
4206
4207        for_each_pci_dev(pdev)
4208                if (pdev->external_facing)
4209                        return true;
4210
4211        return false;
4212}
4213
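    /*
     * Force the IOMMU on when the platform has opted in via the DMAR table
     * and external-facing PCI devices are present, even if the user disabled
     * it on the command line. Returns 1 if the IOMMU was force-enabled.
     */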
4214static int __init platform_optin_force_iommu(void)
4215{
4216        if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4217                return 0;
4218
4219        if (no_iommu || dmar_disabled)
4220                pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4221
4222        /*
4223         * If Intel-IOMMU is disabled by default, we will apply identity
4224         * map for all devices except those marked as being untrusted.
4225         */
4226        if (dmar_disabled)
4227                iommu_set_default_passthrough(false);
4228
4229        dmar_disabled = 0;
4230        no_iommu = 0;
4231
4232        return 1;
4233}
4234
4235static int __init probe_acpi_namespace_devices(void)
4236{
4237        struct dmar_drhd_unit *drhd;
4238        /* To avoid a -Wunused-but-set-variable warning. */
4239        struct intel_iommu *iommu __maybe_unused;
4240        struct device *dev;
4241        int i, ret = 0;
4242
4243        for_each_active_iommu(iommu, drhd) {
4244                for_each_active_dev_scope(drhd->devices,
4245                                          drhd->devices_cnt, i, dev) {
4246                        struct acpi_device_physical_node *pn;
4247                        struct iommu_group *group;
4248                        struct acpi_device *adev;
4249
4250                        if (dev->bus != &acpi_bus_type)
4251                                continue;
4252
4253                        adev = to_acpi_device(dev);
4254                        mutex_lock(&adev->physical_node_lock);
4255                        list_for_each_entry(pn,
4256                                            &adev->physical_node_list, node) {
4257                                group = iommu_group_get(pn->dev);
4258                                if (group) {
4259                                        iommu_group_put(group);
4260                                        continue;
4261                                }
4262
4263                                pn->dev->bus->iommu_ops = &intel_iommu_ops;
4264                                ret = iommu_probe_device(pn->dev);
4265                                if (ret)
4266                                        break;
4267                        }
4268                        mutex_unlock(&adev->physical_node_lock);
4269
4270                        if (ret)
4271                                return ret;
4272                }
4273        }
4274
4275        return 0;
4276}
4277
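    /*
     * intel_iommu_init - main VT-d entry point: parse the DMAR table and
     * device scopes, run init_dmars(), register sysfs entries, the iommu ops
     * and (where applicable) the memory hotplug notifier, and finally enable
     * DMA remapping on every unit that is neither ignored nor pre-enabled.
     */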
4278int __init intel_iommu_init(void)
4279{
4280        int ret = -ENODEV;
4281        struct dmar_drhd_unit *drhd;
4282        struct intel_iommu *iommu;
4283
4284        /*
4285         * Intel IOMMU is required for a TXT/tboot launch or platform
4286         * opt in, so enforce that.
4287         */
4288        force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4289                    platform_optin_force_iommu();
4290
4291        if (iommu_init_mempool()) {
4292                if (force_on)
4293                        panic("tboot: Failed to initialize iommu memory\n");
4294                return -ENOMEM;
4295        }
4296
4297        down_write(&dmar_global_lock);
4298        if (dmar_table_init()) {
4299                if (force_on)
4300                        panic("tboot: Failed to initialize DMAR table\n");
4301                goto out_free_dmar;
4302        }
4303
4304        if (dmar_dev_scope_init() < 0) {
4305                if (force_on)
4306                        panic("tboot: Failed to initialize DMAR device scope\n");
4307                goto out_free_dmar;
4308        }
4309
4310        up_write(&dmar_global_lock);
4311
4312        /*
4313         * The bus notifier takes the dmar_global_lock, so lockdep would
4314         * complain later if we registered it while holding the lock.
4315         */
4316        dmar_register_bus_notifier();
4317
4318        down_write(&dmar_global_lock);
4319
4320        if (!no_iommu)
4321                intel_iommu_debugfs_init();
4322
4323        if (no_iommu || dmar_disabled) {
4324                /*
4325                 * We exit the function here to ensure the IOMMU's remapping and
4326                 * mempool aren't set up, which means that the IOMMU's PMRs
4327                 * won't be disabled via the call to init_dmars(). So disable
4328                 * them explicitly here. The PMRs were set up by tboot prior to
4329                 * calling SENTER, but the kernel is expected to reset/tear
4330                 * down the PMRs.
4331                 */
4332                if (intel_iommu_tboot_noforce) {
4333                        for_each_iommu(iommu, drhd)
4334                                iommu_disable_protect_mem_regions(iommu);
4335                }
4336
4337                /*
4338                 * Make sure the IOMMUs are switched off, even when we
4339                 * boot into a kexec kernel and the previous kernel left
4340                 * them enabled
4341                 */
4342                intel_disable_iommus();
4343                goto out_free_dmar;
4344        }
4345
4346        if (list_empty(&dmar_rmrr_units))
4347                pr_info("No RMRR found\n");
4348
4349        if (list_empty(&dmar_atsr_units))
4350                pr_info("No ATSR found\n");
4351
4352        if (list_empty(&dmar_satc_units))
4353                pr_info("No SATC found\n");
4354
4355        if (dmar_map_gfx)
4356                intel_iommu_gfx_mapped = 1;
4357
4358        init_no_remapping_devices();
4359
4360        ret = init_dmars();
4361        if (ret) {
4362                if (force_on)
4363                        panic("tboot: Failed to initialize DMARs\n");
4364                pr_err("Initialization failed\n");
4365                goto out_free_dmar;
4366        }
4367        up_write(&dmar_global_lock);
4368
4369        init_iommu_pm_ops();
4370
4371        down_read(&dmar_global_lock);
4372        for_each_active_iommu(iommu, drhd) {
4373                /*
4374                 * The flush queue implementation does not perform
4375                 * page-selective invalidations that are required for efficient
4376                 * TLB flushes in virtual environments.  The benefit of batching
4377                 * is likely to be much lower than the overhead of synchronizing
4378                 * the virtual and physical IOMMU page-tables.
4379                 */
4380                if (cap_caching_mode(iommu->cap)) {
4381                        pr_info_once("IOMMU batching disallowed due to virtualization\n");
4382                        iommu_set_dma_strict();
4383                }
4384                iommu_device_sysfs_add(&iommu->iommu, NULL,
4385                                       intel_iommu_groups,
4386                                       "%s", iommu->name);
4387                iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4388        }
4389        up_read(&dmar_global_lock);
4390
4391        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4392        if (si_domain && !hw_pass_through)
4393                register_memory_notifier(&intel_iommu_memory_nb);
4394
4395        down_read(&dmar_global_lock);
4396        if (probe_acpi_namespace_devices())
4397                pr_warn("ACPI name space devices didn't probe correctly\n");
4398
4399        /* Finally, we enable the DMA remapping hardware. */
4400        for_each_iommu(iommu, drhd) {
4401                if (!drhd->ignored && !translation_pre_enabled(iommu))
4402                        iommu_enable_translation(iommu);
4403
4404                iommu_disable_protect_mem_regions(iommu);
4405        }
4406        up_read(&dmar_global_lock);
4407
4408        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4409
4410        intel_iommu_enabled = 1;
4411
4412        return 0;
4413
4414out_free_dmar:
4415        intel_iommu_free_dmars();
4416        up_write(&dmar_global_lock);
4417        iommu_exit_mempool();
4418        return ret;
4419}
4420
4421static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4422{
4423        struct device_domain_info *info = opaque;
4424
4425        domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4426        return 0;
4427}
4428
4429/*
4430 * NB - intel-iommu lacks any sort of reference counting for the users of
4431 * dependent devices.  If multiple endpoints have intersecting dependent
4432 * devices, unbinding the driver from any one of them will possibly leave
4433 * the others unable to operate.
4434 */
4435static void domain_context_clear(struct device_domain_info *info)
4436{
4437        if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4438                return;
4439
4440        pci_for_each_dma_alias(to_pci_dev(info->dev),
4441                               &domain_context_clear_one_cb, info);
4442}
4443
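    /*
     * Tear down the translation state of @info's device: tear down the
     * RID2PASID entry on scalable-mode IOMMUs, disable the device IOTLB,
     * clear the context entries and PASID table, and detach the domain from
     * the IOMMU. Caller must hold device_domain_lock.
     */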
4444static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4445{
4446        struct dmar_domain *domain;
4447        struct intel_iommu *iommu;
4448        unsigned long flags;
4449
4450        assert_spin_locked(&device_domain_lock);
4451
4452        if (WARN_ON(!info))
4453                return;
4454
4455        iommu = info->iommu;
4456        domain = info->domain;
4457
4458        if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4459                if (dev_is_pci(info->dev) && sm_supported(iommu))
4460                        intel_pasid_tear_down_entry(iommu, info->dev,
4461                                        PASID_RID2PASID, false);
4462
4463                iommu_disable_dev_iotlb(info);
4464                domain_context_clear(info);
4465                intel_pasid_free_table(info->dev);
4466        }
4467
4468        unlink_domain_info(info);
4469
4470        spin_lock_irqsave(&iommu->lock, flags);
4471        domain_detach_iommu(domain, iommu);
4472        spin_unlock_irqrestore(&iommu->lock, flags);
4473
4474        free_devinfo_mem(info);
4475}
4476
4477static void dmar_remove_one_dev_info(struct device *dev)
4478{
4479        struct device_domain_info *info;
4480        unsigned long flags;
4481
4482        spin_lock_irqsave(&device_domain_lock, flags);
4483        info = get_domain_info(dev);
4484        if (info)
4485                __dmar_remove_one_dev_info(info);
4486        spin_unlock_irqrestore(&device_domain_lock, flags);
4487}
4488
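    /*
     * Initialize a freshly allocated dmar_domain for the given guest address
     * width: compute the adjusted AGAW and allocate the top-level page
     * directory.
     */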
4489static int md_domain_init(struct dmar_domain *domain, int guest_width)
4490{
4491        int adjust_width;
4492
4493        /* calculate AGAW */
4494        domain->gaw = guest_width;
4495        adjust_width = guestwidth_to_adjustwidth(guest_width);
4496        domain->agaw = width_to_agaw(adjust_width);
4497
4498        domain->iommu_coherency = false;
4499        domain->iommu_snooping = false;
4500        domain->iommu_superpage = 0;
4501        domain->max_addr = 0;
4502
4503        /* always allocate the top pgd */
4504        domain->pgd = alloc_pgtable_page(domain->nid);
4505        if (!domain->pgd)
4506                return -ENOMEM;
4507        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4508        return 0;
4509}
4510
4511static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4512{
4513        struct dmar_domain *dmar_domain;
4514        struct iommu_domain *domain;
4515
4516        switch (type) {
4517        case IOMMU_DOMAIN_DMA:
4518        case IOMMU_DOMAIN_DMA_FQ:
4519        case IOMMU_DOMAIN_UNMANAGED:
4520                dmar_domain = alloc_domain(0);
4521                if (!dmar_domain) {
4522                        pr_err("Can't allocate dmar_domain\n");
4523                        return NULL;
4524                }
4525                if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4526                        pr_err("Domain initialization failed\n");
4527                        domain_exit(dmar_domain);
4528                        return NULL;
4529                }
4530
4531                domain = &dmar_domain->domain;
4532                domain->geometry.aperture_start = 0;
4533                domain->geometry.aperture_end   =
4534                                __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4535                domain->geometry.force_aperture = true;
4536
4537                return domain;
4538        case IOMMU_DOMAIN_IDENTITY:
4539                return &si_domain->domain;
4540        default:
4541                return NULL;
4542        }
4543
4544        return NULL;
4545}
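/*
 * Illustrative sketch (not part of the driver): the switch above is reached
 * through the generic IOMMU core, e.g. when a VFIO-style consumer asks for
 * an unmanaged domain. The aperture it sees is the one derived from
 * dmar_domain->gaw above.
 */
#if 0	/* example only */
static void example_alloc_unmanaged_domain(void)
{
	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);

	if (dom) {
		pr_info("aperture: 0x%llx-0x%llx\n",
			(u64)dom->geometry.aperture_start,
			(u64)dom->geometry.aperture_end);
		iommu_domain_free(dom);
	}
}
#endif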
4546
4547static void intel_iommu_domain_free(struct iommu_domain *domain)
4548{
4549        if (domain != &si_domain->domain)
4550                domain_exit(to_dmar_domain(domain));
4551}
4552
4553/*
4554 * Check whether a @domain could be attached to the @dev through the
4555 * aux-domain attach/detach APIs.
4556 */
4557static inline bool
4558is_aux_domain(struct device *dev, struct iommu_domain *domain)
4559{
4560        struct device_domain_info *info = get_domain_info(dev);
4561
4562        return info && info->auxd_enabled &&
4563                        domain->type == IOMMU_DOMAIN_UNMANAGED;
4564}
4565
4566static inline struct subdev_domain_info *
4567lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4568{
4569        struct subdev_domain_info *sinfo;
4570
4571        if (!list_empty(&domain->subdevices)) {
4572                list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4573                        if (sinfo->pdev == dev)
4574                                return sinfo;
4575                }
4576        }
4577
4578        return NULL;
4579}
4580
4581static int auxiliary_link_device(struct dmar_domain *domain,
4582                                 struct device *dev)
4583{
4584        struct device_domain_info *info = get_domain_info(dev);
4585        struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4586
4587        assert_spin_locked(&device_domain_lock);
4588        if (WARN_ON(!info))
4589                return -EINVAL;
4590
4591        if (!sinfo) {
4592                sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4593                if (!sinfo)
4594                        return -ENOMEM;
4595                sinfo->domain = domain;
4596                sinfo->pdev = dev;
4597                list_add(&sinfo->link_phys, &info->subdevices);
4598                list_add(&sinfo->link_domain, &domain->subdevices);
4599        }
4600
4601        return ++sinfo->users;
4602}
4603
4604static int auxiliary_unlink_device(struct dmar_domain *domain,
4605                                   struct device *dev)
4606{
4607        struct device_domain_info *info = get_domain_info(dev);
4608        struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4609        int ret;
4610
4611        assert_spin_locked(&device_domain_lock);
4612        if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4613                return -EINVAL;
4614
4615        ret = --sinfo->users;
4616        if (!ret) {
4617                list_del(&sinfo->link_phys);
4618                list_del(&sinfo->link_domain);
4619                kfree(sinfo);
4620        }
4621
4622        return ret;
4623}
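/*
 * Illustrative trace of the subdevice refcounting above, assuming two
 * subdevices of the same parent attach to one auxiliary domain:
 *
 *   auxiliary_link_device()   -> 1   first user, full attach is performed
 *   auxiliary_link_device()   -> 2   already attached, only the count bumps
 *   auxiliary_unlink_device() -> 1   still in use, nothing is torn down
 *   auxiliary_unlink_device() -> 0   sinfo freed, caller tears down the PASID
 */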
4624
4625static int aux_domain_add_dev(struct dmar_domain *domain,
4626                              struct device *dev)
4627{
4628        int ret;
4629        unsigned long flags;
4630        struct intel_iommu *iommu;
4631
4632        iommu = device_to_iommu(dev, NULL, NULL);
4633        if (!iommu)
4634                return -ENODEV;
4635
4636        if (domain->default_pasid <= 0) {
4637                u32 pasid;
4638
4639                /* No private data needed for the default pasid */
4640                pasid = ioasid_alloc(NULL, PASID_MIN,
4641                                     pci_max_pasids(to_pci_dev(dev)) - 1,
4642                                     NULL);
4643                if (pasid == INVALID_IOASID) {
4644                        pr_err("Can't allocate default pasid\n");
4645                        return -ENODEV;
4646                }
4647                domain->default_pasid = pasid;
4648        }
4649
4650        spin_lock_irqsave(&device_domain_lock, flags);
4651        ret = auxiliary_link_device(domain, dev);
4652        if (ret <= 0)
4653                goto link_failed;
4654
4655        /*
4656         * Subdevices from the same physical device can be attached to the
4657         * same domain. For such cases, only the first subdevice attachment
4658         * needs to go through the full steps in this function. So if ret >
4659         * 1, just goto out.
4660         */
4661        if (ret > 1)
4662                goto out;
4663
4664        /*
4665         * iommu->lock must be held to attach the domain to the iommu and to
4666         * set up the pasid entry for second level translation.
4667         */
4668        spin_lock(&iommu->lock);
4669        ret = domain_attach_iommu(domain, iommu);
4670        if (ret)
4671                goto attach_failed;
4672
4673        /* Set up the PASID entry for mediated devices: */
4674        if (domain_use_first_level(domain))
4675                ret = domain_setup_first_level(iommu, domain, dev,
4676                                               domain->default_pasid);
4677        else
4678                ret = intel_pasid_setup_second_level(iommu, domain, dev,
4679                                                     domain->default_pasid);
4680        if (ret)
4681                goto table_failed;
4682
4683        spin_unlock(&iommu->lock);
4684out:
4685        spin_unlock_irqrestore(&device_domain_lock, flags);
4686
4687        return 0;
4688
4689table_failed:
4690        domain_detach_iommu(domain, iommu);
4691attach_failed:
4692        spin_unlock(&iommu->lock);
4693        auxiliary_unlink_device(domain, dev);
4694link_failed:
4695        spin_unlock_irqrestore(&device_domain_lock, flags);
4696        if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4697                ioasid_put(domain->default_pasid);
4698
4699        return ret;
4700}
4701
4702static void aux_domain_remove_dev(struct dmar_domain *domain,
4703                                  struct device *dev)
4704{
4705        struct device_domain_info *info;
4706        struct intel_iommu *iommu;
4707        unsigned long flags;
4708
4709        if (!is_aux_domain(dev, &domain->domain))
4710                return;
4711
4712        spin_lock_irqsave(&device_domain_lock, flags);
4713        info = get_domain_info(dev);
4714        iommu = info->iommu;
4715
4716        if (!auxiliary_unlink_device(domain, dev)) {
4717                spin_lock(&iommu->lock);
4718                intel_pasid_tear_down_entry(iommu, dev,
4719                                            domain->default_pasid, false);
4720                domain_detach_iommu(domain, iommu);
4721                spin_unlock(&iommu->lock);
4722        }
4723
4724        spin_unlock_irqrestore(&device_domain_lock, flags);
4725
4726        if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4727                ioasid_put(domain->default_pasid);
4728}
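/*
 * Illustrative sketch (not part of the driver) of how a consumer such as a
 * vfio/mdev-style parent driver would reach the aux attach path above via
 * the generic API; the function and variable names are placeholders.
 */
#if 0	/* example only */
static int example_aux_attach(struct iommu_domain *domain, struct device *parent)
{
	if (iommu_dev_enable_feature(parent, IOMMU_DEV_FEAT_AUX))
		return -ENODEV;
	if (iommu_aux_attach_device(domain, parent))
		return -ENODEV;

	/* Program the returned default PASID into the subdevice's DMA. */
	return iommu_aux_get_pasid(domain, parent);
}
#endif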
4729
4730static int prepare_domain_attach_device(struct iommu_domain *domain,
4731                                        struct device *dev)
4732{
4733        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4734        struct intel_iommu *iommu;
4735        int addr_width;
4736
4737        iommu = device_to_iommu(dev, NULL, NULL);
4738        if (!iommu)
4739                return -ENODEV;
4740
4741        if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4742            !ecap_nest(iommu->ecap)) {
4743                dev_err(dev, "%s: iommu does not support nested translation\n",
4744                        iommu->name);
4745                return -EINVAL;
4746        }
4747
4748        /* check if this iommu agaw is sufficient for max mapped address */
4749        addr_width = agaw_to_width(iommu->agaw);
4750        if (addr_width > cap_mgaw(iommu->cap))
4751                addr_width = cap_mgaw(iommu->cap);
4752
4753        if (dmar_domain->max_addr > (1LL << addr_width)) {
4754                dev_err(dev,
4755                        "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4756                        __func__, addr_width, dmar_domain->max_addr);
4757                return -EFAULT;
4758        }
4759        dmar_domain->gaw = addr_width;
4760
4761        /*
4762         * Knock out extra levels of page tables if necessary
4763         */
4764        while (iommu->agaw < dmar_domain->agaw) {
4765                struct dma_pte *pte;
4766
4767                pte = dmar_domain->pgd;
4768                if (dma_pte_present(pte)) {
4769                        dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4770                        free_pgtable_page(pte);
4771                }
4772                dmar_domain->agaw--;
4773        }
4774
4775        return 0;
4776}
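/*
 * Illustrative example of the trimming loop above: a domain built for a
 * 57-bit address width uses agaw 3 (a 5-level table). Attached behind an
 * IOMMU that only supports agaw 2 (48 bits, 4 levels), the loop runs once,
 * frees the top-level pgd whose first entry points at the 4-level table
 * below it, and dmar_domain->agaw drops to 2.
 */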
4777
4778static int intel_iommu_attach_device(struct iommu_domain *domain,
4779                                     struct device *dev)
4780{
4781        int ret;
4782
4783        if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4784            device_is_rmrr_locked(dev)) {
4785                dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4786                return -EPERM;
4787        }
4788
4789        if (is_aux_domain(dev, domain))
4790                return -EPERM;
4791
4792        /* normally dev is not mapped */
4793        if (unlikely(domain_context_mapped(dev))) {
4794                struct dmar_domain *old_domain;
4795
4796                old_domain = find_domain(dev);
4797                if (old_domain)
4798                        dmar_remove_one_dev_info(dev);
4799        }
4800
4801        ret = prepare_domain_attach_device(domain, dev);
4802        if (ret)
4803                return ret;
4804
4805        return domain_add_dev_info(to_dmar_domain(domain), dev);
4806}
4807
4808static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4809                                         struct device *dev)
4810{
4811        int ret;
4812
4813        if (!is_aux_domain(dev, domain))
4814                return -EPERM;
4815
4816        ret = prepare_domain_attach_device(domain, dev);
4817        if (ret)
4818                return ret;
4819
4820        return aux_domain_add_dev(to_dmar_domain(domain), dev);
4821}
4822
4823static void intel_iommu_detach_device(struct iommu_domain *domain,
4824                                      struct device *dev)
4825{
4826        dmar_remove_one_dev_info(dev);
4827}
4828
4829static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4830                                          struct device *dev)
4831{
4832        aux_domain_remove_dev(to_dmar_domain(domain), dev);
4833}
4834
4835#ifdef CONFIG_INTEL_IOMMU_SVM
4836/*
4837 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4838 * VT-d granularity. Invalidation is typically included in the unmap operation
4839 * as a result of a DMA or VFIO unmap. However, for assigned devices the
4840 * guest owns the first-level page tables. Invalidations of translation
4841 * caches in the guest are trapped and passed down to the host.
4842 *
4843 * vIOMMU in the guest will only expose first-level page tables, therefore
4844 * we do not support IOTLB granularity for requests without PASID (second level).
4845 *
4846 * For example, to find the VT-d granularity encoding for IOTLB
4847 * type and page selective granularity within PASID:
4848 * X: indexed by iommu cache type
4849 * Y: indexed by enum iommu_inv_granularity
4850 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4851 */
4852
4853static const int
4854inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4855        /*
4856         * PASID based IOTLB invalidation: PASID selective (per PASID),
4857         * page selective (address granularity)
4858         */
4859        {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4860        /* PASID based dev TLBs */
4861        {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4862        /* PASID cache */
4863        {-EINVAL, -EINVAL, -EINVAL}
4864};
4865
4866static inline int to_vtd_granularity(int type, int granu)
4867{
4868        return inv_type_granu_table[type][granu];
4869}
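/*
 * Illustrative lookups into the table above (rows are indexed by the bit
 * position of the cache type, columns by enum iommu_inv_granularity):
 *
 *   IOTLB row, IOMMU_INV_GRANU_ADDR  -> QI_GRAN_PSI_PASID  (page selective
 *                                       within one PASID)
 *   IOTLB row, IOMMU_INV_GRANU_PASID -> QI_GRAN_NONG_PASID (all pages of
 *                                       one PASID)
 *   any PASID-cache row entry        -> -EINVAL, rejected by the caller
 */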
4870
4871static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4872{
4873        u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4874
4875        /* VT-d size is encoded as 2^size of 4K pages: 0 for 4K, 9 for 2MB, etc.
4876         * The IOMMU cache invalidate API passes granu_size in bytes and the
4877         * number of granules of that size in contiguous memory.
4878         */
4879        return order_base_2(nr_pages);
4880}
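/*
 * Illustrative encodings from the helper above, both describing 2MiB of
 * contiguous address space:
 *
 *   to_vtd_size(SZ_4K, 512) -> order_base_2(512) = 9
 *   to_vtd_size(SZ_2M, 1)   -> order_base_2(512) = 9
 */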
4881
4882static int
4883intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
4884                           struct iommu_cache_invalidate_info *inv_info)
4885{
4886        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4887        struct device_domain_info *info;
4888        struct intel_iommu *iommu;
4889        unsigned long flags;
4890        int cache_type;
4891        u8 bus, devfn;
4892        u16 did, sid;
4893        int ret = 0;
4894        u64 size = 0;
4895
4896        if (!inv_info || !dmar_domain)
4897                return -EINVAL;
4898
4899        if (!dev || !dev_is_pci(dev))
4900                return -ENODEV;
4901
4902        iommu = device_to_iommu(dev, &bus, &devfn);
4903        if (!iommu)
4904                return -ENODEV;
4905
4906        if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
4907                return -EINVAL;
4908
4909        spin_lock_irqsave(&device_domain_lock, flags);
4910        spin_lock(&iommu->lock);
4911        info = get_domain_info(dev);
4912        if (!info) {
4913                ret = -EINVAL;
4914                goto out_unlock;
4915        }
4916        did = dmar_domain->iommu_did[iommu->seq_id];
4917        sid = PCI_DEVID(bus, devfn);
4918
4919        /* Size is only valid in address selective invalidation */
4920        if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
4921                size = to_vtd_size(inv_info->granu.addr_info.granule_size,
4922                                   inv_info->granu.addr_info.nb_granules);
4923
4924        for_each_set_bit(cache_type,
4925                         (unsigned long *)&inv_info->cache,
4926                         IOMMU_CACHE_INV_TYPE_NR) {
4927                int granu = 0;
4928                u64 pasid = 0;
4929                u64 addr = 0;
4930
4931                granu = to_vtd_granularity(cache_type, inv_info->granularity);
4932                if (granu == -EINVAL) {
4933                        pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
4934                                           cache_type, inv_info->granularity);
4935                        break;
4936                }
4937
4938                /*
4939                 * PASID is stored in different locations based on the
4940                 * granularity.
4941                 */
4942                if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
4943                    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
4944                        pasid = inv_info->granu.pasid_info.pasid;
4945                else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4946                         (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
4947                        pasid = inv_info->granu.addr_info.pasid;
4948
4949                switch (BIT(cache_type)) {
4950                case IOMMU_CACHE_INV_TYPE_IOTLB:
4951                        /* HW will ignore the low-order bits covered by the address mask */
4952                        if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
4953                            size &&
4954                            (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
4955                                pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
4956                                                   inv_info->granu.addr_info.addr, size);
4957                        }
4958
4959                        /*
4960                         * If granu is PASID-selective, address is ignored.
4961                         * We use npages = -1 to indicate that.
4962                         */
4963                        qi_flush_piotlb(iommu, did, pasid,
4964                                        mm_to_dma_pfn(inv_info->granu.addr_info.addr),
4965                                        (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
4966                                        inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
4967
4968                        if (!info->ats_enabled)
4969                                break;
4970                        /*
4971                         * Always flush device IOTLB if ATS is enabled. vIOMMU
4972                         * in the guest may assume IOTLB flush is inclusive,
4973                         * which is more efficient.
4974                         */
4975                        fallthrough;
4976                case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
4977                        /*
4978                         * PASID-based device TLB invalidation does not support
4979                         * IOMMU_INV_GRANU_PASID granularity; it only supports
4980                         * IOMMU_INV_GRANU_ADDR. To emulate a PASID-selective
4981                         * invalidation we set the size to cover the entire
4982                         * 64-bit address range. The user provides only PASID
4983                         * info without address info, so we set addr to 0.
4984                         */
4985                        if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
4986                                size = 64 - VTD_PAGE_SHIFT;
4987                                addr = 0;
4988                        } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
4989                                addr = inv_info->granu.addr_info.addr;
4990                        }
4991
4992                        if (info->ats_enabled)
4993                                qi_flush_dev_iotlb_pasid(iommu, sid,
4994                                                info->pfsid, pasid,
4995                                                info->ats_qdep, addr,
4996                                                size);
4997                        else
4998                                pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
4999                        break;
5000                default:
5001                        dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5002                                            cache_type);
5003                        ret = -EINVAL;
5004                }
5005        }
5006out_unlock:
5007        spin_unlock(&iommu->lock);
5008        spin_unlock_irqrestore(&device_domain_lock, flags);
5009
5010        return ret;
5011}
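/*
 * Illustrative request as handled above (the field values are made up):
 * a page-selective IOTLB invalidation of 512 4KiB pages for one PASID,
 * which ends up as a single qi_flush_piotlb() with size order 9 (2MiB),
 * plus a device-IOTLB flush when ATS is enabled on the device.
 */
#if 0	/* example only */
static const struct iommu_cache_invalidate_info example_inv_info = {
	.cache		= IOMMU_CACHE_INV_TYPE_IOTLB,
	.granularity	= IOMMU_INV_GRANU_ADDR,
	.granu.addr_info = {
		.flags		= IOMMU_INV_ADDR_FLAGS_PASID,
		.pasid		= 2,
		.addr		= 0x100000,
		.granule_size	= SZ_4K,
		.nb_granules	= 512,
	},
};
#endif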
5012#endif
5013
5014static int intel_iommu_map(struct iommu_domain *domain,
5015                           unsigned long iova, phys_addr_t hpa,
5016                           size_t size, int iommu_prot, gfp_t gfp)
5017{
5018        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5019        u64 max_addr;
5020        int prot = 0;
5021
5022        if (iommu_prot & IOMMU_READ)
5023                prot |= DMA_PTE_READ;
5024        if (iommu_prot & IOMMU_WRITE)
5025                prot |= DMA_PTE_WRITE;
5026        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5027                prot |= DMA_PTE_SNP;
5028
5029        max_addr = iova + size;
5030        if (dmar_domain->max_addr < max_addr) {
5031                u64 end;
5032
5033                /* check if minimum agaw is sufficient for mapped address */
5034                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5035                if (end < max_addr) {
5036                        pr_err("%s: iommu width (%d) is not "
5037                               "sufficient for the mapped address (%llx)\n",
5038                               __func__, dmar_domain->gaw, max_addr);
5039                        return -EFAULT;
5040                }
5041                dmar_domain->max_addr = max_addr;
5042        }
5043        /* Round up size to next multiple of PAGE_SIZE, if it and
5044           the low bits of hpa would take us onto the next page */
5045        size = aligned_nrpages(hpa, size);
5046        return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5047                                hpa >> VTD_PAGE_SHIFT, size, prot);
5048}
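/*
 * Illustrative rounding for the call above: hpa = 0x1234 with size = 0x2000
 * touches bytes 0x1234-0x3233 and therefore three 4KiB pages, so
 * aligned_nrpages() hands __domain_mapping() a size of 3 pages, not 2.
 */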
5049
5050static int intel_iommu_map_pages(struct iommu_domain *domain,
5051                                 unsigned long iova, phys_addr_t paddr,
5052                                 size_t pgsize, size_t pgcount,
5053                                 int prot, gfp_t gfp, size_t *mapped)
5054{
5055        unsigned long pgshift = __ffs(pgsize);
5056        size_t size = pgcount << pgshift;
5057        int ret;
5058
5059        if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
5060                return -EINVAL;
5061
5062        if (!IS_ALIGNED(iova | paddr, pgsize))
5063                return -EINVAL;
5064
5065        ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
5066        if (!ret && mapped)
5067                *mapped = size;
5068
5069        return ret;
5070}
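/*
 * Illustrative size computation in the wrapper above: pgsize = SZ_2M and
 * pgcount = 4 give pgshift = 21 and size = 8MiB, and both iova and paddr
 * must be 2MiB aligned or the request is rejected with -EINVAL.
 */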
5071
5072static size_t intel_iommu_unmap(struct iommu_domain *domain,
5073                                unsigned long iova, size_t size,
5074                                struct iommu_iotlb_gather *gather)
5075{
5076        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5077        unsigned long start_pfn, last_pfn;
5078        int level = 0;
5079
5080        /* Cope with horrid API which requires us to unmap more than the
5081           size argument if it happens to be a large-page mapping. */
5082        BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5083
5084        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5085                size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5086
5087        start_pfn = iova >> VTD_PAGE_SHIFT;
5088        last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5089
5090        gather->freelist = domain_unmap(dmar_domain, start_pfn,
5091                                        last_pfn, gather->freelist);
5092
5093        if (dmar_domain->max_addr == iova + size)
5094                dmar_domain->max_addr = iova;
5095
5096        iommu_iotlb_gather_add_page(domain, gather, iova, size);
5097
5098        return size;
5099}
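/*
 * Illustrative widening by the code above: a 4KiB unmap request whose IOVA
 * lands inside a 2MiB superpage (a level 2 PTE) is grown to
 * VTD_PAGE_SIZE << level_to_offset_bits(2) = 2MiB before the page tables
 * and the gather window are updated.
 */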
5100
5101static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
5102                                      unsigned long iova,
5103                                      size_t pgsize, size_t pgcount,
5104                                      struct iommu_iotlb_gather *gather)
5105{
5106        unsigned long pgshift = __ffs(pgsize);
5107        size_t size = pgcount << pgshift;
5108
5109        return intel_iommu_unmap(domain, iova, size, gather);
5110}
5111
5112static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5113                                 struct iommu_iotlb_gather *gather)
5114{
5115        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5116        unsigned long iova_pfn = IOVA_PFN(gather->start);
5117        size_t size = gather->end - gather->start;
5118        unsigned long start_pfn;
5119        unsigned long nrpages;
5120        int iommu_id;
5121
5122        nrpages = aligned_nrpages(gather->start, size);
5123        start_pfn = mm_to_dma_pfn(iova_pfn);
5124
5125        for_each_domain_iommu(iommu_id, dmar_domain)
5126                iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5127                                      start_pfn, nrpages, !gather->freelist, 0);
5128
5129        dma_free_pagelist(gather->freelist);
5130}
5131
5132static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5133                                            dma_addr_t iova)
5134{
5135        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5136        struct dma_pte *pte;
5137        int level = 0;
5138        u64 phys = 0;
5139
5140        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5141        if (pte && dma_pte_present(pte))
5142                phys = dma_pte_addr(pte) +
5143                        (iova & (BIT_MASK(level_to_offset_bits(level) +
5144                                                VTD_PAGE_SHIFT) - 1));
5145
5146        return phys;
5147}
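/*
 * Illustrative translation by the helper above: if the walk stops at a
 * 2MiB superpage PTE (level 2), the low level_to_offset_bits(2) +
 * VTD_PAGE_SHIFT = 21 bits of the IOVA are kept as the offset into the
 * superpage and added to dma_pte_addr(pte); for a 4KiB leaf only the low
 * 12 bits remain.
 */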
5148
5149static bool intel_iommu_capable(enum iommu_cap cap)
5150{
5151        if (cap == IOMMU_CAP_CACHE_COHERENCY)
5152                return domain_update_iommu_snooping(NULL);
5153        if (cap == IOMMU_CAP_INTR_REMAP)
5154                return irq_remapping_enabled == 1;
5155
5156        return false;
5157}
5158
5159static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5160{
5161        struct intel_iommu *iommu;
5162
5163        iommu = device_to_iommu(dev, NULL, NULL);
5164        if (!iommu)
5165                return ERR_PTR(-ENODEV);
5166
5167        if (translation_pre_enabled(iommu))
5168                dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5169
5170        return &iommu->iommu;
5171}
5172
5173static void intel_iommu_release_device(struct device *dev)
5174{
5175        struct intel_iommu *iommu;
5176
5177        iommu = device_to_iommu(dev, NULL, NULL);
5178        if (!iommu)
5179                return;
5180
5181        dmar_remove_one_dev_info(dev);
5182
5183        set_dma_ops(dev, NULL);
5184}
5185
5186static void intel_iommu_probe_finalize(struct device *dev)
5187{
5188        set_dma_ops(dev, NULL);
5189        iommu_setup_dma_ops(dev, 0, U64_MAX);
5190}
5191
5192static void intel_iommu_get_resv_regions(struct device *device,
5193                                         struct list_head *head)
5194{
5195        int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5196        struct iommu_resv_region *reg;
5197        struct dmar_rmrr_unit *rmrr;
5198        struct device *i_dev;
5199        int i;
5200
5201        down_read(&dmar_global_lock);
5202        for_each_rmrr_units(rmrr) {
5203                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5204                                          i, i_dev) {
5205                        struct iommu_resv_region *resv;
5206                        enum iommu_resv_type type;
5207                        size_t length;
5208
5209                        if (i_dev != device &&
5210                            !is_downstream_to_pci_bridge(device, i_dev))
5211                                continue;
5212
5213                        length = rmrr->end_address - rmrr->base_address + 1;
5214
5215                        type = device_rmrr_is_relaxable(device) ?
5216                                IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5217
5218                        resv = iommu_alloc_resv_region(rmrr->base_address,
5219                                                       length, prot, type);
5220                        if (!resv)
5221                                break;
5222
5223                        list_add_tail(&resv->list, head);
5224                }
5225        }
5226        up_read(&dmar_global_lock);
5227
5228#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5229        if (dev_is_pci(device)) {
5230                struct pci_dev *pdev = to_pci_dev(device);
5231
5232                if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5233                        reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5234                                                   IOMMU_RESV_DIRECT_RELAXABLE);
5235                        if (reg)
5236                                list_add_tail(&reg->list, head);
5237                }
5238        }
5239#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5240
5241        reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5242                                      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5243                                      0, IOMMU_RESV_MSI);
5244        if (!reg)
5245                return;
5246        list_add_tail(&reg->list, head);
5247}
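/*
 * Illustrative result of the walk above for a typical endpoint: one
 * IOMMU_RESV_DIRECT (or DIRECT_RELAXABLE) region per matching RMRR, the
 * 0-16MiB relaxable region when the floppy workaround is built in and the
 * device is an ISA bridge, and always the IOAPIC_RANGE_START..
 * IOAPIC_RANGE_END window as an IOMMU_RESV_MSI region.
 */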
5248
5249int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5250{
5251        struct device_domain_info *info;
5252        struct context_entry *context;
5253        struct dmar_domain *domain;
5254        unsigned long flags;
5255        u64 ctx_lo;
5256        int ret;
5257
5258        domain = find_domain(dev);
5259        if (!domain)
5260                return -EINVAL;
5261
5262        spin_lock_irqsave(&device_domain_lock, flags);
5263        spin_lock(&iommu->lock);
5264
5265        ret = -EINVAL;
5266        info = get_domain_info(dev);
5267        if (!info || !info->pasid_supported)
5268                goto out;
5269
5270        context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5271        if (WARN_ON(!context))
5272                goto out;
5273
5274        ctx_lo = context[0].lo;
5275
5276        if (!(ctx_lo & CONTEXT_PASIDE)) {
5277                ctx_lo |= CONTEXT_PASIDE;
5278                context[0].lo = ctx_lo;
5279                wmb();
5280                iommu->flush.flush_context(iommu,
5281                                           domain->iommu_did[iommu->seq_id],
5282                                           PCI_DEVID(info->bus, info->devfn),
5283                                           DMA_CCMD_MASK_NOBIT,
5284                                           DMA_CCMD_DEVICE_INVL);
5285        }
5286
5287        /* Enable PASID support in the device, if it wasn't already */
5288        if (!info->pasid_enabled)
5289                iommu_enable_dev_iotlb(info);
5290
5291        ret = 0;
5292
5293 out:
5294        spin_unlock(&iommu->lock);
5295        spin_unlock_irqrestore(&device_domain_lock, flags);
5296
5297        return ret;
5298}
5299
5300static struct iommu_group *intel_iommu_device_group(struct device *dev)
5301{
5302        if (dev_is_pci(dev))
5303                return pci_device_group(dev);
5304        return generic_device_group(dev);
5305}
5306
5307static int intel_iommu_enable_auxd(struct device *dev)
5308{
5309        struct device_domain_info *info;
5310        struct intel_iommu *iommu;
5311        unsigned long flags;
5312        int ret;
5313
5314        iommu = device_to_iommu(dev, NULL, NULL);
5315        if (!iommu || dmar_disabled)
5316                return -EINVAL;
5317
5318        if (!sm_supported(iommu) || !pasid_supported(iommu))
5319                return -EINVAL;
5320
5321        ret = intel_iommu_enable_pasid(iommu, dev);
5322        if (ret)
5323                return -ENODEV;
5324
5325        spin_lock_irqsave(&device_domain_lock, flags);
5326        info = get_domain_info(dev);
5327        info->auxd_enabled = 1;
5328        spin_unlock_irqrestore(&device_domain_lock, flags);
5329
5330        return 0;
5331}
5332
5333static int intel_iommu_disable_auxd(struct device *dev)
5334{
5335        struct device_domain_info *info;
5336        unsigned long flags;
5337
5338        spin_lock_irqsave(&device_domain_lock, flags);
5339        info = get_domain_info(dev);
5340        if (!WARN_ON(!info))
5341                info->auxd_enabled = 0;
5342        spin_unlock_irqrestore(&device_domain_lock, flags);
5343
5344        return 0;
5345}
5346
5347static int intel_iommu_enable_sva(struct device *dev)
5348{
5349        struct device_domain_info *info = get_domain_info(dev);
5350        struct intel_iommu *iommu;
5351        int ret;
5352
5353        if (!info || dmar_disabled)
5354                return -EINVAL;
5355
5356        iommu = info->iommu;
5357        if (!iommu)
5358                return -EINVAL;
5359
5360        if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5361                return -ENODEV;
5362
5363        if (intel_iommu_enable_pasid(iommu, dev))
5364                return -ENODEV;
5365
5366        if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5367                return -EINVAL;
5368
5369        ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5370        if (!ret)
5371                ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5372
5373        return ret;
5374}
5375
5376static int intel_iommu_disable_sva(struct device *dev)
5377{
5378        struct device_domain_info *info = get_domain_info(dev);
5379        struct intel_iommu *iommu = info->iommu;
5380        int ret;
5381
5382        ret = iommu_unregister_device_fault_handler(dev);
5383        if (!ret)
5384                ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5385
5386        return ret;
5387}
5388
5389/*
5390 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5391 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5392 * spec for system software and tools to detect endpoint devices supporting
5393 * Intel Scalable I/O Virtualization without a host driver dependency.
5394 *
5395 * Returns the config space offset of the matching extended capability
5396 * structure within the device's PCI configuration space, or 0 if the
5397 * device does not support it.
5398 */
5399static int siov_find_pci_dvsec(struct pci_dev *pdev)
5400{
5401        int pos;
5402        u16 vendor, id;
5403
5404        pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5405        while (pos) {
5406                pci_read_config_word(pdev, pos + 4, &vendor);
5407                pci_read_config_word(pdev, pos + 8, &id);
5408                if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5409                        return pos;
5410
5411                pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5412        }
5413
5414        return 0;
5415}
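/*
 * Illustrative decode of the walk above: 0x23 is the extended capability ID
 * of a Designated Vendor-Specific Extended Capability (DVSEC); the word at
 * pos + 4 is the DVSEC vendor ID and the word at pos + 8 is the DVSEC ID,
 * so the loop stops at the first Intel-owned DVSEC whose ID is 5, i.e. the
 * Scalable IOV DVSEC described in the comment above.
 */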
5416
5417static bool
5418intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5419{
5420        struct device_domain_info *info = get_domain_info(dev);
5421
5422        if (feat == IOMMU_DEV_FEAT_AUX) {
5423                int ret;
5424
5425                if (!dev_is_pci(dev) || dmar_disabled ||
5426                    !scalable_mode_support() || !pasid_mode_support())
5427                        return false;
5428
5429                ret = pci_pasid_features(to_pci_dev(dev));
5430                if (ret < 0)
5431                        return false;
5432
5433                return !!siov_find_pci_dvsec(to_pci_dev(dev));
5434        }
5435
5436        if (feat == IOMMU_DEV_FEAT_IOPF)
5437                return info && info->pri_supported;
5438
5439        if (feat == IOMMU_DEV_FEAT_SVA)
5440                return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5441                        info->pasid_supported && info->pri_supported &&
5442                        info->ats_supported;
5443
5444        return false;
5445}
5446
5447static int
5448intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5449{
5450        switch (feat) {
5451        case IOMMU_DEV_FEAT_AUX:
5452                return intel_iommu_enable_auxd(dev);
5453
5454        case IOMMU_DEV_FEAT_IOPF:
5455                return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;
5456
5457        case IOMMU_DEV_FEAT_SVA:
5458                return intel_iommu_enable_sva(dev);
5459
5460        default:
5461                return -ENODEV;
5462        }
5463}
5464
5465static int
5466intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5467{
5468        switch (feat) {
5469        case IOMMU_DEV_FEAT_AUX:
5470                return intel_iommu_disable_auxd(dev);
5471
5472        case IOMMU_DEV_FEAT_IOPF:
5473                return 0;
5474
5475        case IOMMU_DEV_FEAT_SVA:
5476                return intel_iommu_disable_sva(dev);
5477
5478        default:
5479                return -ENODEV;
5480        }
5481}
5482
5483static bool
5484intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5485{
5486        struct device_domain_info *info = get_domain_info(dev);
5487
5488        if (feat == IOMMU_DEV_FEAT_AUX)
5489                return scalable_mode_support() && info && info->auxd_enabled;
5490
5491        return false;
5492}
5493
5494static int
5495intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5496{
5497        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5498
5499        return dmar_domain->default_pasid > 0 ?
5500                        dmar_domain->default_pasid : -EINVAL;
5501}
5502
5503static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5504                                           struct device *dev)
5505{
5506        return attach_deferred(dev);
5507}
5508
5509static int
5510intel_iommu_enable_nesting(struct iommu_domain *domain)
5511{
5512        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5513        unsigned long flags;
5514        int ret = -ENODEV;
5515
5516        spin_lock_irqsave(&device_domain_lock, flags);
5517        if (list_empty(&dmar_domain->devices)) {
5518                dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5519                dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5520                ret = 0;
5521        }
5522        spin_unlock_irqrestore(&device_domain_lock, flags);
5523
5524        return ret;
5525}
5526
5527/*
5528 * Check that the device does not live on an external-facing PCI port that is
5529 * marked as untrusted. Such devices should not have IOMMU quirks applied to
5530 * them, so that they cannot bypass the IOMMU restrictions.
5531 */
5532static bool risky_device(struct pci_dev *pdev)
5533{
5534        if (pdev->untrusted) {
5535                pci_info(pdev,
5536                         "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5537                         pdev->vendor, pdev->device);
5538                pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5539                return true;
5540        }
5541        return false;
5542}
5543
5544static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5545                                       unsigned long iova, size_t size)
5546{
5547        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5548        unsigned long pages = aligned_nrpages(iova, size);
5549        unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5550        struct intel_iommu *iommu;
5551        int iommu_id;
5552
5553        for_each_domain_iommu(iommu_id, dmar_domain) {
5554                iommu = g_iommus[iommu_id];
5555                __mapping_notify_one(iommu, dmar_domain, pfn, pages);
5556        }
5557}
5558
5559const struct iommu_ops intel_iommu_ops = {
5560        .capable                = intel_iommu_capable,
5561        .domain_alloc           = intel_iommu_domain_alloc,
5562        .domain_free            = intel_iommu_domain_free,
5563        .enable_nesting         = intel_iommu_enable_nesting,
5564        .attach_dev             = intel_iommu_attach_device,
5565        .detach_dev             = intel_iommu_detach_device,
5566        .aux_attach_dev         = intel_iommu_aux_attach_device,
5567        .aux_detach_dev         = intel_iommu_aux_detach_device,
5568        .aux_get_pasid          = intel_iommu_aux_get_pasid,
5569        .map_pages              = intel_iommu_map_pages,
5570        .unmap_pages            = intel_iommu_unmap_pages,
5571        .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
5572        .flush_iotlb_all        = intel_flush_iotlb_all,
5573        .iotlb_sync             = intel_iommu_tlb_sync,
5574        .iova_to_phys           = intel_iommu_iova_to_phys,
5575        .probe_device           = intel_iommu_probe_device,
5576        .probe_finalize         = intel_iommu_probe_finalize,
5577        .release_device         = intel_iommu_release_device,
5578        .get_resv_regions       = intel_iommu_get_resv_regions,
5579        .put_resv_regions       = generic_iommu_put_resv_regions,
5580        .device_group           = intel_iommu_device_group,
5581        .dev_has_feat           = intel_iommu_dev_has_feat,
5582        .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5583        .dev_enable_feat        = intel_iommu_dev_enable_feat,
5584        .dev_disable_feat       = intel_iommu_dev_disable_feat,
5585        .is_attach_deferred     = intel_iommu_is_attach_deferred,
5586        .def_domain_type        = device_def_domain_type,
5587        .pgsize_bitmap          = SZ_4K,
5588#ifdef CONFIG_INTEL_IOMMU_SVM
5589        .cache_invalidate       = intel_iommu_sva_invalidate,
5590        .sva_bind_gpasid        = intel_svm_bind_gpasid,
5591        .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
5592        .sva_bind               = intel_svm_bind,
5593        .sva_unbind             = intel_svm_unbind,
5594        .sva_get_pasid          = intel_svm_get_pasid,
5595        .page_response          = intel_svm_page_response,
5596#endif
5597};
5598
5599static void quirk_iommu_igfx(struct pci_dev *dev)
5600{
5601        if (risky_device(dev))
5602                return;
5603
5604        pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5605        dmar_map_gfx = 0;
5606}
5607
5608/* G4x/GM45 integrated gfx dmar support is totally busted. */
5609DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5610DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5611DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5612DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5613DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5614DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5615DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5616
5617/* Broadwell igfx malfunctions with dmar */
5618DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5619DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5620DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5621DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5622DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5623DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5624DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5625DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5626DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5627DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5628DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5629DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5630DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5631DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5632DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5633DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5634DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5635DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5636DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5637DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5638DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5639DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5640DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5641DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5642
5643static void quirk_iommu_rwbf(struct pci_dev *dev)
5644{
5645        if (risky_device(dev))
5646                return;
5647
5648        /*
5649         * Mobile 4 Series Chipset neglects to set RWBF capability,
5650         * but needs it. Same seems to hold for the desktop versions.
5651         */
5652        pci_info(dev, "Forcing write-buffer flush capability\n");
5653        rwbf_quirk = 1;
5654}
5655
5656DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5657DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5658DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5659DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5660DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5661DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5662DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5663
5664#define GGC 0x52
5665#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5666#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5667#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5668#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5669#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5670#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5671#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5672#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5673
5674static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5675{
5676        unsigned short ggc;
5677
5678        if (risky_device(dev))
5679                return;
5680
5681        if (pci_read_config_word(dev, GGC, &ggc))
5682                return;
5683
5684        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5685                pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5686                dmar_map_gfx = 0;
5687        } else if (dmar_map_gfx) {
5688                /* we have to ensure the gfx device is idle before we flush */
5689                pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5690                iommu_set_dma_strict();
5691        }
5692}
5693DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5694DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5695DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5696DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5697
5698static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5699{
5700        unsigned short ver;
5701
5702        if (!IS_GFX_DEVICE(dev))
5703                return;
5704
5705        ver = (dev->device >> 8) & 0xff;
5706        if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5707            ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5708            ver != 0x9a)
5709                return;
5710
5711        if (risky_device(dev))
5712                return;
5713
5714        pci_info(dev, "Skip IOMMU disabling for graphics\n");
5715        iommu_skip_te_disable = 1;
5716}
5717DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5718
5719/*
5720 * On Tylersburg chipsets, some BIOSes have been known to enable the ISOCH
5721 * DMAR unit for the Azalia sound device, but not give it any TLB entries,
5722 * which causes it to deadlock. Check for that. We do this in a function
5723 * called from init_dmars(), instead of in a PCI quirk, because we don't want
5724 * to print the obnoxious "BIOS broken" message if VT-d is actually disabled.
5725 */
5726static void __init check_tylersburg_isoch(void)
5727{
5728        struct pci_dev *pdev;
5729        uint32_t vtisochctrl;
5730
5731        /* If there's no Azalia in the system anyway, forget it. */
5732        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5733        if (!pdev)
5734                return;
5735
5736        if (risky_device(pdev)) {
5737                pci_dev_put(pdev);
5738                return;
5739        }
5740
5741        pci_dev_put(pdev);
5742
5743        /* System Management Registers. Might be hidden, in which case
5744           we can't do the sanity check. But that's OK, because the
5745           known-broken BIOSes _don't_ actually hide it, so far. */
5746        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5747        if (!pdev)
5748                return;
5749
5750        if (risky_device(pdev)) {
5751                pci_dev_put(pdev);
5752                return;
5753        }
5754
5755        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5756                pci_dev_put(pdev);
5757                return;
5758        }
5759
5760        pci_dev_put(pdev);
5761
5762        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5763        if (vtisochctrl & 1)
5764                return;
5765
5766        /* Drop all bits other than the number of TLB entries */
5767        vtisochctrl &= 0x1c;
5768
5769        /* If we have the recommended number of TLB entries (16), fine. */
5770        if (vtisochctrl == 0x10)
5771                return;
5772
5773        /* Zero TLB entries? That BIOS is definitely broken. */
5774        if (!vtisochctrl) {
5775                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5776                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5777                     dmi_get_system_info(DMI_BIOS_VENDOR),
5778                     dmi_get_system_info(DMI_BIOS_VERSION),
5779                     dmi_get_system_info(DMI_PRODUCT_VERSION));
5780                iommu_identity_mapping |= IDENTMAP_AZALIA;
5781                return;
5782        }
5783
5784        pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5785               vtisochctrl);
5786}
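/*
 * Illustrative decode of vtisochctrl as used above: bit 0 set means Azalia
 * DMA is routed to the non-isoch DMAR unit and nothing needs checking;
 * otherwise the value masked with 0x1c is the isoch unit's TLB allocation,
 * where 0x10 (16 entries) is the recommended setting, zero forces the
 * Azalia identity mapping via IDENTMAP_AZALIA, and anything else only
 * produces the pr_warn() above.
 */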
5787