linux/drivers/iommu/intel/iommu.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/init.h>
  17#include <linux/bitmap.h>
  18#include <linux/debugfs.h>
  19#include <linux/export.h>
  20#include <linux/slab.h>
  21#include <linux/irq.h>
  22#include <linux/interrupt.h>
  23#include <linux/spinlock.h>
  24#include <linux/pci.h>
  25#include <linux/dmar.h>
  26#include <linux/dma-map-ops.h>
  27#include <linux/mempool.h>
  28#include <linux/memory.h>
  29#include <linux/cpu.h>
  30#include <linux/timer.h>
  31#include <linux/io.h>
  32#include <linux/iova.h>
  33#include <linux/iommu.h>
  34#include <linux/dma-iommu.h>
  35#include <linux/intel-iommu.h>
  36#include <linux/intel-svm.h>
  37#include <linux/syscore_ops.h>
  38#include <linux/tboot.h>
  39#include <linux/dmi.h>
  40#include <linux/pci-ats.h>
  41#include <linux/memblock.h>
  42#include <linux/dma-direct.h>
  43#include <linux/crash_dump.h>
  44#include <linux/numa.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#include "../irq_remapping.h"
  50#include "../iommu-sva-lib.h"
  51#include "pasid.h"
  52#include "cap_audit.h"
  53
  54#define ROOT_SIZE               VTD_PAGE_SIZE
  55#define CONTEXT_SIZE            VTD_PAGE_SIZE
  56
  57#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  58#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  59#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  60#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  61
  62#define IOAPIC_RANGE_START      (0xfee00000)
  63#define IOAPIC_RANGE_END        (0xfeefffff)
  64#define IOVA_START_ADDR         (0x1000)
  65
  66#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  67
  68#define MAX_AGAW_WIDTH 64
  69#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  70
  71#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
  72#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
  73
  74/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  75   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  76#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  77                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  78#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
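
/*
 * Worked example: with gaw = 48 and VTD_PAGE_SHIFT = 12,
 * __DOMAIN_MAX_PFN(48) = 2^36 - 1.  On 64-bit builds DOMAIN_MAX_PFN()
 * returns the same value; on 32-bit builds it is clamped to ULONG_MAX so
 * that PFNs always fit in an unsigned long.  DOMAIN_MAX_ADDR() shifts the
 * (unclamped) PFN limit back into a byte address.
 */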
  79
  80/* IO virtual address start page frame number */
  81#define IOVA_START_PFN          (1)
  82
  83#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  84
  85/* page table handling */
  86#define LEVEL_STRIDE            (9)
  87#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  88
  89static inline int agaw_to_level(int agaw)
  90{
  91        return agaw + 2;
  92}
  93
  94static inline int agaw_to_width(int agaw)
  95{
  96        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
  97}
  98
  99static inline int width_to_agaw(int width)
 100{
 101        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 102}
 103
 104static inline unsigned int level_to_offset_bits(int level)
 105{
 106        return (level - 1) * LEVEL_STRIDE;
 107}
 108
 109static inline int pfn_level_offset(u64 pfn, int level)
 110{
 111        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 112}
 113
 114static inline u64 level_mask(int level)
 115{
 116        return -1ULL << level_to_offset_bits(level);
 117}
 118
 119static inline u64 level_size(int level)
 120{
 121        return 1ULL << level_to_offset_bits(level);
 122}
 123
 124static inline u64 align_to_level(u64 pfn, int level)
 125{
 126        return (pfn + level_size(level) - 1) & level_mask(level);
 127}
 128
 129static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 130{
 131        return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 132}
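
/*
 * Worked example for the helpers above: agaw 2 corresponds to a 4-level
 * table (agaw_to_level(2) == 4) and a 48-bit address width
 * (agaw_to_width(2) == 48, width_to_agaw(48) == 2).  At level 3,
 * level_to_offset_bits() is 18, so pfn_level_offset() selects PFN bits
 * 26:18 and level_size(3) covers 2^18 4KiB pages, i.e. 1GiB of IOVA space.
 */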
 133
  134/* VT-d pages must never be larger than MM pages. Otherwise things
  135   are never going to work. */
 136static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 137{
 138        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 139}
 140
 141static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 142{
 143        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 144}
 145static inline unsigned long page_to_dma_pfn(struct page *pg)
 146{
 147        return mm_to_dma_pfn(page_to_pfn(pg));
 148}
 149static inline unsigned long virt_to_dma_pfn(void *p)
 150{
 151        return page_to_dma_pfn(virt_to_page(p));
 152}
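
/*
 * With 4KiB MM pages, PAGE_SHIFT == VTD_PAGE_SHIFT and the conversions
 * above are the identity.  With larger MM pages (e.g. 64KiB), one MM PFN
 * corresponds to several consecutive VT-d PFNs.
 */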
 153
 154/* global iommu list, set NULL for ignored DMAR units */
 155static struct intel_iommu **g_iommus;
 156
 157static void __init check_tylersburg_isoch(void);
 158static int rwbf_quirk;
 159static inline struct device_domain_info *
 160dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
 161
 162/*
  163 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
  164 * (used when the kernel is launched with TXT).
 165 */
 166static int force_on = 0;
 167static int intel_iommu_tboot_noforce;
 168static int no_platform_optin;
 169
 170#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 171
 172/*
 173 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 174 * if marked present.
 175 */
 176static phys_addr_t root_entry_lctp(struct root_entry *re)
 177{
 178        if (!(re->lo & 1))
 179                return 0;
 180
 181        return re->lo & VTD_PAGE_MASK;
 182}
 183
 184/*
 185 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 186 * if marked present.
 187 */
 188static phys_addr_t root_entry_uctp(struct root_entry *re)
 189{
 190        if (!(re->hi & 1))
 191                return 0;
 192
 193        return re->hi & VTD_PAGE_MASK;
 194}
 195
 196static inline void context_clear_pasid_enable(struct context_entry *context)
 197{
 198        context->lo &= ~(1ULL << 11);
 199}
 200
 201static inline bool context_pasid_enabled(struct context_entry *context)
 202{
 203        return !!(context->lo & (1ULL << 11));
 204}
 205
 206static inline void context_set_copied(struct context_entry *context)
 207{
 208        context->hi |= (1ull << 3);
 209}
 210
 211static inline bool context_copied(struct context_entry *context)
 212{
 213        return !!(context->hi & (1ULL << 3));
 214}
 215
 216static inline bool __context_present(struct context_entry *context)
 217{
 218        return (context->lo & 1);
 219}
 220
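/*
 * Entries inherited from a previous kernel (e.g. across a kdump handover)
 * carry the "copied" marker set above.  In legacy mode such an entry is
 * reported as not present so that it gets re-established, while
 * PASID-enabled entries rely on the present bit alone.
 */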
 221bool context_present(struct context_entry *context)
 222{
 223        return context_pasid_enabled(context) ?
 224             __context_present(context) :
 225             __context_present(context) && !context_copied(context);
 226}
 227
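/*
 * The helpers below manipulate the legacy (non-scalable) context entry
 * layout: the low qword holds the present bit (0), fault processing
 * disable (1), the translation type (3:2) and the address space root
 * (63:12); the high qword holds the address width (2:0) and the
 * domain id (23:8).
 */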
 228static inline void context_set_present(struct context_entry *context)
 229{
 230        context->lo |= 1;
 231}
 232
 233static inline void context_set_fault_enable(struct context_entry *context)
 234{
 235        context->lo &= (((u64)-1) << 2) | 1;
 236}
 237
 238static inline void context_set_translation_type(struct context_entry *context,
 239                                                unsigned long value)
 240{
 241        context->lo &= (((u64)-1) << 4) | 3;
 242        context->lo |= (value & 3) << 2;
 243}
 244
 245static inline void context_set_address_root(struct context_entry *context,
 246                                            unsigned long value)
 247{
 248        context->lo &= ~VTD_PAGE_MASK;
 249        context->lo |= value & VTD_PAGE_MASK;
 250}
 251
 252static inline void context_set_address_width(struct context_entry *context,
 253                                             unsigned long value)
 254{
 255        context->hi |= value & 7;
 256}
 257
 258static inline void context_set_domain_id(struct context_entry *context,
 259                                         unsigned long value)
 260{
 261        context->hi |= (value & ((1 << 16) - 1)) << 8;
 262}
 263
 264static inline int context_domain_id(struct context_entry *c)
 265{
 266        return((c->hi >> 8) & 0xffff);
 267}
 268
 269static inline void context_clear_entry(struct context_entry *context)
 270{
 271        context->lo = 0;
 272        context->hi = 0;
 273}
 274
 275/*
  276 * This domain is a static identity-mapping domain.
  277 *      1. This domain creates a static 1:1 mapping of all usable memory.
  278 *      2. It maps to each iommu if successful.
  279 *      3. Each iommu maps to this domain if successful.
 280 */
 281static struct dmar_domain *si_domain;
 282static int hw_pass_through = 1;
 283
 284#define for_each_domain_iommu(idx, domain)                      \
 285        for (idx = 0; idx < g_num_of_iommus; idx++)             \
 286                if (domain->iommu_refcnt[idx])
 287
 288struct dmar_rmrr_unit {
 289        struct list_head list;          /* list of rmrr units   */
 290        struct acpi_dmar_header *hdr;   /* ACPI header          */
 291        u64     base_address;           /* reserved base address*/
 292        u64     end_address;            /* reserved end address */
 293        struct dmar_dev_scope *devices; /* target devices */
 294        int     devices_cnt;            /* target device count */
 295};
 296
 297struct dmar_atsr_unit {
 298        struct list_head list;          /* list of ATSR units */
 299        struct acpi_dmar_header *hdr;   /* ACPI header */
 300        struct dmar_dev_scope *devices; /* target devices */
 301        int devices_cnt;                /* target device count */
 302        u8 include_all:1;               /* include all ports */
 303};
 304
 305struct dmar_satc_unit {
 306        struct list_head list;          /* list of SATC units */
 307        struct acpi_dmar_header *hdr;   /* ACPI header */
 308        struct dmar_dev_scope *devices; /* target devices */
 309        struct intel_iommu *iommu;      /* the corresponding iommu */
 310        int devices_cnt;                /* target device count */
 311        u8 atc_required:1;              /* ATS is required */
 312};
 313
 314static LIST_HEAD(dmar_atsr_units);
 315static LIST_HEAD(dmar_rmrr_units);
 316static LIST_HEAD(dmar_satc_units);
 317
 318#define for_each_rmrr_units(rmrr) \
 319        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 320
  321/* number of IOMMUs; used to size and index g_iommus */
 322static int g_num_of_iommus;
 323
 324static void domain_exit(struct dmar_domain *domain);
 325static void domain_remove_dev_info(struct dmar_domain *domain);
 326static void dmar_remove_one_dev_info(struct device *dev);
 327static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 328static int intel_iommu_attach_device(struct iommu_domain *domain,
 329                                     struct device *dev);
 330static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 331                                            dma_addr_t iova);
 332
 333int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
 334int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
 335
 336int intel_iommu_enabled = 0;
 337EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 338
 339static int dmar_map_gfx = 1;
 340static int intel_iommu_superpage = 1;
 341static int iommu_identity_mapping;
 342static int iommu_skip_te_disable;
 343
 344#define IDENTMAP_GFX            2
 345#define IDENTMAP_AZALIA         4
 346
 347int intel_iommu_gfx_mapped;
 348EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 349
 350#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
 351struct device_domain_info *get_domain_info(struct device *dev)
 352{
 353        struct device_domain_info *info;
 354
 355        if (!dev)
 356                return NULL;
 357
 358        info = dev_iommu_priv_get(dev);
 359        if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
 360                return NULL;
 361
 362        return info;
 363}
 364
 365DEFINE_SPINLOCK(device_domain_lock);
 366static LIST_HEAD(device_domain_list);
 367
 368/*
 369 * Iterate over elements in device_domain_list and call the specified
 370 * callback @fn against each element.
 371 */
 372int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 373                                     void *data), void *data)
 374{
 375        int ret = 0;
 376        unsigned long flags;
 377        struct device_domain_info *info;
 378
 379        spin_lock_irqsave(&device_domain_lock, flags);
 380        list_for_each_entry(info, &device_domain_list, global) {
 381                ret = fn(info, data);
 382                if (ret) {
 383                        spin_unlock_irqrestore(&device_domain_lock, flags);
 384                        return ret;
 385                }
 386        }
 387        spin_unlock_irqrestore(&device_domain_lock, flags);
 388
 389        return 0;
 390}
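
/*
 * Illustrative example (not part of the driver): a caller could count the
 * registered devices like this; a non-zero return from the callback stops
 * the walk early and is propagated back to the caller:
 *
 *	static int count_one(struct device_domain_info *info, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int count = 0;
 *	for_each_device_domain(count_one, &count);
 */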
 391
 392const struct iommu_ops intel_iommu_ops;
 393
 394static bool translation_pre_enabled(struct intel_iommu *iommu)
 395{
 396        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 397}
 398
 399static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 400{
 401        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 402}
 403
 404static void init_translation_status(struct intel_iommu *iommu)
 405{
 406        u32 gsts;
 407
 408        gsts = readl(iommu->reg + DMAR_GSTS_REG);
 409        if (gsts & DMA_GSTS_TES)
 410                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 411}
 412
 413static int __init intel_iommu_setup(char *str)
 414{
 415        if (!str)
 416                return -EINVAL;
 417
 418        while (*str) {
 419                if (!strncmp(str, "on", 2)) {
 420                        dmar_disabled = 0;
 421                        pr_info("IOMMU enabled\n");
 422                } else if (!strncmp(str, "off", 3)) {
 423                        dmar_disabled = 1;
 424                        no_platform_optin = 1;
 425                        pr_info("IOMMU disabled\n");
 426                } else if (!strncmp(str, "igfx_off", 8)) {
 427                        dmar_map_gfx = 0;
 428                        pr_info("Disable GFX device mapping\n");
 429                } else if (!strncmp(str, "forcedac", 8)) {
 430                        pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
 431                        iommu_dma_forcedac = true;
 432                } else if (!strncmp(str, "strict", 6)) {
 433                        pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
 434                        iommu_set_dma_strict();
 435                } else if (!strncmp(str, "sp_off", 6)) {
 436                        pr_info("Disable supported super page\n");
 437                        intel_iommu_superpage = 0;
 438                } else if (!strncmp(str, "sm_on", 5)) {
 439                        pr_info("Enable scalable mode if hardware supports\n");
 440                        intel_iommu_sm = 1;
 441                } else if (!strncmp(str, "sm_off", 6)) {
 442                        pr_info("Scalable mode is disallowed\n");
 443                        intel_iommu_sm = 0;
 444                } else if (!strncmp(str, "tboot_noforce", 13)) {
 445                        pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 446                        intel_iommu_tboot_noforce = 1;
 447                } else {
 448                        pr_notice("Unknown option - '%s'\n", str);
 449                }
 450
 451                str += strcspn(str, ",");
 452                while (*str == ',')
 453                        str++;
 454        }
 455
 456        return 1;
 457}
 458__setup("intel_iommu=", intel_iommu_setup);
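
/*
 * Example kernel command line (options are comma separated and parsed by
 * intel_iommu_setup() above):
 *
 *	intel_iommu=on,sm_on
 *
 * enables DMA remapping and requests scalable mode where the hardware
 * supports it.
 */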
 459
 460static struct kmem_cache *iommu_domain_cache;
 461static struct kmem_cache *iommu_devinfo_cache;
 462
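/*
 * Domain IDs are resolved through a two-level table: the upper 8 bits of
 * the DID index iommu->domains[], each slot of which is a lazily allocated
 * array of 256 dmar_domain pointers indexed by the lower 8 bits.
 */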
 463static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 464{
 465        struct dmar_domain **domains;
 466        int idx = did >> 8;
 467
 468        domains = iommu->domains[idx];
 469        if (!domains)
 470                return NULL;
 471
 472        return domains[did & 0xff];
 473}
 474
 475static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 476                             struct dmar_domain *domain)
 477{
 478        struct dmar_domain **domains;
 479        int idx = did >> 8;
 480
 481        if (!iommu->domains[idx]) {
 482                size_t size = 256 * sizeof(struct dmar_domain *);
 483                iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 484        }
 485
 486        domains = iommu->domains[idx];
 487        if (WARN_ON(!domains))
 488                return;
  489
  490        domains[did & 0xff] = domain;
 491}
 492
 493void *alloc_pgtable_page(int node)
 494{
 495        struct page *page;
 496        void *vaddr = NULL;
 497
 498        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 499        if (page)
 500                vaddr = page_address(page);
 501        return vaddr;
 502}
 503
 504void free_pgtable_page(void *vaddr)
 505{
 506        free_page((unsigned long)vaddr);
 507}
 508
 509static inline void *alloc_domain_mem(void)
 510{
 511        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 512}
 513
 514static void free_domain_mem(void *vaddr)
 515{
 516        kmem_cache_free(iommu_domain_cache, vaddr);
 517}
 518
 519static inline void * alloc_devinfo_mem(void)
 520{
 521        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 522}
 523
 524static inline void free_devinfo_mem(void *vaddr)
 525{
 526        kmem_cache_free(iommu_devinfo_cache, vaddr);
 527}
 528
 529static inline int domain_type_is_si(struct dmar_domain *domain)
 530{
 531        return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
 532}
 533
 534static inline bool domain_use_first_level(struct dmar_domain *domain)
 535{
 536        return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
 537}
 538
 539static inline int domain_pfn_supported(struct dmar_domain *domain,
 540                                       unsigned long pfn)
 541{
 542        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 543
 544        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 545}
 546
 547static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 548{
 549        unsigned long sagaw;
 550        int agaw;
 551
 552        sagaw = cap_sagaw(iommu->cap);
 553        for (agaw = width_to_agaw(max_gaw);
 554             agaw >= 0; agaw--) {
 555                if (test_bit(agaw, &sagaw))
 556                        break;
 557        }
 558
 559        return agaw;
 560}
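
/*
 * For example, with DEFAULT_DOMAIN_ADDRESS_WIDTH (57) the search starts at
 * agaw 3 (5-level paging); if the unit's SAGAW field only advertises bit 2,
 * the loop falls back to agaw 2 (4-level, 48-bit).
 */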
 561
 562/*
 563 * Calculate max SAGAW for each iommu.
 564 */
 565int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 566{
 567        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 568}
 569
 570/*
  571 * Calculate the agaw for each iommu.
  572 * "SAGAW" may differ across iommus, so use a default agaw and fall back
  573 * to a smaller supported agaw for iommus that don't support the default.
 574 */
 575int iommu_calculate_agaw(struct intel_iommu *iommu)
 576{
 577        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 578}
 579
  580/* This function only returns a single iommu in a domain */
 581struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 582{
 583        int iommu_id;
 584
 585        /* si_domain and vm domain should not get here. */
 586        if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
 587                return NULL;
 588
 589        for_each_domain_iommu(iommu_id, domain)
 590                break;
 591
 592        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 593                return NULL;
 594
 595        return g_iommus[iommu_id];
 596}
 597
 598static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 599{
 600        return sm_supported(iommu) ?
 601                        ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 602}
 603
 604static void domain_update_iommu_coherency(struct dmar_domain *domain)
 605{
 606        struct dmar_drhd_unit *drhd;
 607        struct intel_iommu *iommu;
 608        bool found = false;
 609        int i;
 610
 611        domain->iommu_coherency = true;
 612
 613        for_each_domain_iommu(i, domain) {
 614                found = true;
 615                if (!iommu_paging_structure_coherency(g_iommus[i])) {
 616                        domain->iommu_coherency = false;
 617                        break;
 618                }
 619        }
 620        if (found)
 621                return;
 622
 623        /* No hardware attached; use lowest common denominator */
 624        rcu_read_lock();
 625        for_each_active_iommu(iommu, drhd) {
 626                if (!iommu_paging_structure_coherency(iommu)) {
 627                        domain->iommu_coherency = false;
 628                        break;
 629                }
 630        }
 631        rcu_read_unlock();
 632}
 633
 634static bool domain_update_iommu_snooping(struct intel_iommu *skip)
 635{
 636        struct dmar_drhd_unit *drhd;
 637        struct intel_iommu *iommu;
 638        bool ret = true;
 639
 640        rcu_read_lock();
 641        for_each_active_iommu(iommu, drhd) {
 642                if (iommu != skip) {
 643                        /*
  644                         * If the hardware is operating in scalable mode,
  645                         * snooping control is always supported since we
  646                         * always set the PASID-table-entry.PGSNP bit if the
  647                         * domain is managed outside the kernel (UNMANAGED).
 648                         */
 649                        if (!sm_supported(iommu) &&
 650                            !ecap_sc_support(iommu->ecap)) {
 651                                ret = false;
 652                                break;
 653                        }
 654                }
 655        }
 656        rcu_read_unlock();
 657
 658        return ret;
 659}
 660
 661static int domain_update_iommu_superpage(struct dmar_domain *domain,
 662                                         struct intel_iommu *skip)
 663{
 664        struct dmar_drhd_unit *drhd;
 665        struct intel_iommu *iommu;
 666        int mask = 0x3;
 667
 668        if (!intel_iommu_superpage)
 669                return 0;
 670
 671        /* set iommu_superpage to the smallest common denominator */
 672        rcu_read_lock();
 673        for_each_active_iommu(iommu, drhd) {
 674                if (iommu != skip) {
 675                        if (domain && domain_use_first_level(domain)) {
 676                                if (!cap_fl1gp_support(iommu->cap))
 677                                        mask = 0x1;
 678                        } else {
 679                                mask &= cap_super_page_val(iommu->cap);
 680                        }
 681
 682                        if (!mask)
 683                                break;
 684                }
 685        }
 686        rcu_read_unlock();
 687
 688        return fls(mask);
 689}
 690
 691static int domain_update_device_node(struct dmar_domain *domain)
 692{
 693        struct device_domain_info *info;
 694        int nid = NUMA_NO_NODE;
 695
 696        assert_spin_locked(&device_domain_lock);
 697
 698        if (list_empty(&domain->devices))
 699                return NUMA_NO_NODE;
 700
 701        list_for_each_entry(info, &domain->devices, link) {
 702                if (!info->dev)
 703                        continue;
 704
 705                /*
  706                 * There may be multiple device NUMA nodes, as devices within
  707                 * the same domain can sit behind different IOMMUs. There is
  708                 * no perfect answer in such a situation, so pick the first
  709                 * node found (first come, first served).
 710                 */
 711                nid = dev_to_node(info->dev);
 712                if (nid != NUMA_NO_NODE)
 713                        break;
 714        }
 715
 716        return nid;
 717}
 718
 719static void domain_update_iotlb(struct dmar_domain *domain);
 720
 721/* Return the super pagesize bitmap if supported. */
 722static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
 723{
 724        unsigned long bitmap = 0;
 725
 726        /*
 727         * 1-level super page supports page size of 2MiB, 2-level super page
 728         * supports page size of both 2MiB and 1GiB.
 729         */
 730        if (domain->iommu_superpage == 1)
 731                bitmap |= SZ_2M;
 732        else if (domain->iommu_superpage == 2)
 733                bitmap |= SZ_2M | SZ_1G;
 734
 735        return bitmap;
 736}
 737
 738/* Some capabilities may be different across iommus */
 739static void domain_update_iommu_cap(struct dmar_domain *domain)
 740{
 741        domain_update_iommu_coherency(domain);
 742        domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 743        domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 744
 745        /*
  746         * If RHSA is missing, default to the device's NUMA node
  747         * as a fallback.
 748         */
 749        if (domain->nid == NUMA_NO_NODE)
 750                domain->nid = domain_update_device_node(domain);
 751
 752        /*
 753         * First-level translation restricts the input-address to a
 754         * canonical address (i.e., address bits 63:N have the same
 755         * value as address bit [N-1], where N is 48-bits with 4-level
 756         * paging and 57-bits with 5-level paging). Hence, skip bit
 757         * [N-1].
 758         */
 759        if (domain_use_first_level(domain))
 760                domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
 761        else
 762                domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
 763
 764        domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
 765        domain_update_iotlb(domain);
 766}
 767
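/*
 * In scalable mode a context entry is 32 bytes instead of 16, so a 4KiB
 * context table only covers 128 devfns.  The root entry's low half points
 * to the table for devfn 0x00-0x7f and the high half to the table for
 * devfn 0x80-0xff, which is why the devfn is folded and doubled below.
 */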
 768struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 769                                         u8 devfn, int alloc)
 770{
 771        struct root_entry *root = &iommu->root_entry[bus];
 772        struct context_entry *context;
 773        u64 *entry;
 774
 775        entry = &root->lo;
 776        if (sm_supported(iommu)) {
 777                if (devfn >= 0x80) {
 778                        devfn -= 0x80;
 779                        entry = &root->hi;
 780                }
 781                devfn *= 2;
 782        }
 783        if (*entry & 1)
 784                context = phys_to_virt(*entry & VTD_PAGE_MASK);
 785        else {
 786                unsigned long phy_addr;
 787                if (!alloc)
 788                        return NULL;
 789
 790                context = alloc_pgtable_page(iommu->node);
 791                if (!context)
 792                        return NULL;
 793
 794                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 795                phy_addr = virt_to_phys((void *)context);
 796                *entry = phy_addr | 1;
 797                __iommu_flush_cache(iommu, entry, sizeof(*entry));
 798        }
 799        return &context[devfn];
 800}
 801
 802static bool attach_deferred(struct device *dev)
 803{
 804        return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
 805}
 806
 807/**
 808 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 809 *                               sub-hierarchy of a candidate PCI-PCI bridge
 810 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 811 * @bridge: the candidate PCI-PCI bridge
 812 *
 813 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 814 */
 815static bool
 816is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 817{
 818        struct pci_dev *pdev, *pbridge;
 819
 820        if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 821                return false;
 822
 823        pdev = to_pci_dev(dev);
 824        pbridge = to_pci_dev(bridge);
 825
 826        if (pbridge->subordinate &&
 827            pbridge->subordinate->number <= pdev->bus->number &&
 828            pbridge->subordinate->busn_res.end >= pdev->bus->number)
 829                return true;
 830
 831        return false;
 832}
 833
 834static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 835{
 836        struct dmar_drhd_unit *drhd;
 837        u32 vtbar;
 838        int rc;
 839
 840        /* We know that this device on this chipset has its own IOMMU.
 841         * If we find it under a different IOMMU, then the BIOS is lying
 842         * to us. Hope that the IOMMU for this device is actually
 843         * disabled, and it needs no translation...
 844         */
 845        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 846        if (rc) {
 847                /* "can't" happen */
 848                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 849                return false;
 850        }
 851        vtbar &= 0xffff0000;
 852
  853        /* we know that this iommu should be at offset 0xa000 from vtbar */
 854        drhd = dmar_find_matched_drhd_unit(pdev);
 855        if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 856                pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 857                add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 858                return true;
 859        }
 860
 861        return false;
 862}
 863
 864static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 865{
 866        if (!iommu || iommu->drhd->ignored)
 867                return true;
 868
 869        if (dev_is_pci(dev)) {
 870                struct pci_dev *pdev = to_pci_dev(dev);
 871
 872                if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 873                    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 874                    quirk_ioat_snb_local_iommu(pdev))
 875                        return true;
 876        }
 877
 878        return false;
 879}
 880
 881struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 882{
 883        struct dmar_drhd_unit *drhd = NULL;
 884        struct pci_dev *pdev = NULL;
 885        struct intel_iommu *iommu;
 886        struct device *tmp;
 887        u16 segment = 0;
 888        int i;
 889
 890        if (!dev)
 891                return NULL;
 892
 893        if (dev_is_pci(dev)) {
 894                struct pci_dev *pf_pdev;
 895
 896                pdev = pci_real_dma_dev(to_pci_dev(dev));
 897
 898                /* VFs aren't listed in scope tables; we need to look up
 899                 * the PF instead to find the IOMMU. */
 900                pf_pdev = pci_physfn(pdev);
 901                dev = &pf_pdev->dev;
 902                segment = pci_domain_nr(pdev->bus);
 903        } else if (has_acpi_companion(dev))
 904                dev = &ACPI_COMPANION(dev)->dev;
 905
 906        rcu_read_lock();
 907        for_each_iommu(iommu, drhd) {
 908                if (pdev && segment != drhd->segment)
 909                        continue;
 910
 911                for_each_active_dev_scope(drhd->devices,
 912                                          drhd->devices_cnt, i, tmp) {
 913                        if (tmp == dev) {
 914                                /* For a VF use its original BDF# not that of the PF
 915                                 * which we used for the IOMMU lookup. Strictly speaking
 916                                 * we could do this for all PCI devices; we only need to
 917                                 * get the BDF# from the scope table for ACPI matches. */
 918                                if (pdev && pdev->is_virtfn)
 919                                        goto got_pdev;
 920
 921                                if (bus && devfn) {
 922                                        *bus = drhd->devices[i].bus;
 923                                        *devfn = drhd->devices[i].devfn;
 924                                }
 925                                goto out;
 926                        }
 927
 928                        if (is_downstream_to_pci_bridge(dev, tmp))
 929                                goto got_pdev;
 930                }
 931
 932                if (pdev && drhd->include_all) {
 933                got_pdev:
 934                        if (bus && devfn) {
 935                                *bus = pdev->bus->number;
 936                                *devfn = pdev->devfn;
 937                        }
 938                        goto out;
 939                }
 940        }
 941        iommu = NULL;
 942 out:
 943        if (iommu_is_dummy(iommu, dev))
 944                iommu = NULL;
 945
 946        rcu_read_unlock();
 947
 948        return iommu;
 949}
 950
 951static void domain_flush_cache(struct dmar_domain *domain,
 952                               void *addr, int size)
 953{
 954        if (!domain->iommu_coherency)
 955                clflush_cache_range(addr, size);
 956}
 957
 958static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 959{
 960        struct context_entry *context;
 961        int ret = 0;
 962        unsigned long flags;
 963
 964        spin_lock_irqsave(&iommu->lock, flags);
 965        context = iommu_context_addr(iommu, bus, devfn, 0);
 966        if (context)
 967                ret = context_present(context);
 968        spin_unlock_irqrestore(&iommu->lock, flags);
 969        return ret;
 970}
 971
 972static void free_context_table(struct intel_iommu *iommu)
 973{
 974        int i;
 975        unsigned long flags;
 976        struct context_entry *context;
 977
 978        spin_lock_irqsave(&iommu->lock, flags);
  979        if (!iommu->root_entry)
  980                goto out;
  981
 982        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 983                context = iommu_context_addr(iommu, i, 0, 0);
 984                if (context)
 985                        free_pgtable_page(context);
 986
 987                if (!sm_supported(iommu))
 988                        continue;
 989
 990                context = iommu_context_addr(iommu, i, 0x80, 0);
 991                if (context)
 992                        free_pgtable_page(context);
 993
 994        }
 995        free_pgtable_page(iommu->root_entry);
 996        iommu->root_entry = NULL;
 997out:
 998        spin_unlock_irqrestore(&iommu->lock, flags);
 999}
1000
1001#ifdef CONFIG_DMAR_DEBUG
1002static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
1003{
1004        struct device_domain_info *info;
1005        struct dma_pte *parent, *pte;
1006        struct dmar_domain *domain;
1007        int offset, level;
1008
1009        info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
1010        if (!info || !info->domain) {
1011                pr_info("device [%02x:%02x.%d] not probed\n",
1012                        bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1013                return;
1014        }
1015
1016        domain = info->domain;
1017        level = agaw_to_level(domain->agaw);
1018        parent = domain->pgd;
1019        if (!parent) {
1020                pr_info("no page table setup\n");
1021                return;
1022        }
1023
1024        while (1) {
1025                offset = pfn_level_offset(pfn, level);
1026                pte = &parent[offset];
1027                if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
1028                        pr_info("PTE not present at level %d\n", level);
1029                        break;
1030                }
1031
1032                pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
1033
1034                if (level == 1)
1035                        break;
1036
1037                parent = phys_to_virt(dma_pte_addr(pte));
1038                level--;
1039        }
1040}
1041
1042void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
1043                          unsigned long long addr, u32 pasid)
1044{
1045        struct pasid_dir_entry *dir, *pde;
1046        struct pasid_entry *entries, *pte;
1047        struct context_entry *ctx_entry;
1048        struct root_entry *rt_entry;
1049        u8 devfn = source_id & 0xff;
1050        u8 bus = source_id >> 8;
1051        int i, dir_index, index;
1052
1053        pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
1054
1055        /* root entry dump */
 1056        if (!iommu->root_entry) {
 1057                pr_info("root table is not present\n");
 1058                return;
 1059        }
 1060        rt_entry = &iommu->root_entry[bus];
1061
1062        if (sm_supported(iommu))
1063                pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
1064                        rt_entry->hi, rt_entry->lo);
1065        else
 1066                pr_info("root entry: 0x%016llx\n", rt_entry->lo);
1067
1068        /* context entry dump */
1069        ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
1070        if (!ctx_entry) {
1071                pr_info("context table entry is not present\n");
1072                return;
1073        }
1074
1075        pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
1076                ctx_entry->hi, ctx_entry->lo);
1077
1078        /* legacy mode does not require PASID entries */
1079        if (!sm_supported(iommu))
1080                goto pgtable_walk;
1081
1082        /* get the pointer to pasid directory entry */
1083        dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
1084        if (!dir) {
1085                pr_info("pasid directory entry is not present\n");
1086                return;
1087        }
1088        /* For request-without-pasid, get the pasid from context entry */
1089        if (intel_iommu_sm && pasid == INVALID_IOASID)
1090                pasid = PASID_RID2PASID;
1091
1092        dir_index = pasid >> PASID_PDE_SHIFT;
1093        pde = &dir[dir_index];
1094        pr_info("pasid dir entry: 0x%016llx\n", pde->val);
1095
1096        /* get the pointer to the pasid table entry */
1097        entries = get_pasid_table_from_pde(pde);
1098        if (!entries) {
1099                pr_info("pasid table entry is not present\n");
1100                return;
1101        }
1102        index = pasid & PASID_PTE_MASK;
1103        pte = &entries[index];
1104        for (i = 0; i < ARRAY_SIZE(pte->val); i++)
1105                pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
1106
1107pgtable_walk:
1108        pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
1109}
1110#endif
1111
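/*
 * Walk (and, if needed, build) the page table down to *target_level for
 * the given PFN.  A *target_level of 0 means "whatever leaf already
 * exists": the walk stops at the first superpage or non-present entry and
 * reports the level it reached back through *target_level.
 */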
1112static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1113                                      unsigned long pfn, int *target_level)
1114{
1115        struct dma_pte *parent, *pte;
1116        int level = agaw_to_level(domain->agaw);
1117        int offset;
1118
1119        BUG_ON(!domain->pgd);
1120
1121        if (!domain_pfn_supported(domain, pfn))
1122                /* Address beyond IOMMU's addressing capabilities. */
1123                return NULL;
1124
1125        parent = domain->pgd;
1126
1127        while (1) {
1128                void *tmp_page;
1129
1130                offset = pfn_level_offset(pfn, level);
1131                pte = &parent[offset];
1132                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1133                        break;
1134                if (level == *target_level)
1135                        break;
1136
1137                if (!dma_pte_present(pte)) {
1138                        uint64_t pteval;
1139
1140                        tmp_page = alloc_pgtable_page(domain->nid);
1141
1142                        if (!tmp_page)
1143                                return NULL;
1144
1145                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1146                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1147                        if (domain_use_first_level(domain)) {
1148                                pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1149                                if (iommu_is_dma_domain(&domain->domain))
1150                                        pteval |= DMA_FL_PTE_ACCESS;
1151                        }
1152                        if (cmpxchg64(&pte->val, 0ULL, pteval))
1153                                /* Someone else set it while we were thinking; use theirs. */
1154                                free_pgtable_page(tmp_page);
1155                        else
1156                                domain_flush_cache(domain, pte, sizeof(*pte));
1157                }
1158                if (level == 1)
1159                        break;
1160
1161                parent = phys_to_virt(dma_pte_addr(pte));
1162                level--;
1163        }
1164
1165        if (!*target_level)
1166                *target_level = level;
1167
1168        return pte;
1169}
1170
1171/* return address's pte at specific level */
1172static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1173                                         unsigned long pfn,
1174                                         int level, int *large_page)
1175{
1176        struct dma_pte *parent, *pte;
1177        int total = agaw_to_level(domain->agaw);
1178        int offset;
1179
1180        parent = domain->pgd;
1181        while (level <= total) {
1182                offset = pfn_level_offset(pfn, total);
1183                pte = &parent[offset];
1184                if (level == total)
1185                        return pte;
1186
1187                if (!dma_pte_present(pte)) {
1188                        *large_page = total;
1189                        break;
1190                }
1191
1192                if (dma_pte_superpage(pte)) {
1193                        *large_page = total;
1194                        return pte;
1195                }
1196
1197                parent = phys_to_virt(dma_pte_addr(pte));
1198                total--;
1199        }
1200        return NULL;
1201}
1202
 1203/* clear last level pte; a tlb flush should follow */
1204static void dma_pte_clear_range(struct dmar_domain *domain,
1205                                unsigned long start_pfn,
1206                                unsigned long last_pfn)
1207{
1208        unsigned int large_page;
1209        struct dma_pte *first_pte, *pte;
1210
1211        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1212        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1213        BUG_ON(start_pfn > last_pfn);
1214
1215        /* we don't need lock here; nobody else touches the iova range */
1216        do {
1217                large_page = 1;
1218                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1219                if (!pte) {
1220                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1221                        continue;
1222                }
1223                do {
1224                        dma_clear_pte(pte);
1225                        start_pfn += lvl_to_nr_pages(large_page);
1226                        pte++;
1227                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1228
1229                domain_flush_cache(domain, first_pte,
1230                                   (void *)pte - (void *)first_pte);
1231
1232        } while (start_pfn && start_pfn <= last_pfn);
1233}
1234
1235static void dma_pte_free_level(struct dmar_domain *domain, int level,
1236                               int retain_level, struct dma_pte *pte,
1237                               unsigned long pfn, unsigned long start_pfn,
1238                               unsigned long last_pfn)
1239{
1240        pfn = max(start_pfn, pfn);
1241        pte = &pte[pfn_level_offset(pfn, level)];
1242
1243        do {
1244                unsigned long level_pfn;
1245                struct dma_pte *level_pte;
1246
1247                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1248                        goto next;
1249
1250                level_pfn = pfn & level_mask(level);
1251                level_pte = phys_to_virt(dma_pte_addr(pte));
1252
1253                if (level > 2) {
1254                        dma_pte_free_level(domain, level - 1, retain_level,
1255                                           level_pte, level_pfn, start_pfn,
1256                                           last_pfn);
1257                }
1258
1259                /*
1260                 * Free the page table if we're below the level we want to
1261                 * retain and the range covers the entire table.
1262                 */
1263                if (level < retain_level && !(start_pfn > level_pfn ||
1264                      last_pfn < level_pfn + level_size(level) - 1)) {
1265                        dma_clear_pte(pte);
1266                        domain_flush_cache(domain, pte, sizeof(*pte));
1267                        free_pgtable_page(level_pte);
1268                }
1269next:
1270                pfn += level_size(level);
1271        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1272}
1273
1274/*
1275 * clear last level (leaf) ptes and free page table pages below the
1276 * level we wish to keep intact.
1277 */
1278static void dma_pte_free_pagetable(struct dmar_domain *domain,
1279                                   unsigned long start_pfn,
1280                                   unsigned long last_pfn,
1281                                   int retain_level)
1282{
1283        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1284        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1285        BUG_ON(start_pfn > last_pfn);
1286
1287        dma_pte_clear_range(domain, start_pfn, last_pfn);
1288
1289        /* We don't need lock here; nobody else touches the iova range */
1290        dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1291                           domain->pgd, 0, start_pfn, last_pfn);
1292
1293        /* free pgd */
1294        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1295                free_pgtable_page(domain->pgd);
1296                domain->pgd = NULL;
1297        }
1298}
1299
1300/* When a page at a given level is being unlinked from its parent, we don't
1301   need to *modify* it at all. All we need to do is make a list of all the
1302   pages which can be freed just as soon as we've flushed the IOTLB and we
1303   know the hardware page-walk will no longer touch them.
1304   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1305   be freed. */
1306static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1307                                            int level, struct dma_pte *pte,
1308                                            struct page *freelist)
1309{
1310        struct page *pg;
1311
1312        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1313        pg->freelist = freelist;
1314        freelist = pg;
1315
1316        if (level == 1)
1317                return freelist;
1318
1319        pte = page_address(pg);
1320        do {
1321                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1322                        freelist = dma_pte_list_pagetables(domain, level - 1,
1323                                                           pte, freelist);
1324                pte++;
1325        } while (!first_pte_in_page(pte));
1326
1327        return freelist;
1328}
1329
1330static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1331                                        struct dma_pte *pte, unsigned long pfn,
1332                                        unsigned long start_pfn,
1333                                        unsigned long last_pfn,
1334                                        struct page *freelist)
1335{
1336        struct dma_pte *first_pte = NULL, *last_pte = NULL;
1337
1338        pfn = max(start_pfn, pfn);
1339        pte = &pte[pfn_level_offset(pfn, level)];
1340
1341        do {
1342                unsigned long level_pfn = pfn & level_mask(level);
1343
1344                if (!dma_pte_present(pte))
1345                        goto next;
1346
1347                /* If range covers entire pagetable, free it */
1348                if (start_pfn <= level_pfn &&
1349                    last_pfn >= level_pfn + level_size(level) - 1) {
 1350                        /* These subordinate page tables are going away entirely. Don't
1351                           bother to clear them; we're just going to *free* them. */
1352                        if (level > 1 && !dma_pte_superpage(pte))
1353                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1354
1355                        dma_clear_pte(pte);
1356                        if (!first_pte)
1357                                first_pte = pte;
1358                        last_pte = pte;
1359                } else if (level > 1) {
1360                        /* Recurse down into a level that isn't *entirely* obsolete */
1361                        freelist = dma_pte_clear_level(domain, level - 1,
1362                                                       phys_to_virt(dma_pte_addr(pte)),
1363                                                       level_pfn, start_pfn, last_pfn,
1364                                                       freelist);
1365                }
1366next:
1367                pfn = level_pfn + level_size(level);
1368        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1369
1370        if (first_pte)
1371                domain_flush_cache(domain, first_pte,
1372                                   (void *)++last_pte - (void *)first_pte);
1373
1374        return freelist;
1375}
1376
1377/* We can't just free the pages because the IOMMU may still be walking
1378   the page tables, and may have cached the intermediate levels. The
1379   pages can only be freed after the IOTLB flush has been done. */
1380static struct page *domain_unmap(struct dmar_domain *domain,
1381                                 unsigned long start_pfn,
1382                                 unsigned long last_pfn,
1383                                 struct page *freelist)
1384{
1385        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1386        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1387        BUG_ON(start_pfn > last_pfn);
1388
1389        /* we don't need lock here; nobody else touches the iova range */
1390        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1391                                       domain->pgd, 0, start_pfn, last_pfn,
1392                                       freelist);
1393
1394        /* free pgd */
1395        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1396                struct page *pgd_page = virt_to_page(domain->pgd);
1397                pgd_page->freelist = freelist;
1398                freelist = pgd_page;
1399
1400                domain->pgd = NULL;
1401        }
1402
1403        return freelist;
1404}
1405
1406static void dma_free_pagelist(struct page *freelist)
1407{
1408        struct page *pg;
1409
1410        while ((pg = freelist)) {
1411                freelist = pg->freelist;
1412                free_pgtable_page(page_address(pg));
1413        }
1414}
1415
1416/* iommu handling */
1417static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1418{
1419        struct root_entry *root;
1420        unsigned long flags;
1421
1422        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1423        if (!root) {
1424                pr_err("Allocating root entry for %s failed\n",
1425                        iommu->name);
1426                return -ENOMEM;
1427        }
1428
1429        __iommu_flush_cache(iommu, root, ROOT_SIZE);
1430
1431        spin_lock_irqsave(&iommu->lock, flags);
1432        iommu->root_entry = root;
1433        spin_unlock_irqrestore(&iommu->lock, flags);
1434
1435        return 0;
1436}
1437
1438static void iommu_set_root_entry(struct intel_iommu *iommu)
1439{
1440        u64 addr;
1441        u32 sts;
1442        unsigned long flag;
1443
1444        addr = virt_to_phys(iommu->root_entry);
1445        if (sm_supported(iommu))
1446                addr |= DMA_RTADDR_SMT;
1447
1448        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1449        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1450
1451        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1452
 1453        /* Make sure the hardware completes it */
1454        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1455                      readl, (sts & DMA_GSTS_RTPS), sts);
1456
1457        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1458
1459        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1460        if (sm_supported(iommu))
1461                qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1462        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1463}
1464
1465void iommu_flush_write_buffer(struct intel_iommu *iommu)
1466{
1467        u32 val;
1468        unsigned long flag;
1469
1470        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1471                return;
1472
1473        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1474        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1475
 1476        /* Make sure the hardware completes it */
1477        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1478                      readl, (!(val & DMA_GSTS_WBFS)), val);
1479
1480        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1481}
1482
 1483/* return value determines if we need a write buffer flush */
1484static void __iommu_flush_context(struct intel_iommu *iommu,
1485                                  u16 did, u16 source_id, u8 function_mask,
1486                                  u64 type)
1487{
1488        u64 val = 0;
1489        unsigned long flag;
1490
1491        switch (type) {
1492        case DMA_CCMD_GLOBAL_INVL:
1493                val = DMA_CCMD_GLOBAL_INVL;
1494                break;
1495        case DMA_CCMD_DOMAIN_INVL:
1496                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1497                break;
1498        case DMA_CCMD_DEVICE_INVL:
1499                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1500                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1501                break;
1502        default:
1503                BUG();
1504        }
1505        val |= DMA_CCMD_ICC;
1506
1507        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1508        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1509
 1510        /* Make sure the hardware completes it */
1511        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1512                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1513
1514        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1515}
1516
 1517/* return value determines if we need a write buffer flush */
1518static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1519                                u64 addr, unsigned int size_order, u64 type)
1520{
1521        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1522        u64 val = 0, val_iva = 0;
1523        unsigned long flag;
1524
1525        switch (type) {
1526        case DMA_TLB_GLOBAL_FLUSH:
 1527                /* global flush doesn't need to set IVA_REG */
1528                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1529                break;
1530        case DMA_TLB_DSI_FLUSH:
1531                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1532                break;
1533        case DMA_TLB_PSI_FLUSH:
1534                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1535                /* IH bit is passed in as part of address */
1536                val_iva = size_order | addr;
1537                break;
1538        default:
1539                BUG();
1540        }
1541        /* Note: set drain read/write */
1542#if 0
1543        /*
 1544         * This is probably only here to be extra safe; it looks like we
 1545         * can ignore it without any impact.
1546         */
1547        if (cap_read_drain(iommu->cap))
1548                val |= DMA_TLB_READ_DRAIN;
1549#endif
1550        if (cap_write_drain(iommu->cap))
1551                val |= DMA_TLB_WRITE_DRAIN;
1552
1553        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1554        /* Note: Only uses first TLB reg currently */
1555        if (val_iva)
1556                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1557        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1558
1559        /* Make sure hardware complete it */
1560        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1561                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1562
1563        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1564
1565        /* check IOTLB invalidation granularity */
1566        if (DMA_TLB_IAIG(val) == 0)
1567                pr_err("Flush IOTLB failed\n");
1568        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1569                pr_debug("TLB flush request %Lx, actual %Lx\n",
1570                        (unsigned long long)DMA_TLB_IIRG(type),
1571                        (unsigned long long)DMA_TLB_IAIG(val));
1572}
1573
1574static struct device_domain_info *
1575iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1576                        u8 bus, u8 devfn)
1577{
1578        struct device_domain_info *info;
1579
1580        assert_spin_locked(&device_domain_lock);
1581
1582        if (!iommu->qi)
1583                return NULL;
1584
1585        list_for_each_entry(info, &domain->devices, link)
1586                if (info->iommu == iommu && info->bus == bus &&
1587                    info->devfn == devfn) {
1588                        if (info->ats_supported && info->dev)
1589                                return info;
1590                        break;
1591                }
1592
1593        return NULL;
1594}
1595
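/*
 * Recompute domain->has_iotlb_device: true if any device (or subdevice)
 * attached to the domain has ATS enabled. This lets iommu_flush_dev_iotlb()
 * skip device-TLB invalidation entirely when no attached device can cache
 * translations.
 */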
1596static void domain_update_iotlb(struct dmar_domain *domain)
1597{
1598        struct device_domain_info *info;
1599        bool has_iotlb_device = false;
1600
1601        assert_spin_locked(&device_domain_lock);
1602
1603        list_for_each_entry(info, &domain->devices, link)
1604                if (info->ats_enabled) {
1605                        has_iotlb_device = true;
1606                        break;
1607                }
1608
1609        if (!has_iotlb_device) {
1610                struct subdev_domain_info *sinfo;
1611
1612                list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1613                        info = get_domain_info(sinfo->pdev);
1614                        if (info && info->ats_enabled) {
1615                                has_iotlb_device = true;
1616                                break;
1617                        }
1618                }
1619        }
1620
1621        domain->has_iotlb_device = has_iotlb_device;
1622}
1623
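/*
 * Enable the per-device translation caching features on a PCI device (ATS
 * and, with CONFIG_INTEL_IOMMU_SVM, PASID and PRI), and record the PFSID and
 * ATS invalidation queue depth needed to target the device with device-TLB
 * invalidations later.
 */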
1624static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1625{
1626        struct pci_dev *pdev;
1627
1628        assert_spin_locked(&device_domain_lock);
1629
1630        if (!info || !dev_is_pci(info->dev))
1631                return;
1632
1633        pdev = to_pci_dev(info->dev);
1634        /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1635         * the PFSID to the invalidation descriptor of a VF so that the IOMMU
1636         * hardware can gauge the queue depth at the PF level. If DIT is not
1637         * set, the PFSID field is treated as reserved and must be set to 0.
1638         */
1639        if (!ecap_dit(info->iommu->ecap))
1640                info->pfsid = 0;
1641        else {
1642                struct pci_dev *pf_pdev;
1643
1644                /* pci_physfn() returns pdev itself if the device is not a VF */
1645                pf_pdev = pci_physfn(pdev);
1646                info->pfsid = pci_dev_id(pf_pdev);
1647        }
1648
1649#ifdef CONFIG_INTEL_IOMMU_SVM
1650        /* The PCIe spec, in its wisdom, declares that the behaviour of
1651           the device if you enable PASID support after ATS support is
1652           undefined. So always enable PASID support on devices which
1653           have it, even if we can't yet know if we're ever going to
1654           use it. */
1655        if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1656                info->pasid_enabled = 1;
1657
1658        if (info->pri_supported &&
1659            (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1660            !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1661                info->pri_enabled = 1;
1662#endif
1663        if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1664            !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1665                info->ats_enabled = 1;
1666                domain_update_iotlb(info->domain);
1667                info->ats_qdep = pci_ats_queue_depth(pdev);
1668        }
1669}
1670
1671static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1672{
1673        struct pci_dev *pdev;
1674
1675        assert_spin_locked(&device_domain_lock);
1676
1677        if (!dev_is_pci(info->dev))
1678                return;
1679
1680        pdev = to_pci_dev(info->dev);
1681
1682        if (info->ats_enabled) {
1683                pci_disable_ats(pdev);
1684                info->ats_enabled = 0;
1685                domain_update_iotlb(info->domain);
1686        }
1687#ifdef CONFIG_INTEL_IOMMU_SVM
1688        if (info->pri_enabled) {
1689                pci_disable_pri(pdev);
1690                info->pri_enabled = 0;
1691        }
1692        if (info->pasid_enabled) {
1693                pci_disable_pasid(pdev);
1694                info->pasid_enabled = 0;
1695        }
1696#endif
1697}
1698
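/*
 * Send a device-TLB (ATS) invalidation to a single ATS-enabled device. The
 * source-id is built from the device's bus and devfn, and the previously
 * recorded PFSID and ATS queue depth are passed along to the invalidation
 * code.
 */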
1699static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1700                                    u64 addr, unsigned int mask)
1701{
1702        u16 sid, qdep;
1703
1704        if (!info || !info->ats_enabled)
1705                return;
1706
1707        sid = info->bus << 8 | info->devfn;
1708        qdep = info->ats_qdep;
1709        qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1710                           qdep, addr, mask);
1711}
1712
1713static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1714                                  u64 addr, unsigned mask)
1715{
1716        unsigned long flags;
1717        struct device_domain_info *info;
1718        struct subdev_domain_info *sinfo;
1719
1720        if (!domain->has_iotlb_device)
1721                return;
1722
1723        spin_lock_irqsave(&device_domain_lock, flags);
1724        list_for_each_entry(info, &domain->devices, link)
1725                __iommu_flush_dev_iotlb(info, addr, mask);
1726
1727        list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1728                info = get_domain_info(sinfo->pdev);
1729                __iommu_flush_dev_iotlb(info, addr, mask);
1730        }
1731        spin_unlock_irqrestore(&device_domain_lock, flags);
1732}
1733
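/*
 * First-level (PASID-granular) IOTLB invalidation for a domain: flush the
 * domain's default_pasid if one is set, and flush PASID_RID2PASID for
 * ordinary DMA (requests without PASID) if any devices are attached.
 */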
1734static void domain_flush_piotlb(struct intel_iommu *iommu,
1735                                struct dmar_domain *domain,
1736                                u64 addr, unsigned long npages, bool ih)
1737{
1738        u16 did = domain->iommu_did[iommu->seq_id];
1739
1740        if (domain->default_pasid)
1741                qi_flush_piotlb(iommu, did, domain->default_pasid,
1742                                addr, npages, ih);
1743
1744        if (!list_empty(&domain->devices))
1745                qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1746}
1747
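/*
 * Page-selective-within-domain IOTLB invalidation. The address mask is the
 * log2 of the page count rounded up to a power of two: e.g. pages = 9 rounds
 * up to 16, so mask = 4 and a naturally aligned 16-page region is
 * invalidated. If PSI is unsupported or the mask exceeds what the hardware
 * can handle, fall back to a domain-selective flush; first-level domains use
 * PASID-based invalidation instead.
 */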
1748static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1749                                  struct dmar_domain *domain,
1750                                  unsigned long pfn, unsigned int pages,
1751                                  int ih, int map)
1752{
1753        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1754        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1755        u16 did = domain->iommu_did[iommu->seq_id];
1756
1757        BUG_ON(pages == 0);
1758
1759        if (ih)
1760                ih = 1 << 6;
1761
1762        if (domain_use_first_level(domain)) {
1763                domain_flush_piotlb(iommu, domain, addr, pages, ih);
1764        } else {
1765                /*
1766                 * Fallback to domain selective flush if no PSI support or
1767                 * the size is too big. PSI requires page size to be 2 ^ x,
1768                 * and the base address is naturally aligned to the size.
1769                 */
1770                if (!cap_pgsel_inv(iommu->cap) ||
1771                    mask > cap_max_amask_val(iommu->cap))
1772                        iommu->flush.flush_iotlb(iommu, did, 0, 0,
1773                                                        DMA_TLB_DSI_FLUSH);
1774                else
1775                        iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1776                                                        DMA_TLB_PSI_FLUSH);
1777        }
1778
1779        /*
1780         * In caching mode, changes of pages from non-present to present require
1781         * a flush. However, the device IOTLB doesn't need to be flushed here.
1782         */
1783        if (!cap_caching_mode(iommu->cap) || !map)
1784                iommu_flush_dev_iotlb(domain, addr, mask);
1785}
1786
1787/* Notification for newly created mappings */
1788static inline void __mapping_notify_one(struct intel_iommu *iommu,
1789                                        struct dmar_domain *domain,
1790                                        unsigned long pfn, unsigned int pages)
1791{
1792        /*
1793         * It's a non-present to present mapping. Only flush if caching mode
1794         * and second level.
1795         */
1796        if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1797                iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1798        else
1799                iommu_flush_write_buffer(iommu);
1800}
1801
1802static void intel_flush_iotlb_all(struct iommu_domain *domain)
1803{
1804        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1805        int idx;
1806
1807        for_each_domain_iommu(idx, dmar_domain) {
1808                struct intel_iommu *iommu = g_iommus[idx];
1809                u16 did = dmar_domain->iommu_did[iommu->seq_id];
1810
1811                if (domain_use_first_level(dmar_domain))
1812                        domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1813                else
1814                        iommu->flush.flush_iotlb(iommu, did, 0, 0,
1815                                                 DMA_TLB_DSI_FLUSH);
1816
1817                if (!cap_caching_mode(iommu->cap))
1818                        iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1819                                              0, MAX_AGAW_PFN_WIDTH);
1820        }
1821}
1822
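/*
 * Disable the protected low/high memory regions by clearing the Enable
 * Protected Memory (EPM) bit and waiting for the Protected Region Status
 * (PRS) bit to clear, so DMA to those ranges is no longer blocked.
 */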
1823static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1824{
1825        u32 pmen;
1826        unsigned long flags;
1827
1828        if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1829                return;
1830
1831        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1832        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1833        pmen &= ~DMA_PMEN_EPM;
1834        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1835
1836        /* wait for the protected region status bit to clear */
1837        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1838                readl, !(pmen & DMA_PMEN_PRS), pmen);
1839
1840        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1841}
1842
1843static void iommu_enable_translation(struct intel_iommu *iommu)
1844{
1845        u32 sts;
1846        unsigned long flags;
1847
1848        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1849        iommu->gcmd |= DMA_GCMD_TE;
1850        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1851
1852        /* Make sure hardware complete it */
1853        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1854                      readl, (sts & DMA_GSTS_TES), sts);
1855
1856        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1857}
1858
1859static void iommu_disable_translation(struct intel_iommu *iommu)
1860{
1861        u32 sts;
1862        unsigned long flag;
1863
1864        if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1865            (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1866                return;
1867
1868        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1869        iommu->gcmd &= ~DMA_GCMD_TE;
1870        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1871
1872        /* Make sure hardware complete it */
1873        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1874                      readl, (!(sts & DMA_GSTS_TES)), sts);
1875
1876        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1877}
1878
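/*
 * Set up the per-IOMMU domain-id bitmap and the two-level domains[] lookup
 * array: the top level has one slot per 256 domain-ids, and only the first
 * 256-entry page is allocated here; the remaining pages are presumably
 * allocated on demand when a domain-id in their range is used. Domain-id 0
 * (and, in scalable mode, FLPT_DEFAULT_DID) is reserved.
 */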
1879static int iommu_init_domains(struct intel_iommu *iommu)
1880{
1881        u32 ndomains, nlongs;
1882        size_t size;
1883
1884        ndomains = cap_ndoms(iommu->cap);
1885        pr_debug("%s: Number of Domains supported <%d>\n",
1886                 iommu->name, ndomains);
1887        nlongs = BITS_TO_LONGS(ndomains);
1888
1889        spin_lock_init(&iommu->lock);
1890
1891        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1892        if (!iommu->domain_ids)
1893                return -ENOMEM;
1894
1895        size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1896        iommu->domains = kzalloc(size, GFP_KERNEL);
1897
1898        if (iommu->domains) {
1899                size = 256 * sizeof(struct dmar_domain *);
1900                iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1901        }
1902
1903        if (!iommu->domains || !iommu->domains[0]) {
1904                pr_err("%s: Allocating domain array failed\n",
1905                       iommu->name);
1906                kfree(iommu->domain_ids);
1907                kfree(iommu->domains);
1908                iommu->domain_ids = NULL;
1909                iommu->domains    = NULL;
1910                return -ENOMEM;
1911        }
1912
1913        /*
1914         * If Caching mode is set, then invalid translations are tagged
1915         * with domain-id 0, hence we need to pre-allocate it. We also
1916         * use domain-id 0 as a marker for non-allocated domain-id, so
1917         * make sure it is not used for a real domain.
1918         */
1919        set_bit(0, iommu->domain_ids);
1920
1921        /*
1922         * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1923         * entry for first-level or pass-through translation modes should
1924         * be programmed with a domain id different from those used for
1925         * second-level or nested translation. We reserve a domain id for
1926         * this purpose.
1927         */
1928        if (sm_supported(iommu))
1929                set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1930
1931        return 0;
1932}
1933
1934static void disable_dmar_iommu(struct intel_iommu *iommu)
1935{
1936        struct device_domain_info *info, *tmp;
1937        unsigned long flags;
1938
1939        if (!iommu->domains || !iommu->domain_ids)
1940                return;
1941
1942        spin_lock_irqsave(&device_domain_lock, flags);
1943        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1944                if (info->iommu != iommu)
1945                        continue;
1946
1947                if (!info->dev || !info->domain)
1948                        continue;
1949
1950                __dmar_remove_one_dev_info(info);
1951        }
1952        spin_unlock_irqrestore(&device_domain_lock, flags);
1953
1954        if (iommu->gcmd & DMA_GCMD_TE)
1955                iommu_disable_translation(iommu);
1956}
1957
1958static void free_dmar_iommu(struct intel_iommu *iommu)
1959{
1960        if ((iommu->domains) && (iommu->domain_ids)) {
1961                int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1962                int i;
1963
1964                for (i = 0; i < elems; i++)
1965                        kfree(iommu->domains[i]);
1966                kfree(iommu->domains);
1967                kfree(iommu->domain_ids);
1968                iommu->domains = NULL;
1969                iommu->domain_ids = NULL;
1970        }
1971
1972        g_iommus[iommu->seq_id] = NULL;
1973
1974        /* free context mapping */
1975        free_context_table(iommu);
1976
1977#ifdef CONFIG_INTEL_IOMMU_SVM
1978        if (pasid_supported(iommu)) {
1979                if (ecap_prs(iommu->ecap))
1980                        intel_svm_finish_prq(iommu);
1981        }
1982        if (vccap_pasid(iommu->vccap))
1983                ioasid_unregister_allocator(&iommu->pasid_allocator);
1984
1985#endif
1986}
1987
1988/*
1989 * Check and return whether first level is used by default for
1990 * DMA translation.
1991 */
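/*
 * In short: legacy mode always uses second-level; if only one of FL/SL
 * passed the capability sanity checks, that one is used; if both are usable,
 * first-level is preferred except for IOMMU_DOMAIN_UNMANAGED domains.
 */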
1992static bool first_level_by_default(unsigned int type)
1993{
1994        /* Only SL is available in legacy mode */
1995        if (!scalable_mode_support())
1996                return false;
1997
1998        /* Only one level (either FL or SL) is available, just use it */
1999        if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
2000                return intel_cap_flts_sanity();
2001
2002        /* Both levels are available, decide it based on domain type */
2003        return type != IOMMU_DOMAIN_UNMANAGED;
2004}
2005
2006static struct dmar_domain *alloc_domain(unsigned int type)
2007{
2008        struct dmar_domain *domain;
2009
2010        domain = alloc_domain_mem();
2011        if (!domain)
2012                return NULL;
2013
2014        memset(domain, 0, sizeof(*domain));
2015        domain->nid = NUMA_NO_NODE;
2016        if (first_level_by_default(type))
2017                domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
2018        domain->has_iotlb_device = false;
2019        INIT_LIST_HEAD(&domain->devices);
2020        INIT_LIST_HEAD(&domain->subdevices);
2021
2022        return domain;
2023}
2024
2025/* Must be called with device_domain_lock and iommu->lock held */
2026static int domain_attach_iommu(struct dmar_domain *domain,
2027                               struct intel_iommu *iommu)
2028{
2029        unsigned long ndomains;
2030        int num;
2031
2032        assert_spin_locked(&device_domain_lock);
2033        assert_spin_locked(&iommu->lock);
2034
2035        domain->iommu_refcnt[iommu->seq_id] += 1;
2036        if (domain->iommu_refcnt[iommu->seq_id] == 1) {
2037                ndomains = cap_ndoms(iommu->cap);
2038                num      = find_first_zero_bit(iommu->domain_ids, ndomains);
2039
2040                if (num >= ndomains) {
2041                        pr_err("%s: No free domain ids\n", iommu->name);
2042                        domain->iommu_refcnt[iommu->seq_id] -= 1;
2043                        return -ENOSPC;
2044                }
2045
2046                set_bit(num, iommu->domain_ids);
2047                set_iommu_domain(iommu, num, domain);
2048
2049                domain->iommu_did[iommu->seq_id] = num;
2050                domain->nid                      = iommu->node;
2051
2052                domain_update_iommu_cap(domain);
2053        }
2054
2055        return 0;
2056}
2057
2058static void domain_detach_iommu(struct dmar_domain *domain,
2059                                struct intel_iommu *iommu)
2060{
2061        int num;
2062
2063        assert_spin_locked(&device_domain_lock);
2064        assert_spin_locked(&iommu->lock);
2065
2066        domain->iommu_refcnt[iommu->seq_id] -= 1;
2067        if (domain->iommu_refcnt[iommu->seq_id] == 0) {
2068                num = domain->iommu_did[iommu->seq_id];
2069                clear_bit(num, iommu->domain_ids);
2070                set_iommu_domain(iommu, num, NULL);
2071
2072                domain_update_iommu_cap(domain);
2073                domain->iommu_did[iommu->seq_id] = 0;
2074        }
2075}
2076
2077static inline int guestwidth_to_adjustwidth(int gaw)
2078{
2079        int agaw;
2080        int r = (gaw - 12) % 9;
2081
2082        if (r == 0)
2083                agaw = gaw;
2084        else
2085                agaw = gaw + 9 - r;
2086        if (agaw > 64)
2087                agaw = 64;
2088        return agaw;
2089}
2090
2091static void domain_exit(struct dmar_domain *domain)
2092{
2093
2094        /* Remove associated devices and clear attached or cached domains */
2095        domain_remove_dev_info(domain);
2096
2097        if (domain->pgd) {
2098                struct page *freelist;
2099
2100                freelist = domain_unmap(domain, 0,
2101                                        DOMAIN_MAX_PFN(domain->gaw), NULL);
2102                dma_free_pagelist(freelist);
2103        }
2104
2105        free_domain_mem(domain);
2106}
2107
2108/*
2109 * Get the PASID directory size for scalable mode context entry.
2110 * Value of X in the PDTS field of a scalable mode context entry
2111 * indicates PASID directory with 2^(X + 7) entries.
2112 */
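/*
 * For example, assuming PASID_PDE_SHIFT is 6 (64 PASID-table entries per
 * directory entry) and a maximum of 2^20 PASIDs: max_pde is 2^14,
 * find_first_bit() returns 14 and the function returns 7, encoding a
 * directory of 2^(7 + 7) = 16384 entries.
 */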
2113static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2114{
2115        int pds, max_pde;
2116
2117        max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2118        pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2119        if (pds < 7)
2120                return 0;
2121
2122        return pds - 7;
2123}
2124
2125/*
2126 * Set the RID_PASID field of a scalable mode context entry. The
2127 * IOMMU hardware will use the PASID value set in this field for
2128 * DMA translations of DMA requests without PASID.
2129 */
2130static inline void
2131context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2132{
2133        context->hi |= pasid & ((1 << 20) - 1);
2134}
2135
2136/*
2137 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2138 * entry.
2139 */
2140static inline void context_set_sm_dte(struct context_entry *context)
2141{
2142        context->lo |= (1 << 2);
2143}
2144
2145/*
2146 * Set the PRE(Page Request Enable) field of a scalable mode context
2147 * entry.
2148 */
2149static inline void context_set_sm_pre(struct context_entry *context)
2150{
2151        context->lo |= (1 << 4);
2152}
2153
2154/* Convert value to context PASID directory size field coding. */
2155#define context_pdts(pds)       (((pds) & 0x7) << 9)
2156
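/*
 * Install the context entry for (bus, devfn) on @iommu. In scalable mode the
 * entry points at the device's PASID directory; in legacy mode it points at
 * the domain's second-level page table (or is marked pass-through). Finish
 * with the context/IOTLB flushes that caching mode requires and enable the
 * device IOTLB if it is supported.
 */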
2157static int domain_context_mapping_one(struct dmar_domain *domain,
2158                                      struct intel_iommu *iommu,
2159                                      struct pasid_table *table,
2160                                      u8 bus, u8 devfn)
2161{
2162        u16 did = domain->iommu_did[iommu->seq_id];
2163        int translation = CONTEXT_TT_MULTI_LEVEL;
2164        struct device_domain_info *info = NULL;
2165        struct context_entry *context;
2166        unsigned long flags;
2167        int ret;
2168
2169        WARN_ON(did == 0);
2170
2171        if (hw_pass_through && domain_type_is_si(domain))
2172                translation = CONTEXT_TT_PASS_THROUGH;
2173
2174        pr_debug("Set context mapping for %02x:%02x.%d\n",
2175                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2176
2177        BUG_ON(!domain->pgd);
2178
2179        spin_lock_irqsave(&device_domain_lock, flags);
2180        spin_lock(&iommu->lock);
2181
2182        ret = -ENOMEM;
2183        context = iommu_context_addr(iommu, bus, devfn, 1);
2184        if (!context)
2185                goto out_unlock;
2186
2187        ret = 0;
2188        if (context_present(context))
2189                goto out_unlock;
2190
2191        /*
2192         * For kdump cases, old valid entries may be cached due to the
2193         * in-flight DMA and copied page table, but there is no unmapping
2194         * behaviour for them, so we need an explicit cache flush for the
2195         * newly-mapped device. At this point (kdump), the device is
2196         * expected to have finished its reset during driver probe, so no
2197         * in-flight DMA exists and we don't need to worry about it
2198         * hereafter.
2199         */
2200        if (context_copied(context)) {
2201                u16 did_old = context_domain_id(context);
2202
2203                if (did_old < cap_ndoms(iommu->cap)) {
2204                        iommu->flush.flush_context(iommu, did_old,
2205                                                   (((u16)bus) << 8) | devfn,
2206                                                   DMA_CCMD_MASK_NOBIT,
2207                                                   DMA_CCMD_DEVICE_INVL);
2208                        iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2209                                                 DMA_TLB_DSI_FLUSH);
2210                }
2211        }
2212
2213        context_clear_entry(context);
2214
2215        if (sm_supported(iommu)) {
2216                unsigned long pds;
2217
2218                WARN_ON(!table);
2219
2220                /* Setup the PASID DIR pointer: */
2221                pds = context_get_sm_pds(table);
2222                context->lo = (u64)virt_to_phys(table->table) |
2223                                context_pdts(pds);
2224
2225                /* Setup the RID_PASID field: */
2226                context_set_sm_rid2pasid(context, PASID_RID2PASID);
2227
2228                /*
2229                 * Setup the Device-TLB enable bit and Page request
2230                 * Enable bit:
2231                 */
2232                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2233                if (info && info->ats_supported)
2234                        context_set_sm_dte(context);
2235                if (info && info->pri_supported)
2236                        context_set_sm_pre(context);
2237        } else {
2238                struct dma_pte *pgd = domain->pgd;
2239                int agaw;
2240
2241                context_set_domain_id(context, did);
2242
2243                if (translation != CONTEXT_TT_PASS_THROUGH) {
2244                        /*
2245                         * Skip top levels of page tables for iommu which has
2246                         * less agaw than default. Unnecessary for PT mode.
2247                         */
2248                        for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2249                                ret = -ENOMEM;
2250                                pgd = phys_to_virt(dma_pte_addr(pgd));
2251                                if (!dma_pte_present(pgd))
2252                                        goto out_unlock;
2253                        }
2254
2255                        info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2256                        if (info && info->ats_supported)
2257                                translation = CONTEXT_TT_DEV_IOTLB;
2258                        else
2259                                translation = CONTEXT_TT_MULTI_LEVEL;
2260
2261                        context_set_address_root(context, virt_to_phys(pgd));
2262                        context_set_address_width(context, agaw);
2263                } else {
2264                        /*
2265                         * In pass-through mode, AW must be programmed to
2266                         * indicate the largest AGAW value supported by
2267                         * hardware, and ASR is ignored by hardware.
2268                         */
2269                        context_set_address_width(context, iommu->msagaw);
2270                }
2271
2272                context_set_translation_type(context, translation);
2273        }
2274
2275        context_set_fault_enable(context);
2276        context_set_present(context);
2277        if (!ecap_coherent(iommu->ecap))
2278                clflush_cache_range(context, sizeof(*context));
2279
2280        /*
2281         * It's a non-present to present mapping. If the hardware doesn't
2282         * cache non-present entries we only need to flush the write-buffer.
2283         * If it _does_ cache non-present entries, then it does so in the
2284         * special domain #0, which we have to flush:
2285         */
2286        if (cap_caching_mode(iommu->cap)) {
2287                iommu->flush.flush_context(iommu, 0,
2288                                           (((u16)bus) << 8) | devfn,
2289                                           DMA_CCMD_MASK_NOBIT,
2290                                           DMA_CCMD_DEVICE_INVL);
2291                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2292        } else {
2293                iommu_flush_write_buffer(iommu);
2294        }
2295        iommu_enable_dev_iotlb(info);
2296
2297        ret = 0;
2298
2299out_unlock:
2300        spin_unlock(&iommu->lock);
2301        spin_unlock_irqrestore(&device_domain_lock, flags);
2302
2303        return ret;
2304}
2305
2306struct domain_context_mapping_data {
2307        struct dmar_domain *domain;
2308        struct intel_iommu *iommu;
2309        struct pasid_table *table;
2310};
2311
2312static int domain_context_mapping_cb(struct pci_dev *pdev,
2313                                     u16 alias, void *opaque)
2314{
2315        struct domain_context_mapping_data *data = opaque;
2316
2317        return domain_context_mapping_one(data->domain, data->iommu,
2318                                          data->table, PCI_BUS_NUM(alias),
2319                                          alias & 0xff);
2320}
2321
2322static int
2323domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2324{
2325        struct domain_context_mapping_data data;
2326        struct pasid_table *table;
2327        struct intel_iommu *iommu;
2328        u8 bus, devfn;
2329
2330        iommu = device_to_iommu(dev, &bus, &devfn);
2331        if (!iommu)
2332                return -ENODEV;
2333
2334        table = intel_pasid_get_table(dev);
2335
2336        if (!dev_is_pci(dev))
2337                return domain_context_mapping_one(domain, iommu, table,
2338                                                  bus, devfn);
2339
2340        data.domain = domain;
2341        data.iommu = iommu;
2342        data.table = table;
2343
2344        return pci_for_each_dma_alias(to_pci_dev(dev),
2345                                      &domain_context_mapping_cb, &data);
2346}
2347
2348static int domain_context_mapped_cb(struct pci_dev *pdev,
2349                                    u16 alias, void *opaque)
2350{
2351        struct intel_iommu *iommu = opaque;
2352
2353        return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2354}
2355
2356static int domain_context_mapped(struct device *dev)
2357{
2358        struct intel_iommu *iommu;
2359        u8 bus, devfn;
2360
2361        iommu = device_to_iommu(dev, &bus, &devfn);
2362        if (!iommu)
2363                return -ENODEV;
2364
2365        if (!dev_is_pci(dev))
2366                return device_context_mapped(iommu, bus, devfn);
2367
2368        return !pci_for_each_dma_alias(to_pci_dev(dev),
2369                                       domain_context_mapped_cb, iommu);
2370}
2371
2372/* Return the number of VT-d pages needed, rounded up to cover whole MM pages */
2373static inline unsigned long aligned_nrpages(unsigned long host_addr,
2374                                            size_t size)
2375{
2376        host_addr &= ~PAGE_MASK;
2377        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2378}
2379
2380/* Return largest possible superpage level for a given mapping */
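/*
 * For example, if both iov_pfn and phy_pfn have their low nine bits clear
 * (2MiB aligned) and at least 512 pages are being mapped, level 2 (a 2MiB
 * superpage) is returned, provided domain->iommu_superpage allows it.
 */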
2381static inline int hardware_largepage_caps(struct dmar_domain *domain,
2382                                          unsigned long iov_pfn,
2383                                          unsigned long phy_pfn,
2384                                          unsigned long pages)
2385{
2386        int support, level = 1;
2387        unsigned long pfnmerge;
2388
2389        support = domain->iommu_superpage;
2390
2391        /* To use a large page, the virtual *and* physical addresses
2392           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2393           of them will mean we have to use smaller pages. So just
2394           merge them and check both at once. */
2395        pfnmerge = iov_pfn | phy_pfn;
2396
2397        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2398                pages >>= VTD_STRIDE_SHIFT;
2399                if (!pages)
2400                        break;
2401                pfnmerge >>= VTD_STRIDE_SHIFT;
2402                level++;
2403                support--;
2404        }
2405        return level;
2406}
2407
2408/*
2409 * Ensure that old small page tables are removed to make room for superpage(s).
2410 * We're going to add new large pages, so make sure we don't remove their parent
2411 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2412 */
2413static void switch_to_super_page(struct dmar_domain *domain,
2414                                 unsigned long start_pfn,
2415                                 unsigned long end_pfn, int level)
2416{
2417        unsigned long lvl_pages = lvl_to_nr_pages(level);
2418        struct dma_pte *pte = NULL;
2419        int i;
2420
2421        while (start_pfn <= end_pfn) {
2422                if (!pte)
2423                        pte = pfn_to_dma_pte(domain, start_pfn, &level);
2424
2425                if (dma_pte_present(pte)) {
2426                        dma_pte_free_pagetable(domain, start_pfn,
2427                                               start_pfn + lvl_pages - 1,
2428                                               level + 1);
2429
2430                        for_each_domain_iommu(i, domain)
2431                                iommu_flush_iotlb_psi(g_iommus[i], domain,
2432                                                      start_pfn, lvl_pages,
2433                                                      0, 0);
2434                }
2435
2436                pte++;
2437                start_pfn += lvl_pages;
2438                if (first_pte_in_page(pte))
2439                        pte = NULL;
2440        }
2441}
2442
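/*
 * Core mapping loop: walk the IOVA range, pick the largest superpage level
 * usable for the current chunk, and install PTEs with cmpxchg so that an
 * unexpectedly present PTE is detected and reported rather than silently
 * overwritten. domain_flush_cache() flushes the CPU cache for each page of
 * PTEs written when the IOMMU is not cache coherent.
 */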
2443static int
2444__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2445                 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2446{
2447        struct dma_pte *first_pte = NULL, *pte = NULL;
2448        unsigned int largepage_lvl = 0;
2449        unsigned long lvl_pages = 0;
2450        phys_addr_t pteval;
2451        u64 attr;
2452
2453        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2454
2455        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2456                return -EINVAL;
2457
2458        attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2459        attr |= DMA_FL_PTE_PRESENT;
2460        if (domain_use_first_level(domain)) {
2461                attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2462                if (prot & DMA_PTE_WRITE)
2463                        attr |= DMA_FL_PTE_DIRTY;
2464        }
2465
2466        pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2467
2468        while (nr_pages > 0) {
2469                uint64_t tmp;
2470
2471                if (!pte) {
2472                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2473                                        phys_pfn, nr_pages);
2474
2475                        pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2476                        if (!pte)
2477                                return -ENOMEM;
2478                        first_pte = pte;
2479
2480                        lvl_pages = lvl_to_nr_pages(largepage_lvl);
2481
2482                        /* It is a large page */
2483                        if (largepage_lvl > 1) {
2484                                unsigned long end_pfn;
2485                                unsigned long pages_to_remove;
2486
2487                                pteval |= DMA_PTE_LARGE_PAGE;
2488                                pages_to_remove = min_t(unsigned long, nr_pages,
2489                                                        nr_pte_to_next_page(pte) * lvl_pages);
2490                                end_pfn = iov_pfn + pages_to_remove - 1;
2491                                switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2492                        } else {
2493                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2494                        }
2495
2496                }
2497                /* We don't need a lock here; nobody else
2498                 * touches this IOVA range
2499                 */
2500                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2501                if (tmp) {
2502                        static int dumps = 5;
2503                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2504                                iov_pfn, tmp, (unsigned long long)pteval);
2505                        if (dumps) {
2506                                dumps--;
2507                                debug_dma_dump_mappings(NULL);
2508                        }
2509                        WARN_ON(1);
2510                }
2511
2512                nr_pages -= lvl_pages;
2513                iov_pfn += lvl_pages;
2514                phys_pfn += lvl_pages;
2515                pteval += lvl_pages * VTD_PAGE_SIZE;
2516
2517                /* If the next PTE would be the first in a new page, then we
2518                 * need to flush the cache on the entries we've just written.
2519                 * And then we'll need to recalculate 'pte', so clear it and
2520                 * let it get set again in the if (!pte) block above.
2521                 *
2522                 * If we're done (!nr_pages) we need to flush the cache too.
2523                 *
2524                 * Also if we've been setting superpages, we may need to
2525                 * recalculate 'pte' and switch back to smaller pages for the
2526                 * end of the mapping, if the trailing size is not enough to
2527                 * use another superpage (i.e. nr_pages < lvl_pages).
2528                 */
2529                pte++;
2530                if (!nr_pages || first_pte_in_page(pte) ||
2531                    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2532                        domain_flush_cache(domain, first_pte,
2533                                           (void *)pte - (void *)first_pte);
2534                        pte = NULL;
2535                }
2536        }
2537
2538        return 0;
2539}
2540
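/*
 * Tear down the context entry for (bus, devfn) and perform the required
 * invalidations afterwards: context-cache, PASID-cache (scalable mode only),
 * a domain-selective IOTLB flush and a device-TLB flush.
 */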
2541static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2542{
2543        struct intel_iommu *iommu = info->iommu;
2544        struct context_entry *context;
2545        unsigned long flags;
2546        u16 did_old;
2547
2548        if (!iommu)
2549                return;
2550
2551        spin_lock_irqsave(&iommu->lock, flags);
2552        context = iommu_context_addr(iommu, bus, devfn, 0);
2553        if (!context) {
2554                spin_unlock_irqrestore(&iommu->lock, flags);
2555                return;
2556        }
2557
2558        if (sm_supported(iommu)) {
2559                if (hw_pass_through && domain_type_is_si(info->domain))
2560                        did_old = FLPT_DEFAULT_DID;
2561                else
2562                        did_old = info->domain->iommu_did[iommu->seq_id];
2563        } else {
2564                did_old = context_domain_id(context);
2565        }
2566
2567        context_clear_entry(context);
2568        __iommu_flush_cache(iommu, context, sizeof(*context));
2569        spin_unlock_irqrestore(&iommu->lock, flags);
2570        iommu->flush.flush_context(iommu,
2571                                   did_old,
2572                                   (((u16)bus) << 8) | devfn,
2573                                   DMA_CCMD_MASK_NOBIT,
2574                                   DMA_CCMD_DEVICE_INVL);
2575
2576        if (sm_supported(iommu))
2577                qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2578
2579        iommu->flush.flush_iotlb(iommu,
2580                                 did_old,
2581                                 0,
2582                                 0,
2583                                 DMA_TLB_DSI_FLUSH);
2584
2585        __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2586}
2587
2588static inline void unlink_domain_info(struct device_domain_info *info)
2589{
2590        assert_spin_locked(&device_domain_lock);
2591        list_del(&info->link);
2592        list_del(&info->global);
2593        if (info->dev)
2594                dev_iommu_priv_set(info->dev, NULL);
2595}
2596
2597static void domain_remove_dev_info(struct dmar_domain *domain)
2598{
2599        struct device_domain_info *info, *tmp;
2600        unsigned long flags;
2601
2602        spin_lock_irqsave(&device_domain_lock, flags);
2603        list_for_each_entry_safe(info, tmp, &domain->devices, link)
2604                __dmar_remove_one_dev_info(info);
2605        spin_unlock_irqrestore(&device_domain_lock, flags);
2606}
2607
2608struct dmar_domain *find_domain(struct device *dev)
2609{
2610        struct device_domain_info *info;
2611
2612        if (unlikely(!dev || !dev->iommu))
2613                return NULL;
2614
2615        if (unlikely(attach_deferred(dev)))
2616                return NULL;
2617
2618        /* No lock here, assumes no domain exit in normal case */
2619        info = get_domain_info(dev);
2620        if (likely(info))
2621                return info->domain;
2622
2623        return NULL;
2624}
2625
2626static inline struct device_domain_info *
2627dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2628{
2629        struct device_domain_info *info;
2630
2631        list_for_each_entry(info, &device_domain_list, global)
2632                if (info->segment == segment && info->bus == bus &&
2633                    info->devfn == devfn)
2634                        return info;
2635
2636        return NULL;
2637}
2638
2639static int domain_setup_first_level(struct intel_iommu *iommu,
2640                                    struct dmar_domain *domain,
2641                                    struct device *dev,
2642                                    u32 pasid)
2643{
2644        struct dma_pte *pgd = domain->pgd;
2645        int agaw, level;
2646        int flags = 0;
2647
2648        /*
2649         * Skip top levels of page tables for iommu which has
2650         * less agaw than default. Unnecessary for PT mode.
2651         */
2652        for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2653                pgd = phys_to_virt(dma_pte_addr(pgd));
2654                if (!dma_pte_present(pgd))
2655                        return -ENOMEM;
2656        }
2657
2658        level = agaw_to_level(agaw);
2659        if (level != 4 && level != 5)
2660                return -EINVAL;
2661
2662        if (pasid != PASID_RID2PASID)
2663                flags |= PASID_FLAG_SUPERVISOR_MODE;
2664        if (level == 5)
2665                flags |= PASID_FLAG_FL5LP;
2666
2667        if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2668                flags |= PASID_FLAG_PAGE_SNOOP;
2669
2670        return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2671                                             domain->iommu_did[iommu->seq_id],
2672                                             flags);
2673}
2674
2675static bool dev_is_real_dma_subdevice(struct device *dev)
2676{
2677        return dev && dev_is_pci(dev) &&
2678               pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2679}
2680
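/*
 * Bind a device to a domain: allocate and fill its device_domain_info, probe
 * ATS/PASID/PRI support, attach the domain to this IOMMU (allocating a
 * domain-id on first use), set up the RID2PASID entry in scalable mode and
 * finally write the context entry. If the device already has a domain, that
 * existing domain is returned instead and the caller must free the one it
 * passed in.
 */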
2681static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2682                                                    int bus, int devfn,
2683                                                    struct device *dev,
2684                                                    struct dmar_domain *domain)
2685{
2686        struct dmar_domain *found = NULL;
2687        struct device_domain_info *info;
2688        unsigned long flags;
2689        int ret;
2690
2691        info = alloc_devinfo_mem();
2692        if (!info)
2693                return NULL;
2694
2695        if (!dev_is_real_dma_subdevice(dev)) {
2696                info->bus = bus;
2697                info->devfn = devfn;
2698                info->segment = iommu->segment;
2699        } else {
2700                struct pci_dev *pdev = to_pci_dev(dev);
2701
2702                info->bus = pdev->bus->number;
2703                info->devfn = pdev->devfn;
2704                info->segment = pci_domain_nr(pdev->bus);
2705        }
2706
2707        info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2708        info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2709        info->ats_qdep = 0;
2710        info->dev = dev;
2711        info->domain = domain;
2712        info->iommu = iommu;
2713        info->pasid_table = NULL;
2714        info->auxd_enabled = 0;
2715        INIT_LIST_HEAD(&info->subdevices);
2716
2717        if (dev && dev_is_pci(dev)) {
2718                struct pci_dev *pdev = to_pci_dev(info->dev);
2719
2720                if (ecap_dev_iotlb_support(iommu->ecap) &&
2721                    pci_ats_supported(pdev) &&
2722                    dmar_find_matched_atsr_unit(pdev))
2723                        info->ats_supported = 1;
2724
2725                if (sm_supported(iommu)) {
2726                        if (pasid_supported(iommu)) {
2727                                int features = pci_pasid_features(pdev);
2728                                if (features >= 0)
2729                                        info->pasid_supported = features | 1;
2730                        }
2731
2732                        if (info->ats_supported && ecap_prs(iommu->ecap) &&
2733                            pci_pri_supported(pdev))
2734                                info->pri_supported = 1;
2735                }
2736        }
2737
2738        spin_lock_irqsave(&device_domain_lock, flags);
2739        if (dev)
2740                found = find_domain(dev);
2741
2742        if (!found) {
2743                struct device_domain_info *info2;
2744                info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2745                                                       info->devfn);
2746                if (info2) {
2747                        found      = info2->domain;
2748                        info2->dev = dev;
2749                }
2750        }
2751
2752        if (found) {
2753                spin_unlock_irqrestore(&device_domain_lock, flags);
2754                free_devinfo_mem(info);
2755                /* Caller must free the original domain */
2756                return found;
2757        }
2758
2759        spin_lock(&iommu->lock);
2760        ret = domain_attach_iommu(domain, iommu);
2761        spin_unlock(&iommu->lock);
2762
2763        if (ret) {
2764                spin_unlock_irqrestore(&device_domain_lock, flags);
2765                free_devinfo_mem(info);
2766                return NULL;
2767        }
2768
2769        list_add(&info->link, &domain->devices);
2770        list_add(&info->global, &device_domain_list);
2771        if (dev)
2772                dev_iommu_priv_set(dev, info);
2773        spin_unlock_irqrestore(&device_domain_lock, flags);
2774
2775        /* PASID table is mandatory for a PCI device in scalable mode. */
2776        if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2777                ret = intel_pasid_alloc_table(dev);
2778                if (ret) {
2779                        dev_err(dev, "PASID table allocation failed\n");
2780                        dmar_remove_one_dev_info(dev);
2781                        return NULL;
2782                }
2783
2784                /* Setup the PASID entry for requests without PASID: */
2785                spin_lock_irqsave(&iommu->lock, flags);
2786                if (hw_pass_through && domain_type_is_si(domain))
2787                        ret = intel_pasid_setup_pass_through(iommu, domain,
2788                                        dev, PASID_RID2PASID);
2789                else if (domain_use_first_level(domain))
2790                        ret = domain_setup_first_level(iommu, domain, dev,
2791                                        PASID_RID2PASID);
2792                else
2793                        ret = intel_pasid_setup_second_level(iommu, domain,
2794                                        dev, PASID_RID2PASID);
2795                spin_unlock_irqrestore(&iommu->lock, flags);
2796                if (ret) {
2797                        dev_err(dev, "Setup RID2PASID failed\n");
2798                        dmar_remove_one_dev_info(dev);
2799                        return NULL;
2800                }
2801        }
2802
2803        if (dev && domain_context_mapping(domain, dev)) {
2804                dev_err(dev, "Domain context map failed\n");
2805                dmar_remove_one_dev_info(dev);
2806                return NULL;
2807        }
2808
2809        return domain;
2810}
2811
2812static int iommu_domain_identity_map(struct dmar_domain *domain,
2813                                     unsigned long first_vpfn,
2814                                     unsigned long last_vpfn)
2815{
2816        /*
2817         * The RMRR range might overlap with the physical memory range,
2818         * so clear it first.
2819         */
2820        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2821
2822        return __domain_mapping(domain, first_vpfn,
2823                                first_vpfn, last_vpfn - first_vpfn + 1,
2824                                DMA_PTE_READ|DMA_PTE_WRITE);
2825}
2826
2827static int md_domain_init(struct dmar_domain *domain, int guest_width);
2828
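/*
 * Build the static identity (si) domain: identity-map every usable physical
 * memory range reported by memblock as well as the RMRR regions, so devices
 * using the identity domain keep working. With hardware pass-through
 * (hw != 0) no page-table mappings are needed at all.
 */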
2829static int __init si_domain_init(int hw)
2830{
2831        struct dmar_rmrr_unit *rmrr;
2832        struct device *dev;
2833        int i, nid, ret;
2834
2835        si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2836        if (!si_domain)
2837                return -EFAULT;
2838
2839        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2840                domain_exit(si_domain);
2841                return -EFAULT;
2842        }
2843
2844        if (hw)
2845                return 0;
2846
2847        for_each_online_node(nid) {
2848                unsigned long start_pfn, end_pfn;
2849                int i;
2850
2851                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2852                        ret = iommu_domain_identity_map(si_domain,
2853                                        mm_to_dma_pfn(start_pfn),
2854                                        mm_to_dma_pfn(end_pfn));
2855                        if (ret)
2856                                return ret;
2857                }
2858        }
2859
2860        /*
2861         * Identity map the RMRRs so that devices with RMRRs can also use
2862         * the si_domain.
2863         */
2864        for_each_rmrr_units(rmrr) {
2865                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2866                                          i, dev) {
2867                        unsigned long long start = rmrr->base_address;
2868                        unsigned long long end = rmrr->end_address;
2869
2870                        if (WARN_ON(end < start ||
2871                                    end >> agaw_to_width(si_domain->agaw)))
2872                                continue;
2873
2874                        ret = iommu_domain_identity_map(si_domain,
2875                                        mm_to_dma_pfn(start >> PAGE_SHIFT),
2876                                        mm_to_dma_pfn(end >> PAGE_SHIFT));
2877                        if (ret)
2878                                return ret;
2879                }
2880        }
2881
2882        return 0;
2883}
2884
2885static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2886{
2887        struct dmar_domain *ndomain;
2888        struct intel_iommu *iommu;
2889        u8 bus, devfn;
2890
2891        iommu = device_to_iommu(dev, &bus, &devfn);
2892        if (!iommu)
2893                return -ENODEV;
2894
2895        ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2896        if (ndomain != domain)
2897                return -EBUSY;
2898
2899        return 0;
2900}
2901
2902static bool device_has_rmrr(struct device *dev)
2903{
2904        struct dmar_rmrr_unit *rmrr;
2905        struct device *tmp;
2906        int i;
2907
2908        rcu_read_lock();
2909        for_each_rmrr_units(rmrr) {
2910                /*
2911                 * Return TRUE if this RMRR contains the device that
2912                 * is passed in.
2913                 */
2914                for_each_active_dev_scope(rmrr->devices,
2915                                          rmrr->devices_cnt, i, tmp)
2916                        if (tmp == dev ||
2917                            is_downstream_to_pci_bridge(dev, tmp)) {
2918                                rcu_read_unlock();
2919                                return true;
2920                        }
2921        }
2922        rcu_read_unlock();
2923        return false;
2924}
2925
2926/**
2927 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2928 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2929 * @dev: device handle
2930 *
2931 * We assume that PCI USB devices with RMRRs have them largely
2932 * for historical reasons and that the RMRR space is not actively used post
2933 * boot.  This exclusion may change if vendors begin to abuse it.
2934 *
2935 * The same exception is made for graphics devices, with the requirement that
2936 * any use of the RMRR regions will be torn down before assigning the device
2937 * to a guest.
2938 *
2939 * Return: true if the RMRR is relaxable, false otherwise
2940 */
2941static bool device_rmrr_is_relaxable(struct device *dev)
2942{
2943        struct pci_dev *pdev;
2944
2945        if (!dev_is_pci(dev))
2946                return false;
2947
2948        pdev = to_pci_dev(dev);
2949        if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2950                return true;
2951        else
2952                return false;
2953}
2954
2955/*
2956 * There are a couple of cases where we need to restrict the functionality of
2957 * devices associated with RMRRs.  The first is when evaluating a device for
2958 * identity mapping because problems exist when devices are moved in and out
2959 * of domains and their respective RMRR information is lost.  This means that
2960 * a device with associated RMRRs will never be in a "passthrough" domain.
2961 * The second is use of the device through the IOMMU API.  This interface
2962 * expects to have full control of the IOVA space for the device.  We cannot
2963 * satisfy both the requirement that RMRR access is maintained and have an
2964 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2965 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2966 * We therefore prevent devices associated with an RMRR from participating in
2967 * the IOMMU API, which eliminates them from device assignment.
2968 *
2969 * In both cases, devices which have relaxable RMRRs are not concerned by this
2970 * restriction. See device_rmrr_is_relaxable comment.
2971 */
2972static bool device_is_rmrr_locked(struct device *dev)
2973{
2974        if (!device_has_rmrr(dev))
2975                return false;
2976
2977        if (device_rmrr_is_relaxable(dev))
2978                return false;
2979
2980        return true;
2981}
2982
2983/*
2984 * Return the required default domain type for a specific device.
2985 *
2986 * @dev: the device in query
2988 *
2989 * Returns:
2990 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2991 *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2992 *  - 0: both identity and dynamic domains work for this device
2993 */
2994static int device_def_domain_type(struct device *dev)
2995{
2996        if (dev_is_pci(dev)) {
2997                struct pci_dev *pdev = to_pci_dev(dev);
2998
2999                if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3000                        return IOMMU_DOMAIN_IDENTITY;
3001
3002                if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3003                        return IOMMU_DOMAIN_IDENTITY;
3004        }
3005
3006        return 0;
3007}
3008
3009static void intel_iommu_init_qi(struct intel_iommu *iommu)
3010{
3011        /*
3012         * Start from a sane IOMMU hardware state.
3013         * If queued invalidation was already initialized by us
3014         * (for example, while enabling interrupt remapping), then
3015         * things are already rolling from a sane state.
3016         */
3017        if (!iommu->qi) {
3018                /*
3019                 * Clear any previous faults.
3020                 */
3021                dmar_fault(-1, iommu);
3022                /*
3023                 * Disable queued invalidation if supported and already enabled
3024                 * before OS handover.
3025                 */
3026                dmar_disable_qi(iommu);
3027        }
3028
3029        if (dmar_enable_qi(iommu)) {
3030                /*
3031                 * Queued invalidation not enabled, use register-based invalidation
3032                 */
3033                iommu->flush.flush_context = __iommu_flush_context;
3034                iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3035                pr_info("%s: Using Register based invalidation\n",
3036                        iommu->name);
3037        } else {
3038                iommu->flush.flush_context = qi_flush_context;
3039                iommu->flush.flush_iotlb = qi_flush_iotlb;
3040                pr_info("%s: Using Queued invalidation\n", iommu->name);
3041        }
3042}
3043
3044static int copy_context_table(struct intel_iommu *iommu,
3045                              struct root_entry *old_re,
3046                              struct context_entry **tbl,
3047                              int bus, bool ext)
3048{
3049        int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3050        struct context_entry *new_ce = NULL, ce;
3051        struct context_entry *old_ce = NULL;
3052        struct root_entry re;
3053        phys_addr_t old_ce_phys;
3054
3055        tbl_idx = ext ? bus * 2 : bus;
3056        memcpy(&re, old_re, sizeof(re));
3057
3058        for (devfn = 0; devfn < 256; devfn++) {
3059                /* First calculate the correct index */
3060                idx = (ext ? devfn * 2 : devfn) % 256;
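                    /*
                     * With extended (ECS) root/context tables each context
                     * entry is 256 bits, twice the size of struct
                     * context_entry, so every devfn occupies two slots and
                     * each bus needs two context tables: devfns 0x00-0x7f
                     * come from the lower context-table pointer, 0x80-0xff
                     * from the upper one (root_entry_lctp()/_uctp() below).
                     */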
3061
3062                if (idx == 0) {
3063                        /* First save what we may have and clean up */
3064                        if (new_ce) {
3065                                tbl[tbl_idx] = new_ce;
3066                                __iommu_flush_cache(iommu, new_ce,
3067                                                    VTD_PAGE_SIZE);
3068                                pos = 1;
3069                        }
3070
3071                        if (old_ce)
3072                                memunmap(old_ce);
3073
3074                        ret = 0;
3075                        if (devfn < 0x80)
3076                                old_ce_phys = root_entry_lctp(&re);
3077                        else
3078                                old_ce_phys = root_entry_uctp(&re);
3079
3080                        if (!old_ce_phys) {
3081                                if (ext && devfn == 0) {
3082                                        /* No LCTP, try UCTP */
3083                                        devfn = 0x7f;
3084                                        continue;
3085                                } else {
3086                                        goto out;
3087                                }
3088                        }
3089
3090                        ret = -ENOMEM;
3091                        old_ce = memremap(old_ce_phys, PAGE_SIZE,
3092                                        MEMREMAP_WB);
3093                        if (!old_ce)
3094                                goto out;
3095
3096                        new_ce = alloc_pgtable_page(iommu->node);
3097                        if (!new_ce)
3098                                goto out_unmap;
3099
3100                        ret = 0;
3101                }
3102
3103                /* Now copy the context entry */
3104                memcpy(&ce, old_ce + idx, sizeof(ce));
3105
3106                if (!__context_present(&ce))
3107                        continue;
3108
3109                did = context_domain_id(&ce);
3110                if (did >= 0 && did < cap_ndoms(iommu->cap))
3111                        set_bit(did, iommu->domain_ids);
3112
3113                /*
3114                 * We need a marker for copied context entries. This
3115                 * marker needs to work for the old format as well as
3116                 * for extended context entries.
3117                 *
3118                 * Bit 67 of the context entry is used. In the old
3119                 * format this bit is available to software, in the
3120                 * extended format it is the PGE bit, but PGE is ignored
3121                 * by HW if PASIDs are disabled (and thus still
3122                 * available).
3123                 *
3124                 * So disable PASIDs first and then mark the entry
3125                 * copied. This means that we don't copy PASID
3126                 * translations from the old kernel, but this is fine as
3127                 * faults there are not fatal.
3128                 */
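                    /*
                     * In this 128-bit-slot layout, bit 67 lands in the upper
                     * 64-bit word of the entry at position 67 - 64 = 3, which
                     * is the bit context_set_copied() sets.
                     */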
3129                context_clear_pasid_enable(&ce);
3130                context_set_copied(&ce);
3131
3132                new_ce[idx] = ce;
3133        }
3134
3135        tbl[tbl_idx + pos] = new_ce;
3136
3137        __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3138
3139out_unmap:
3140        memunmap(old_ce);
3141
3142out:
3143        return ret;
3144}
3145
3146static int copy_translation_tables(struct intel_iommu *iommu)
3147{
3148        struct context_entry **ctxt_tbls;
3149        struct root_entry *old_rt;
3150        phys_addr_t old_rt_phys;
3151        int ctxt_table_entries;
3152        unsigned long flags;
3153        u64 rtaddr_reg;
3154        int bus, ret;
3155        bool new_ext, ext;
3156
3157        rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3158        ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3159        new_ext    = !!ecap_ecs(iommu->ecap);
3160
3161        /*
3162         * The RTT bit can only be changed when translation is disabled,
3163         * but disabling translation would open a window for data
3164         * corruption. So bail out and don't copy anything if we would
3165         * have to change the bit.
3166         */
3167        if (new_ext != ext)
3168                return -EINVAL;
3169
3170        old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3171        if (!old_rt_phys)
3172                return -EINVAL;
3173
3174        old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3175        if (!old_rt)
3176                return -ENOMEM;
3177
3178        /* This is too big for the stack - allocate it from slab */
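            /* Extended mode needs two context tables per bus, hence 512. */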
3179        ctxt_table_entries = ext ? 512 : 256;
3180        ret = -ENOMEM;
3181        ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3182        if (!ctxt_tbls)
3183                goto out_unmap;
3184
3185        for (bus = 0; bus < 256; bus++) {
3186                ret = copy_context_table(iommu, &old_rt[bus],
3187                                         ctxt_tbls, bus, ext);
3188                if (ret) {
3189                        pr_err("%s: Failed to copy context table for bus %d\n",
3190                                iommu->name, bus);
3191                        continue;
3192                }
3193        }
3194
3195        spin_lock_irqsave(&iommu->lock, flags);
3196
3197        /* Context tables are copied, now write them to the root_entry table */
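            /*
             * In legacy mode root_entry.lo holds the single context-table
             * pointer; in extended mode .lo points to the lower context
             * table and .hi to the upper one.  Bit 0 is the present bit.
             */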
3198        for (bus = 0; bus < 256; bus++) {
3199                int idx = ext ? bus * 2 : bus;
3200                u64 val;
3201
3202                if (ctxt_tbls[idx]) {
3203                        val = virt_to_phys(ctxt_tbls[idx]) | 1;
3204                        iommu->root_entry[bus].lo = val;
3205                }
3206
3207                if (!ext || !ctxt_tbls[idx + 1])
3208                        continue;
3209
3210                val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3211                iommu->root_entry[bus].hi = val;
3212        }
3213
3214        spin_unlock_irqrestore(&iommu->lock, flags);
3215
3216        kfree(ctxt_tbls);
3217
3218        __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3219
3220        ret = 0;
3221
3222out_unmap:
3223        memunmap(old_rt);
3224
3225        return ret;
3226}
3227
3228#ifdef CONFIG_INTEL_IOMMU_SVM
3229static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3230{
3231        struct intel_iommu *iommu = data;
3232        ioasid_t ioasid;
3233
3234        if (!iommu)
3235                return INVALID_IOASID;
3236        /*
3237         * The VT-d virtual command interface always uses the full 20-bit
3238         * PASID range. The host can partition the guest PASID range based
3239         * on policies, but that is outside of the guest's control.
3240         */
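            /* A 20-bit PASID space allows at most 1 << 20 = 1048576 PASIDs. */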
3241        if (min < PASID_MIN || max > intel_pasid_max_id)
3242                return INVALID_IOASID;
3243
3244        if (vcmd_alloc_pasid(iommu, &ioasid))
3245                return INVALID_IOASID;
3246
3247        return ioasid;
3248}
3249
3250static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3251{
3252        struct intel_iommu *iommu = data;
3253
3254        if (!iommu)
3255                return;
3256        /*
3257         * The sanity check of the IOASID owner is done at the upper layer,
3258         * e.g. VFIO. We can only free the PASID when all devices are unbound.
3259         */
3260        if (ioasid_find(NULL, ioasid, NULL)) {
3261                pr_alert("Cannot free active IOASID %d\n", ioasid);
3262                return;
3263        }
3264        vcmd_free_pasid(iommu, ioasid);
3265}
3266
3267static void register_pasid_allocator(struct intel_iommu *iommu)
3268{
3269        /*
3270         * If we are running in the host, there is no need for a custom
3271         * allocator because PASIDs are allocated from the host system-wide.
3272         */
3273        if (!cap_caching_mode(iommu->cap))
3274                return;
3275
3276        if (!sm_supported(iommu)) {
3277                pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3278                return;
3279        }
3280
3281        /*
3282         * Register a custom PASID allocator if we are running in a guest;
3283         * guest PASIDs must be obtained via the virtual command interface.
3284         * There can be multiple vIOMMUs in each guest but only one allocator
3285         * is active. All vIOMMU allocators will eventually be calling the same
3286         * host allocator.
3287         */
3288        if (!vccap_pasid(iommu->vccap))
3289                return;
3290
3291        pr_info("Register custom PASID allocator\n");
3292        iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3293        iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3294        iommu->pasid_allocator.pdata = (void *)iommu;
3295        if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3296                pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3297                /*
3298                 * Disable scalable mode on this IOMMU if there
3299                 * is no custom allocator. Mixing SM-capable vIOMMUs
3300                 * and non-SM vIOMMUs is not supported.
3301                 */
3302                intel_iommu_sm = 0;
3303        }
3304}
3305#endif
3306
3307static int __init init_dmars(void)
3308{
3309        struct dmar_drhd_unit *drhd;
3310        struct intel_iommu *iommu;
3311        int ret;
3312
3313        /*
3314         * for each drhd
3315         *    allocate root
3316         *    initialize and program root entry to not present
3317         * endfor
3318         */
3319        for_each_drhd_unit(drhd) {
3320                /*
3321                 * Lock not needed as this is only incremented in the single-
3322                 * threaded kernel __init code path; all other accesses are
3323                 * read-only.
3324                 */
3325                if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3326                        g_num_of_iommus++;
3327                        continue;
3328                }
3329                pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3330        }
3331
3332        /* Preallocate enough resources for IOMMU hot-addition */
3333        if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3334                g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3335
3336        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3337                        GFP_KERNEL);
3338        if (!g_iommus) {
3339                ret = -ENOMEM;
3340                goto error;
3341        }
3342
3343        ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3344        if (ret)
3345                goto free_iommu;
3346
3347        for_each_iommu(iommu, drhd) {
3348                if (drhd->ignored) {
3349                        iommu_disable_translation(iommu);
3350                        continue;
3351                }
3352
3353                /*
3354                 * Find the max PASID size of all IOMMUs in the system.
3355                 * We need to ensure the system PASID table is no bigger
3356                 * than the smallest supported size.
3357                 */
3358                if (pasid_supported(iommu)) {
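                            /*
                             * ecap_pss() reports the supported PASID width
                             * minus one, so 2 << pss is the number of PASIDs
                             * this IOMMU supports.
                             */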
3359                        u32 temp = 2 << ecap_pss(iommu->ecap);
3360
3361                        intel_pasid_max_id = min_t(u32, temp,
3362                                                   intel_pasid_max_id);
3363                }
3364
3365                g_iommus[iommu->seq_id] = iommu;
3366
3367                intel_iommu_init_qi(iommu);
3368
3369                ret = iommu_init_domains(iommu);
3370                if (ret)
3371                        goto free_iommu;
3372
3373                init_translation_status(iommu);
3374
3375                if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3376                        iommu_disable_translation(iommu);
3377                        clear_translation_pre_enabled(iommu);
3378                        pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3379                                iommu->name);
3380                }
3381
3382                /*
3383                 * TBD:
3384                 * we could share the same root & context tables
3385                 * among all IOMMUs; need to split this out later.
3386                 */
3387                ret = iommu_alloc_root_entry(iommu);
3388                if (ret)
3389                        goto free_iommu;
3390
3391                if (translation_pre_enabled(iommu)) {
3392                        pr_info("Translation already enabled - trying to copy translation structures\n");
3393
3394                        ret = copy_translation_tables(iommu);
3395                        if (ret) {
3396                                /*
3397                                 * We found the IOMMU with translation
3398                                 * enabled - but failed to copy over the
3399                                 * old root-entry table. Try to proceed
3400                                 * by disabling translation now and
3401                                 * allocating a clean root-entry table.
3402                                 * This might cause DMAR faults, but
3403                                 * probably the dump will still succeed.
3404                                 */
3405                                pr_err("Failed to copy translation tables from previous kernel for %s\n",
3406                                       iommu->name);
3407                                iommu_disable_translation(iommu);
3408                                clear_translation_pre_enabled(iommu);
3409                        } else {
3410                                pr_info("Copied translation tables from previous kernel for %s\n",
3411                                        iommu->name);
3412                        }
3413                }
3414
3415                if (!ecap_pass_through(iommu->ecap))
3416                        hw_pass_through = 0;
3417                intel_svm_check(iommu);
3418        }
3419
3420        /*
3421         * Now that qi is enabled on all iommus, set the root entry and flush
3422         * caches. This is required on some Intel X58 chipsets, otherwise the
3423         * flush_context function will loop forever and the boot hangs.
3424         */
3425        for_each_active_iommu(iommu, drhd) {
3426                iommu_flush_write_buffer(iommu);
3427#ifdef CONFIG_INTEL_IOMMU_SVM
3428                register_pasid_allocator(iommu);
3429#endif
3430                iommu_set_root_entry(iommu);
3431        }
3432
3433#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3434        dmar_map_gfx = 0;
3435#endif
3436
3437        if (!dmar_map_gfx)
3438                iommu_identity_mapping |= IDENTMAP_GFX;
3439
3440        check_tylersburg_isoch();
3441
3442        ret = si_domain_init(hw_pass_through);
3443        if (ret)
3444                goto free_iommu;
3445
3446        /*
3447         * for each drhd
3448         *   enable fault log
3449         *   global invalidate context cache
3450         *   global invalidate iotlb
3451         *   enable translation
3452         */
3453        for_each_iommu(iommu, drhd) {
3454                if (drhd->ignored) {
3455                        /*
3456                         * we always have to disable PMRs or DMA may fail on
3457                         * this device
3458                         */
3459                        if (force_on)
3460                                iommu_disable_protect_mem_regions(iommu);
3461                        continue;
3462                }
3463
3464                iommu_flush_write_buffer(iommu);
3465
3466#ifdef CONFIG_INTEL_IOMMU_SVM
3467                if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3468                        /*
3469                         * Calling dmar_alloc_hwirq() with dmar_global_lock
3470                         * held could cause a lock race, so drop the lock here.
3471                         */
3472                        up_write(&dmar_global_lock);
3473                        ret = intel_svm_enable_prq(iommu);
3474                        down_write(&dmar_global_lock);
3475                        if (ret)
3476                                goto free_iommu;
3477                }
3478#endif
3479                ret = dmar_set_interrupt(iommu);
3480                if (ret)
3481                        goto free_iommu;
3482        }
3483
3484        return 0;
3485
3486free_iommu:
3487        for_each_active_iommu(iommu, drhd) {
3488                disable_dmar_iommu(iommu);
3489                free_dmar_iommu(iommu);
3490        }
3491
3492        kfree(g_iommus);
3493
3494error:
3495        return ret;
3496}
3497
3498static inline int iommu_domain_cache_init(void)
3499{
3500        int ret = 0;
3501
3502        iommu_domain_cache = kmem_cache_create("iommu_domain",
3503                                         sizeof(struct dmar_domain),
3504                                         0,
3505                                         SLAB_HWCACHE_ALIGN,
3506
3507                                         NULL);
3508        if (!iommu_domain_cache) {
3509                pr_err("Couldn't create iommu_domain cache\n");
3510                ret = -ENOMEM;
3511        }
3512
3513        return ret;
3514}
3515
3516static inline int iommu_devinfo_cache_init(void)
3517{
3518        int ret = 0;
3519
3520        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3521                                         sizeof(struct device_domain_info),
3522                                         0,
3523                                         SLAB_HWCACHE_ALIGN,
3524                                         NULL);
3525        if (!iommu_devinfo_cache) {
3526                pr_err("Couldn't create devinfo cache\n");
3527                ret = -ENOMEM;
3528        }
3529
3530        return ret;
3531}
3532
3533static int __init iommu_init_mempool(void)
3534{
3535        int ret;
3536        ret = iova_cache_get();
3537        if (ret)
3538                return ret;
3539
3540        ret = iommu_domain_cache_init();
3541        if (ret)
3542                goto domain_error;
3543
3544        ret = iommu_devinfo_cache_init();
3545        if (!ret)
3546                return ret;
3547
3548        kmem_cache_destroy(iommu_domain_cache);
3549domain_error:
3550        iova_cache_put();
3551
3552        return -ENOMEM;
3553}
3554
3555static void __init iommu_exit_mempool(void)
3556{
3557        kmem_cache_destroy(iommu_devinfo_cache);
3558        kmem_cache_destroy(iommu_domain_cache);
3559        iova_cache_put();
3560}
3561
3562static void __init init_no_remapping_devices(void)
3563{
3564        struct dmar_drhd_unit *drhd;
3565        struct device *dev;
3566        int i;
3567
3568        for_each_drhd_unit(drhd) {
3569                if (!drhd->include_all) {
3570                        for_each_active_dev_scope(drhd->devices,
3571                                                  drhd->devices_cnt, i, dev)
3572                                break;
3573                        /* ignore DMAR unit if no devices exist */
3574                        if (i == drhd->devices_cnt)
3575                                drhd->ignored = 1;
3576                }
3577        }
3578
3579        for_each_active_drhd_unit(drhd) {
3580                if (drhd->include_all)
3581                        continue;
3582
3583                for_each_active_dev_scope(drhd->devices,
3584                                          drhd->devices_cnt, i, dev)
3585                        if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3586                                break;
3587                if (i < drhd->devices_cnt)
3588                        continue;
3589
3590                /* This IOMMU has *only* gfx devices. Either bypass it or
3591                   set the gfx_dedicated flag, as appropriate. */
3592                drhd->gfx_dedicated = 1;
3593                if (!dmar_map_gfx)
3594                        drhd->ignored = 1;
3595        }
3596}
3597
3598#ifdef CONFIG_SUSPEND
3599static int init_iommu_hw(void)
3600{
3601        struct dmar_drhd_unit *drhd;
3602        struct intel_iommu *iommu = NULL;
3603
3604        for_each_active_iommu(iommu, drhd)
3605                if (iommu->qi)
3606                        dmar_reenable_qi(iommu);
3607
3608        for_each_iommu(iommu, drhd) {
3609                if (drhd->ignored) {
3610                        /*
3611                         * we always have to disable PMRs or DMA may fail on
3612                         * this device
3613                         */
3614                        if (force_on)
3615                                iommu_disable_protect_mem_regions(iommu);
3616                        continue;
3617                }
3618
3619                iommu_flush_write_buffer(iommu);
3620                iommu_set_root_entry(iommu);
3621                iommu_enable_translation(iommu);
3622                iommu_disable_protect_mem_regions(iommu);
3623        }
3624
3625        return 0;
3626}
3627
3628static void iommu_flush_all(void)
3629{
3630        struct dmar_drhd_unit *drhd;
3631        struct intel_iommu *iommu;
3632
3633        for_each_active_iommu(iommu, drhd) {
3634                iommu->flush.flush_context(iommu, 0, 0, 0,
3635                                           DMA_CCMD_GLOBAL_INVL);
3636                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3637                                         DMA_TLB_GLOBAL_FLUSH);
3638        }
3639}
3640
3641static int iommu_suspend(void)
3642{
3643        struct dmar_drhd_unit *drhd;
3644        struct intel_iommu *iommu = NULL;
3645        unsigned long flag;
3646
3647        for_each_active_iommu(iommu, drhd) {
3648                iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3649                                             GFP_KERNEL);
3650                if (!iommu->iommu_state)
3651                        goto nomem;
3652        }
3653
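            /*
             * Flush all caches, then disable translation and snapshot the
             * fault-event registers so iommu_resume() can restore them
             * verbatim.
             */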
3654        iommu_flush_all();
3655
3656        for_each_active_iommu(iommu, drhd) {
3657                iommu_disable_translation(iommu);
3658
3659                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3660
3661                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3662                        readl(iommu->reg + DMAR_FECTL_REG);
3663                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3664                        readl(iommu->reg + DMAR_FEDATA_REG);
3665                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3666                        readl(iommu->reg + DMAR_FEADDR_REG);
3667                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3668                        readl(iommu->reg + DMAR_FEUADDR_REG);
3669
3670                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3671        }
3672        return 0;
3673
3674nomem:
3675        for_each_active_iommu(iommu, drhd)
3676                kfree(iommu->iommu_state);
3677
3678        return -ENOMEM;
3679}
3680
3681static void iommu_resume(void)
3682{
3683        struct dmar_drhd_unit *drhd;
3684        struct intel_iommu *iommu = NULL;
3685        unsigned long flag;
3686
3687        if (init_iommu_hw()) {
3688                if (force_on)
3689                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3690                else
3691                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3692                return;
3693        }
3694
3695        for_each_active_iommu(iommu, drhd) {
3696
3697                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3698
3699                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3700                        iommu->reg + DMAR_FECTL_REG);
3701                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3702                        iommu->reg + DMAR_FEDATA_REG);
3703                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3704                        iommu->reg + DMAR_FEADDR_REG);
3705                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3706                        iommu->reg + DMAR_FEUADDR_REG);
3707
3708                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3709        }
3710
3711        for_each_active_iommu(iommu, drhd)
3712                kfree(iommu->iommu_state);
3713}
3714
3715static struct syscore_ops iommu_syscore_ops = {
3716        .resume         = iommu_resume,
3717        .suspend        = iommu_suspend,
3718};
3719
3720static void __init init_iommu_pm_ops(void)
3721{
3722        register_syscore_ops(&iommu_syscore_ops);
3723}
3724
3725#else
3726static inline void init_iommu_pm_ops(void) {}
3727#endif  /* CONFIG_SUSPEND */
3728
3729static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3730{
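            /*
             * end_address is the inclusive last byte of the region, so a
             * valid RMRR must be page aligned at both ends and span at
             * least one byte.
             */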
3731        if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3732            !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3733            rmrr->end_address <= rmrr->base_address ||
3734            arch_rmrr_sanity_check(rmrr))
3735                return -EINVAL;
3736
3737        return 0;
3738}
3739
3740int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3741{
3742        struct acpi_dmar_reserved_memory *rmrr;
3743        struct dmar_rmrr_unit *rmrru;
3744
3745        rmrr = (struct acpi_dmar_reserved_memory *)header;
3746        if (rmrr_sanity_check(rmrr)) {
3747                pr_warn(FW_BUG
3748                           "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3749                           "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3750                           rmrr->base_address, rmrr->end_address,
3751                           dmi_get_system_info(DMI_BIOS_VENDOR),
3752                           dmi_get_system_info(DMI_BIOS_VERSION),
3753                           dmi_get_system_info(DMI_PRODUCT_VERSION));
3754                add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3755        }
3756
3757        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3758        if (!rmrru)
3759                goto out;
3760
3761        rmrru->hdr = header;
3762
3763        rmrru->base_address = rmrr->base_address;
3764        rmrru->end_address = rmrr->end_address;
3765
3766        rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3767                                ((void *)rmrr) + rmrr->header.length,
3768                                &rmrru->devices_cnt);
3769        if (rmrru->devices_cnt && rmrru->devices == NULL)
3770                goto free_rmrru;
3771
3772        list_add(&rmrru->list, &dmar_rmrr_units);
3773
3774        return 0;
3775free_rmrru:
3776        kfree(rmrru);
3777out:
3778        return -ENOMEM;
3779}
3780
3781static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3782{
3783        struct dmar_atsr_unit *atsru;
3784        struct acpi_dmar_atsr *tmp;
3785
3786        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3787                                dmar_rcu_check()) {
3788                tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3789                if (atsr->segment != tmp->segment)
3790                        continue;
3791                if (atsr->header.length != tmp->header.length)
3792                        continue;
3793                if (memcmp(atsr, tmp, atsr->header.length) == 0)
3794                        return atsru;
3795        }
3796
3797        return NULL;
3798}
3799
3800int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3801{
3802        struct acpi_dmar_atsr *atsr;
3803        struct dmar_atsr_unit *atsru;
3804
3805        if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3806                return 0;
3807
3808        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3809        atsru = dmar_find_atsr(atsr);
3810        if (atsru)
3811                return 0;
3812
3813        atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3814        if (!atsru)
3815                return -ENOMEM;
3816
3817        /*
3818         * If the memory was allocated from slab by the ACPI _DSM method,
3819         * we need to copy the content because the buffer will be freed on
3820         * return.
3821         */
3822        atsru->hdr = (void *)(atsru + 1);
3823        memcpy(atsru->hdr, hdr, hdr->length);
3824        atsru->include_all = atsr->flags & 0x1;
3825        if (!atsru->include_all) {
3826                atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3827                                (void *)atsr + atsr->header.length,
3828                                &atsru->devices_cnt);
3829                if (atsru->devices_cnt && atsru->devices == NULL) {
3830                        kfree(atsru);
3831                        return -ENOMEM;
3832                }
3833        }
3834
3835        list_add_rcu(&atsru->list, &dmar_atsr_units);
3836
3837        return 0;
3838}
3839
3840static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3841{
3842        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3843        kfree(atsru);
3844}
3845
3846int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3847{
3848        struct acpi_dmar_atsr *atsr;
3849        struct dmar_atsr_unit *atsru;
3850
3851        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3852        atsru = dmar_find_atsr(atsr);
3853        if (atsru) {
3854                list_del_rcu(&atsru->list);
3855                synchronize_rcu();
3856                intel_iommu_free_atsr(atsru);
3857        }
3858
3859        return 0;
3860}
3861
3862int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3863{
3864        int i;
3865        struct device *dev;
3866        struct acpi_dmar_atsr *atsr;
3867        struct dmar_atsr_unit *atsru;
3868
3869        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3870        atsru = dmar_find_atsr(atsr);
3871        if (!atsru)
3872                return 0;
3873
3874        if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3875                for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3876                                          i, dev)
3877                        return -EBUSY;
3878        }
3879
3880        return 0;
3881}
3882
3883static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3884{
3885        struct dmar_satc_unit *satcu;
3886        struct acpi_dmar_satc *tmp;
3887
3888        list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3889                                dmar_rcu_check()) {
3890                tmp = (struct acpi_dmar_satc *)satcu->hdr;
3891                if (satc->segment != tmp->segment)
3892                        continue;
3893                if (satc->header.length != tmp->header.length)
3894                        continue;
3895                if (memcmp(satc, tmp, satc->header.length) == 0)
3896                        return satcu;
3897        }
3898
3899        return NULL;
3900}
3901
3902int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3903{
3904        struct acpi_dmar_satc *satc;
3905        struct dmar_satc_unit *satcu;
3906
3907        if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3908                return 0;
3909
3910        satc = container_of(hdr, struct acpi_dmar_satc, header);
3911        satcu = dmar_find_satc(satc);
3912        if (satcu)
3913                return 0;
3914
3915        satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3916        if (!satcu)
3917                return -ENOMEM;
3918
3919        satcu->hdr = (void *)(satcu + 1);
3920        memcpy(satcu->hdr, hdr, hdr->length);
3921        satcu->atc_required = satc->flags & 0x1;
3922        satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3923                                              (void *)satc + satc->header.length,
3924                                              &satcu->devices_cnt);
3925        if (satcu->devices_cnt && !satcu->devices) {
3926                kfree(satcu);
3927                return -ENOMEM;
3928        }
3929        list_add_rcu(&satcu->list, &dmar_satc_units);
3930
3931        return 0;
3932}
3933
3934static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3935{
3936        int sp, ret;
3937        struct intel_iommu *iommu = dmaru->iommu;
3938
3939        if (g_iommus[iommu->seq_id])
3940                return 0;
3941
3942        ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3943        if (ret)
3944                goto out;
3945
3946        if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3947                pr_warn("%s: Doesn't support hardware pass through.\n",
3948                        iommu->name);
3949                return -ENXIO;
3950        }
3951        if (!ecap_sc_support(iommu->ecap) &&
3952            domain_update_iommu_snooping(iommu)) {
3953                pr_warn("%s: Doesn't support snooping.\n",
3954                        iommu->name);
3955                return -ENXIO;
3956        }
3957        sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3958        if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3959                pr_warn("%s: Doesn't support large page.\n",
3960                        iommu->name);
3961                return -ENXIO;
3962        }
3963
3964        /*
3965         * Disable translation if already enabled prior to OS handover.
3966         */
3967        if (iommu->gcmd & DMA_GCMD_TE)
3968                iommu_disable_translation(iommu);
3969
3970        g_iommus[iommu->seq_id] = iommu;
3971        ret = iommu_init_domains(iommu);
3972        if (ret == 0)
3973                ret = iommu_alloc_root_entry(iommu);
3974        if (ret)
3975                goto out;
3976
3977        intel_svm_check(iommu);
3978
3979        if (dmaru->ignored) {
3980                /*
3981                 * we always have to disable PMRs or DMA may fail on this device
3982                 */
3983                if (force_on)
3984                        iommu_disable_protect_mem_regions(iommu);
3985                return 0;
3986        }
3987
3988        intel_iommu_init_qi(iommu);
3989        iommu_flush_write_buffer(iommu);
3990
3991#ifdef CONFIG_INTEL_IOMMU_SVM
3992        if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3993                ret = intel_svm_enable_prq(iommu);
3994                if (ret)
3995                        goto disable_iommu;
3996        }
3997#endif
3998        ret = dmar_set_interrupt(iommu);
3999        if (ret)
4000                goto disable_iommu;
4001
4002        iommu_set_root_entry(iommu);
4003        iommu_enable_translation(iommu);
4004
4005        iommu_disable_protect_mem_regions(iommu);
4006        return 0;
4007
4008disable_iommu:
4009        disable_dmar_iommu(iommu);
4010out:
4011        free_dmar_iommu(iommu);
4012        return ret;
4013}
4014
4015int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4016{
4017        int ret = 0;
4018        struct intel_iommu *iommu = dmaru->iommu;
4019
4020        if (!intel_iommu_enabled)
4021                return 0;
4022        if (iommu == NULL)
4023                return -EINVAL;
4024
4025        if (insert) {
4026                ret = intel_iommu_add(dmaru);
4027        } else {
4028                disable_dmar_iommu(iommu);
4029                free_dmar_iommu(iommu);
4030        }
4031
4032        return ret;
4033}
4034
4035static void intel_iommu_free_dmars(void)
4036{
4037        struct dmar_rmrr_unit *rmrru, *rmrr_n;
4038        struct dmar_atsr_unit *atsru, *atsr_n;
4039        struct dmar_satc_unit *satcu, *satc_n;
4040
4041        list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4042                list_del(&rmrru->list);
4043                dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4044                kfree(rmrru);
4045        }
4046
4047        list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4048                list_del(&atsru->list);
4049                intel_iommu_free_atsr(atsru);
4050        }
4051        list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
4052                list_del(&satcu->list);
4053                dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
4054                kfree(satcu);
4055        }
4056}
4057
4058int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4059{
4060        int i, ret = 1;
4061        struct pci_bus *bus;
4062        struct pci_dev *bridge = NULL;
4063        struct device *tmp;
4064        struct acpi_dmar_atsr *atsr;
4065        struct dmar_atsr_unit *atsru;
4066
4067        dev = pci_physfn(dev);
4068        for (bus = dev->bus; bus; bus = bus->parent) {
4069                bridge = bus->self;
4070                /* If it's an integrated device, allow ATS */
4071                if (!bridge)
4072                        return 1;
4073                /* Connected via non-PCIe: no ATS */
4074                if (!pci_is_pcie(bridge) ||
4075                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4076                        return 0;
4077                /* If we found the root port, look it up in the ATSR */
4078                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4079                        break;
4080        }
4081
4082        rcu_read_lock();
4083        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4084                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4085                if (atsr->segment != pci_domain_nr(dev->bus))
4086                        continue;
4087
4088                for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4089                        if (tmp == &bridge->dev)
4090                                goto out;
4091
4092                if (atsru->include_all)
4093                        goto out;
4094        }
4095        ret = 0;
4096out:
4097        rcu_read_unlock();
4098
4099        return ret;
4100}
4101
4102int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4103{
4104        int ret;
4105        struct dmar_rmrr_unit *rmrru;
4106        struct dmar_atsr_unit *atsru;
4107        struct dmar_satc_unit *satcu;
4108        struct acpi_dmar_atsr *atsr;
4109        struct acpi_dmar_reserved_memory *rmrr;
4110        struct acpi_dmar_satc *satc;
4111
4112        if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4113                return 0;
4114
4115        list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4116                rmrr = container_of(rmrru->hdr,
4117                                    struct acpi_dmar_reserved_memory, header);
4118                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4119                        ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4120                                ((void *)rmrr) + rmrr->header.length,
4121                                rmrr->segment, rmrru->devices,
4122                                rmrru->devices_cnt);
4123                        if (ret < 0)
4124                                return ret;
4125                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4126                        dmar_remove_dev_scope(info, rmrr->segment,
4127                                rmrru->devices, rmrru->devices_cnt);
4128                }
4129        }
4130
4131        list_for_each_entry(atsru, &dmar_atsr_units, list) {
4132                if (atsru->include_all)
4133                        continue;
4134
4135                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4136                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4137                        ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4138                                        (void *)atsr + atsr->header.length,
4139                                        atsr->segment, atsru->devices,
4140                                        atsru->devices_cnt);
4141                        if (ret > 0)
4142                                break;
4143                        else if (ret < 0)
4144                                return ret;
4145                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4146                        if (dmar_remove_dev_scope(info, atsr->segment,
4147                                        atsru->devices, atsru->devices_cnt))
4148                                break;
4149                }
4150        }
4151        list_for_each_entry(satcu, &dmar_satc_units, list) {
4152                satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4153                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4154                        ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4155                                        (void *)satc + satc->header.length,
4156                                        satc->segment, satcu->devices,
4157                                        satcu->devices_cnt);
4158                        if (ret > 0)
4159                                break;
4160                        else if (ret < 0)
4161                                return ret;
4162                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4163                        if (dmar_remove_dev_scope(info, satc->segment,
4164                                        satcu->devices, satcu->devices_cnt))
4165                                break;
4166                }
4167        }
4168
4169        return 0;
4170}
4171
4172static int intel_iommu_memory_notifier(struct notifier_block *nb,
4173                                       unsigned long val, void *v)
4174{
4175        struct memory_notify *mhp = v;
4176        unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4177        unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4178                        mhp->nr_pages - 1);
4179
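            /*
             * Keep the si_domain identity map in sync with memory hotplug:
             * map the new range when it goes online, unmap and flush it
             * when it goes offline.
             */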
4180        switch (val) {
4181        case MEM_GOING_ONLINE:
4182                if (iommu_domain_identity_map(si_domain,
4183                                              start_vpfn, last_vpfn)) {
4184                        pr_warn("Failed to build identity map for [%lx-%lx]\n",
4185                                start_vpfn, last_vpfn);
4186                        return NOTIFY_BAD;
4187                }
4188                break;
4189
4190        case MEM_OFFLINE:
4191        case MEM_CANCEL_ONLINE:
4192                {
4193                        struct dmar_drhd_unit *drhd;
4194                        struct intel_iommu *iommu;
4195                        struct page *freelist;
4196
4197                        freelist = domain_unmap(si_domain,
4198                                                start_vpfn, last_vpfn,
4199                                                NULL);
4200
4201                        rcu_read_lock();
4202                        for_each_active_iommu(iommu, drhd)
4203                                iommu_flush_iotlb_psi(iommu, si_domain,
4204                                        start_vpfn, mhp->nr_pages,
4205                                        !freelist, 0);
4206                        rcu_read_unlock();
4207                        dma_free_pagelist(freelist);
4208                }
4209                break;
4210        }
4211
4212        return NOTIFY_OK;
4213}
4214
4215static struct notifier_block intel_iommu_memory_nb = {
4216        .notifier_call = intel_iommu_memory_notifier,
4217        .priority = 0
4218};
4219
4220static void intel_disable_iommus(void)
4221{
4222        struct intel_iommu *iommu = NULL;
4223        struct dmar_drhd_unit *drhd;
4224
4225        for_each_iommu(iommu, drhd)
4226                iommu_disable_translation(iommu);
4227}
4228
4229void intel_iommu_shutdown(void)
4230{
4231        struct dmar_drhd_unit *drhd;
4232        struct intel_iommu *iommu = NULL;
4233
4234        if (no_iommu || dmar_disabled)
4235                return;
4236
4237        down_write(&dmar_global_lock);
4238
4239        /* Disable PMRs explicitly here. */
4240        for_each_iommu(iommu, drhd)
4241                iommu_disable_protect_mem_regions(iommu);
4242
4243        /* Make sure the IOMMUs are switched off */
4244        intel_disable_iommus();
4245
4246        up_write(&dmar_global_lock);
4247}
4248
4249static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4250{
4251        struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4252
4253        return container_of(iommu_dev, struct intel_iommu, iommu);
4254}
4255
4256static ssize_t version_show(struct device *dev,
4257                            struct device_attribute *attr, char *buf)
4258{
4259        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4260        u32 ver = readl(iommu->reg + DMAR_VER_REG);
4261        return sprintf(buf, "%d:%d\n",
4262                       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4263}
4264static DEVICE_ATTR_RO(version);
4265
4266static ssize_t address_show(struct device *dev,
4267                            struct device_attribute *attr, char *buf)
4268{
4269        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4270        return sprintf(buf, "%llx\n", iommu->reg_phys);
4271}
4272static DEVICE_ATTR_RO(address);
4273
4274static ssize_t cap_show(struct device *dev,
4275                        struct device_attribute *attr, char *buf)
4276{
4277        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4278        return sprintf(buf, "%llx\n", iommu->cap);
4279}
4280static DEVICE_ATTR_RO(cap);
4281
4282static ssize_t ecap_show(struct device *dev,
4283                         struct device_attribute *attr, char *buf)
4284{
4285        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4286        return sprintf(buf, "%llx\n", iommu->ecap);
4287}
4288static DEVICE_ATTR_RO(ecap);
4289
4290static ssize_t domains_supported_show(struct device *dev,
4291                                      struct device_attribute *attr, char *buf)
4292{
4293        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4294        return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4295}
4296static DEVICE_ATTR_RO(domains_supported);
4297
4298static ssize_t domains_used_show(struct device *dev,
4299                                 struct device_attribute *attr, char *buf)
4300{
4301        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4302        return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4303                                                  cap_ndoms(iommu->cap)));
4304}
4305static DEVICE_ATTR_RO(domains_used);
4306
4307static struct attribute *intel_iommu_attrs[] = {
4308        &dev_attr_version.attr,
4309        &dev_attr_address.attr,
4310        &dev_attr_cap.attr,
4311        &dev_attr_ecap.attr,
4312        &dev_attr_domains_supported.attr,
4313        &dev_attr_domains_used.attr,
4314        NULL,
4315};
4316
4317static struct attribute_group intel_iommu_group = {
4318        .name = "intel-iommu",
4319        .attrs = intel_iommu_attrs,
4320};
4321
4322const struct attribute_group *intel_iommu_groups[] = {
4323        &intel_iommu_group,
4324        NULL,
4325};
4326
4327static inline bool has_external_pci(void)
4328{
4329        struct pci_dev *pdev = NULL;
4330
4331        for_each_pci_dev(pdev)
4332                if (pdev->external_facing)
4333                        return true;
4334
4335        return false;
4336}
4337
4338static int __init platform_optin_force_iommu(void)
4339{
4340        if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4341                return 0;
4342
4343        if (no_iommu || dmar_disabled)
4344                pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4345
4346        /*
4347         * If Intel-IOMMU is disabled by default, we will apply identity
4348         * If Intel-IOMMU is disabled by default, we will apply the identity
4349         * map to all devices except those marked as untrusted.
4350        if (dmar_disabled)
4351                iommu_set_default_passthrough(false);
4352
4353        dmar_disabled = 0;
4354        no_iommu = 0;
4355
4356        return 1;
4357}
4358
4359static int __init probe_acpi_namespace_devices(void)
4360{
4361        struct dmar_drhd_unit *drhd;
4362        /* To avoid a -Wunused-but-set-variable warning. */
4363        struct intel_iommu *iommu __maybe_unused;
4364        struct device *dev;
4365        int i, ret = 0;
4366
4367        for_each_active_iommu(iommu, drhd) {
4368                for_each_active_dev_scope(drhd->devices,
4369                                          drhd->devices_cnt, i, dev) {
4370                        struct acpi_device_physical_node *pn;
4371                        struct iommu_group *group;
4372                        struct acpi_device *adev;
4373
4374                        if (dev->bus != &acpi_bus_type)
4375                                continue;
4376
4377                        adev = to_acpi_device(dev);
4378                        mutex_lock(&adev->physical_node_lock);
4379                        list_for_each_entry(pn,
4380                                            &adev->physical_node_list, node) {
4381                                group = iommu_group_get(pn->dev);
4382                                if (group) {
4383                                        iommu_group_put(group);
4384                                        continue;
4385                                }
4386
4387                                pn->dev->bus->iommu_ops = &intel_iommu_ops;
4388                                ret = iommu_probe_device(pn->dev);
4389                                if (ret)
4390                                        break;
4391                        }
4392                        mutex_unlock(&adev->physical_node_lock);
4393
4394                        if (ret)
4395                                return ret;
4396                }
4397        }
4398
4399        return 0;
4400}
4401
4402int __init intel_iommu_init(void)
4403{
4404        int ret = -ENODEV;
4405        struct dmar_drhd_unit *drhd;
4406        struct intel_iommu *iommu;
4407
4408        /*
4409         * Intel IOMMU is required for a TXT/tboot launch or platform
4410         * opt in, so enforce that.
4411         */
4412        force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4413                    platform_optin_force_iommu();
4414
4415        if (iommu_init_mempool()) {
4416                if (force_on)
4417                        panic("tboot: Failed to initialize iommu memory\n");
4418                return -ENOMEM;
4419        }
4420
4421        down_write(&dmar_global_lock);
4422        if (dmar_table_init()) {
4423                if (force_on)
4424                        panic("tboot: Failed to initialize DMAR table\n");
4425                goto out_free_dmar;
4426        }
4427
4428        if (dmar_dev_scope_init() < 0) {
4429                if (force_on)
4430                        panic("tboot: Failed to initialize DMAR device scope\n");
4431                goto out_free_dmar;
4432        }
4433
4434        up_write(&dmar_global_lock);
4435
4436        /*
4437         * The bus notifier takes the dmar_global_lock, so lockdep will
4438         * complain later when we register it under the lock.
4439         */
4440        dmar_register_bus_notifier();
4441
4442        down_write(&dmar_global_lock);
4443
4444        if (!no_iommu)
4445                intel_iommu_debugfs_init();
4446
4447        if (no_iommu || dmar_disabled) {
4448                /*
4449                 * We exit the function here to ensure the IOMMU's remapping and
4450                 * mempool aren't set up, which means that the IOMMU's PMRs
4451                 * won't be disabled via the call to init_dmars(). So disable
4452                 * them explicitly here. The PMRs were set up by tboot prior to
4453                 * calling SENTER, but the kernel is expected to reset/tear
4454                 * them down.
4455                 */
4456                if (intel_iommu_tboot_noforce) {
4457                        for_each_iommu(iommu, drhd)
4458                                iommu_disable_protect_mem_regions(iommu);
4459                }
4460
4461                /*
4462                 * Make sure the IOMMUs are switched off, even when we
4463                 * boot into a kexec kernel and the previous kernel left
4464                 * them enabled
4465                 */
4466                intel_disable_iommus();
4467                goto out_free_dmar;
4468        }
4469
4470        if (list_empty(&dmar_rmrr_units))
4471                pr_info("No RMRR found\n");
4472
4473        if (list_empty(&dmar_atsr_units))
4474                pr_info("No ATSR found\n");
4475
4476        if (list_empty(&dmar_satc_units))
4477                pr_info("No SATC found\n");
4478
4479        if (dmar_map_gfx)
4480                intel_iommu_gfx_mapped = 1;
4481
4482        init_no_remapping_devices();
4483
4484        ret = init_dmars();
4485        if (ret) {
4486                if (force_on)
4487                        panic("tboot: Failed to initialize DMARs\n");
4488                pr_err("Initialization failed\n");
4489                goto out_free_dmar;
4490        }
4491        up_write(&dmar_global_lock);
4492
4493        init_iommu_pm_ops();
4494
4495        down_read(&dmar_global_lock);
4496        for_each_active_iommu(iommu, drhd) {
4497                /*
4498                 * The flush queue implementation does not perform
4499                 * page-selective invalidations that are required for efficient
4500                 * TLB flushes in virtual environments.  The benefit of batching
4501                 * is likely to be much lower than the overhead of synchronizing
4502                 * the virtual and physical IOMMU page-tables.
4503                 */
4504                if (cap_caching_mode(iommu->cap)) {
4505                        pr_info_once("IOMMU batching disallowed due to virtualization\n");
4506                        iommu_set_dma_strict();
4507                }
4508                iommu_device_sysfs_add(&iommu->iommu, NULL,
4509                                       intel_iommu_groups,
4510                                       "%s", iommu->name);
4511                iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4512        }
4513        up_read(&dmar_global_lock);
4514
4515        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4516        if (si_domain && !hw_pass_through)
4517                register_memory_notifier(&intel_iommu_memory_nb);
4518
4519        down_read(&dmar_global_lock);
4520        if (probe_acpi_namespace_devices())
4521                pr_warn("ACPI name space devices didn't probe correctly\n");
4522
4523        /* Finally, we enable the DMA remapping hardware. */
4524        for_each_iommu(iommu, drhd) {
4525                if (!drhd->ignored && !translation_pre_enabled(iommu))
4526                        iommu_enable_translation(iommu);
4527
4528                iommu_disable_protect_mem_regions(iommu);
4529        }
4530        up_read(&dmar_global_lock);
4531
4532        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4533
4534        intel_iommu_enabled = 1;
4535
4536        return 0;
4537
4538out_free_dmar:
4539        intel_iommu_free_dmars();
4540        up_write(&dmar_global_lock);
4541        iommu_exit_mempool();
4542        return ret;
4543}
4544
4545static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4546{
4547        struct device_domain_info *info = opaque;
4548
4549        domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4550        return 0;
4551}
4552
4553/*
4554 * NB - intel-iommu lacks any sort of reference counting for the users of
4555 * dependent devices.  If multiple endpoints have intersecting dependent
4556 * devices, unbinding the driver from any one of them will possibly leave
4557 * the others unable to operate.
4558 */
4559static void domain_context_clear(struct device_domain_info *info)
4560{
4561        if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4562                return;
4563
4564        pci_for_each_dma_alias(to_pci_dev(info->dev),
4565                               &domain_context_clear_one_cb, info);
4566}
4567
4568static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4569{
4570        struct dmar_domain *domain;
4571        struct intel_iommu *iommu;
4572        unsigned long flags;
4573
4574        assert_spin_locked(&device_domain_lock);
4575
4576        if (WARN_ON(!info))
4577                return;
4578
4579        iommu = info->iommu;
4580        domain = info->domain;
4581
4582        if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4583                if (dev_is_pci(info->dev) && sm_supported(iommu))
4584                        intel_pasid_tear_down_entry(iommu, info->dev,
4585                                        PASID_RID2PASID, false);
4586
4587                iommu_disable_dev_iotlb(info);
4588                domain_context_clear(info);
4589                intel_pasid_free_table(info->dev);
4590        }
4591
4592        unlink_domain_info(info);
4593
4594        spin_lock_irqsave(&iommu->lock, flags);
4595        domain_detach_iommu(domain, iommu);
4596        spin_unlock_irqrestore(&iommu->lock, flags);
4597
4598        free_devinfo_mem(info);
4599}
4600
4601static void dmar_remove_one_dev_info(struct device *dev)
4602{
4603        struct device_domain_info *info;
4604        unsigned long flags;
4605
4606        spin_lock_irqsave(&device_domain_lock, flags);
4607        info = get_domain_info(dev);
4608        if (info)
4609                __dmar_remove_one_dev_info(info);
4610        spin_unlock_irqrestore(&device_domain_lock, flags);
4611}
4612
4613static int md_domain_init(struct dmar_domain *domain, int guest_width)
4614{
4615        int adjust_width;
4616
4617        /* calculate AGAW */
4618        domain->gaw = guest_width;
4619        adjust_width = guestwidth_to_adjustwidth(guest_width);
4620        domain->agaw = width_to_agaw(adjust_width);
4621
4622        domain->iommu_coherency = false;
4623        domain->iommu_snooping = false;
4624        domain->iommu_superpage = 0;
4625        domain->max_addr = 0;
4626
4627        /* always allocate the top pgd */
4628        domain->pgd = alloc_pgtable_page(domain->nid);
4629        if (!domain->pgd)
4630                return -ENOMEM;
4631        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4632        return 0;
4633}
4634
4635static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4636{
4637        struct dmar_domain *dmar_domain;
4638        struct iommu_domain *domain;
4639
4640        switch (type) {
4641        case IOMMU_DOMAIN_DMA:
4642        case IOMMU_DOMAIN_DMA_FQ:
4643        case IOMMU_DOMAIN_UNMANAGED:
4644                dmar_domain = alloc_domain(type);
4645                if (!dmar_domain) {
4646                        pr_err("Can't allocate dmar_domain\n");
4647                        return NULL;
4648                }
4649                if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4650                        pr_err("Domain initialization failed\n");
4651                        domain_exit(dmar_domain);
4652                        return NULL;
4653                }
4654
4655                domain = &dmar_domain->domain;
4656                domain->geometry.aperture_start = 0;
4657                domain->geometry.aperture_end   =
4658                                __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4659                domain->geometry.force_aperture = true;
4660
4661                return domain;
4662        case IOMMU_DOMAIN_IDENTITY:
4663                return &si_domain->domain;
4664        default:
4665                return NULL;
4666        }
4667
4668        return NULL;
4669}
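
/*
 * Illustrative sketch (not part of this driver): an UNMANAGED domain is
 * normally obtained through the generic IOMMU API, which ends up in
 * intel_iommu_domain_alloc() above. Error handling is trimmed, and "dev"
 * and "pg" are assumed to be a device behind this IOMMU and a page the
 * caller owns.
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	if (!dom)
 *		return -ENOMEM;
 *	if (!iommu_attach_device(dom, dev)) {
 *		iommu_map(dom, 0x100000, page_to_phys(pg), SZ_4K,
 *			  IOMMU_READ | IOMMU_WRITE);
 *		...
 *		iommu_detach_device(dom, dev);
 *	}
 *	iommu_domain_free(dom);
 */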
4670
4671static void intel_iommu_domain_free(struct iommu_domain *domain)
4672{
4673        if (domain != &si_domain->domain)
4674                domain_exit(to_dmar_domain(domain));
4675}
4676
4677/*
4678 * Check whether a @domain could be attached to the @dev through the
4679 * aux-domain attach/detach APIs.
4680 */
4681static inline bool
4682is_aux_domain(struct device *dev, struct iommu_domain *domain)
4683{
4684        struct device_domain_info *info = get_domain_info(dev);
4685
4686        return info && info->auxd_enabled &&
4687                        domain->type == IOMMU_DOMAIN_UNMANAGED;
4688}
4689
4690static inline struct subdev_domain_info *
4691lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4692{
4693        struct subdev_domain_info *sinfo;
4694
4695        if (!list_empty(&domain->subdevices)) {
4696                list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4697                        if (sinfo->pdev == dev)
4698                                return sinfo;
4699                }
4700        }
4701
4702        return NULL;
4703}
4704
4705static int auxiliary_link_device(struct dmar_domain *domain,
4706                                 struct device *dev)
4707{
4708        struct device_domain_info *info = get_domain_info(dev);
4709        struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4710
4711        assert_spin_locked(&device_domain_lock);
4712        if (WARN_ON(!info))
4713                return -EINVAL;
4714
4715        if (!sinfo) {
4716                sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4717                if (!sinfo)
4718                        return -ENOMEM;
4719                sinfo->domain = domain;
4720                sinfo->pdev = dev;
4721                list_add(&sinfo->link_phys, &info->subdevices);
4722                list_add(&sinfo->link_domain, &domain->subdevices);
4723        }
4724
4725        return ++sinfo->users;
4726}
4727
4728static int auxiliary_unlink_device(struct dmar_domain *domain,
4729                                   struct device *dev)
4730{
4731        struct device_domain_info *info = get_domain_info(dev);
4732        struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4733        int ret;
4734
4735        assert_spin_locked(&device_domain_lock);
4736        if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4737                return -EINVAL;
4738
4739        ret = --sinfo->users;
4740        if (!ret) {
4741                list_del(&sinfo->link_phys);
4742                list_del(&sinfo->link_domain);
4743                kfree(sinfo);
4744        }
4745
4746        return ret;
4747}
4748
4749static int aux_domain_add_dev(struct dmar_domain *domain,
4750                              struct device *dev)
4751{
4752        int ret;
4753        unsigned long flags;
4754        struct intel_iommu *iommu;
4755
4756        iommu = device_to_iommu(dev, NULL, NULL);
4757        if (!iommu)
4758                return -ENODEV;
4759
4760        if (domain->default_pasid <= 0) {
4761                u32 pasid;
4762
4763                /* No private data needed for the default pasid */
4764                pasid = ioasid_alloc(NULL, PASID_MIN,
4765                                     pci_max_pasids(to_pci_dev(dev)) - 1,
4766                                     NULL);
4767                if (pasid == INVALID_IOASID) {
4768                        pr_err("Can't allocate default pasid\n");
4769                        return -ENODEV;
4770                }
4771                domain->default_pasid = pasid;
4772        }
4773
4774        spin_lock_irqsave(&device_domain_lock, flags);
4775        ret = auxiliary_link_device(domain, dev);
4776        if (ret <= 0)
4777                goto link_failed;
4778
4779        /*
4780         * Subdevices from the same physical device can be attached to the
4781         * same domain. For such cases, only the first subdevice attachment
4782         * needs to go through the full steps in this function. So if ret >
4783         * 1, just goto out.
4784         */
4785        if (ret > 1)
4786                goto out;
4787
4788        /*
4789         * iommu->lock must be held to attach domain to iommu and setup the
4790         * pasid entry for second level translation.
4791         */
4792        spin_lock(&iommu->lock);
4793        ret = domain_attach_iommu(domain, iommu);
4794        if (ret)
4795                goto attach_failed;
4796
4797        /* Set up the PASID entry for mediated devices */
4798        if (domain_use_first_level(domain))
4799                ret = domain_setup_first_level(iommu, domain, dev,
4800                                               domain->default_pasid);
4801        else
4802                ret = intel_pasid_setup_second_level(iommu, domain, dev,
4803                                                     domain->default_pasid);
4804        if (ret)
4805                goto table_failed;
4806
4807        spin_unlock(&iommu->lock);
4808out:
4809        spin_unlock_irqrestore(&device_domain_lock, flags);
4810
4811        return 0;
4812
4813table_failed:
4814        domain_detach_iommu(domain, iommu);
4815attach_failed:
4816        spin_unlock(&iommu->lock);
4817        auxiliary_unlink_device(domain, dev);
4818link_failed:
4819        spin_unlock_irqrestore(&device_domain_lock, flags);
4820        if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4821                ioasid_put(domain->default_pasid);
4822
4823        return ret;
4824}
4825
4826static void aux_domain_remove_dev(struct dmar_domain *domain,
4827                                  struct device *dev)
4828{
4829        struct device_domain_info *info;
4830        struct intel_iommu *iommu;
4831        unsigned long flags;
4832
4833        if (!is_aux_domain(dev, &domain->domain))
4834                return;
4835
4836        spin_lock_irqsave(&device_domain_lock, flags);
4837        info = get_domain_info(dev);
4838        iommu = info->iommu;
4839
4840        if (!auxiliary_unlink_device(domain, dev)) {
4841                spin_lock(&iommu->lock);
4842                intel_pasid_tear_down_entry(iommu, dev,
4843                                            domain->default_pasid, false);
4844                domain_detach_iommu(domain, iommu);
4845                spin_unlock(&iommu->lock);
4846        }
4847
4848        spin_unlock_irqrestore(&device_domain_lock, flags);
4849
4850        if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4851                ioasid_put(domain->default_pasid);
4852}
4853
4854static int prepare_domain_attach_device(struct iommu_domain *domain,
4855                                        struct device *dev)
4856{
4857        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4858        struct intel_iommu *iommu;
4859        int addr_width;
4860
4861        iommu = device_to_iommu(dev, NULL, NULL);
4862        if (!iommu)
4863                return -ENODEV;
4864
4865        if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4866            !ecap_nest(iommu->ecap)) {
4867                dev_err(dev, "%s: iommu does not support nested translation\n",
4868                        iommu->name);
4869                return -EINVAL;
4870        }
4871
4872        /* check if this iommu agaw is sufficient for max mapped address */
4873        addr_width = agaw_to_width(iommu->agaw);
4874        if (addr_width > cap_mgaw(iommu->cap))
4875                addr_width = cap_mgaw(iommu->cap);
4876
4877        if (dmar_domain->max_addr > (1LL << addr_width)) {
4878                dev_err(dev, "%s: iommu width (%d) is not "
4879                        "sufficient for the mapped address (%llx)\n",
4880                        __func__, addr_width, dmar_domain->max_addr);
4881                return -EFAULT;
4882        }
4883        dmar_domain->gaw = addr_width;
4884
4885        /*
4886         * Knock out extra levels of page tables if necessary
4887         */
4888        while (iommu->agaw < dmar_domain->agaw) {
4889                struct dma_pte *pte;
4890
4891                pte = dmar_domain->pgd;
4892                if (dma_pte_present(pte)) {
4893                        dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4894                        free_pgtable_page(pte);
4895                }
4896                dmar_domain->agaw--;
4897        }
4898
4899        return 0;
4900}
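
/*
 * Worked example for the level-stripping loop above (values assumed):
 * a domain built for a 57-bit address width uses a 5-level page table
 * (agaw 3). If it is attached through an IOMMU that only supports a
 * 48-bit, 4-level table (agaw 2), the loop runs once: the table pointed
 * to by entry 0 of the old top level becomes the new pgd and the
 * now-unused top-level page is freed, leaving a table this IOMMU can
 * actually walk.
 */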
4901
4902static int intel_iommu_attach_device(struct iommu_domain *domain,
4903                                     struct device *dev)
4904{
4905        int ret;
4906
4907        if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4908            device_is_rmrr_locked(dev)) {
4909                dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4910                return -EPERM;
4911        }
4912
4913        if (is_aux_domain(dev, domain))
4914                return -EPERM;
4915
4916        /* normally dev is not mapped */
4917        if (unlikely(domain_context_mapped(dev))) {
4918                struct dmar_domain *old_domain;
4919
4920                old_domain = find_domain(dev);
4921                if (old_domain)
4922                        dmar_remove_one_dev_info(dev);
4923        }
4924
4925        ret = prepare_domain_attach_device(domain, dev);
4926        if (ret)
4927                return ret;
4928
4929        return domain_add_dev_info(to_dmar_domain(domain), dev);
4930}
4931
4932static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4933                                         struct device *dev)
4934{
4935        int ret;
4936
4937        if (!is_aux_domain(dev, domain))
4938                return -EPERM;
4939
4940        ret = prepare_domain_attach_device(domain, dev);
4941        if (ret)
4942                return ret;
4943
4944        return aux_domain_add_dev(to_dmar_domain(domain), dev);
4945}
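
/*
 * Illustrative sketch (not part of this driver): a parent driver for
 * mediated devices reaches the aux-domain path above through the
 * generic API. "dev" is an assumed PASID-capable PCI device and error
 * handling is trimmed.
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *	int pasid;
 *
 *	if (!iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX) &&
 *	    !iommu_aux_attach_device(dom, dev)) {
 *		pasid = iommu_aux_get_pasid(dom, dev);
 *		... program "pasid" into the mdev's DMA contexts ...
 *		iommu_aux_detach_device(dom, dev);
 *	}
 */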
4946
4947static void intel_iommu_detach_device(struct iommu_domain *domain,
4948                                      struct device *dev)
4949{
4950        dmar_remove_one_dev_info(dev);
4951}
4952
4953static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4954                                          struct device *dev)
4955{
4956        aux_domain_remove_dev(to_dmar_domain(domain), dev);
4957}
4958
4959#ifdef CONFIG_INTEL_IOMMU_SVM
4960/*
4961 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4962 * VT-d granularity. Invalidation is typically included in the unmap operation
4963 * as a result of DMA or VFIO unmap. However, for assigned devices the guest
4964 * owns the first-level page tables. Invalidations of translation caches in
4965 * the guest are trapped and passed down to the host.
4966 *
4967 * vIOMMU in the guest will only expose first-level page tables, therefore
4968 * we do not support IOTLB granularity for requests without PASID (second level).
4969 *
4970 * For example, to find the VT-d granularity encoding for IOTLB
4971 * type and page selective granularity within PASID:
4972 * X: indexed by iommu cache type
4973 * Y: indexed by enum iommu_inv_granularity
4974 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4975 */
4976
4977static const int
4978inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4979        /*
4980         * PASID based IOTLB invalidation: PASID selective (per PASID),
4981         * page selective (address granularity)
4982         */
4983        {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4984        /* PASID based dev TLBs */
4985        {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4986        /* PASID cache */
4987        {-EINVAL, -EINVAL, -EINVAL}
4988};
4989
4990static inline int to_vtd_granularity(int type, int granu)
4991{
4992        return inv_type_granu_table[type][granu];
4993}
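
/*
 * Example, following directly from the table above: a guest passes down
 * an IOTLB invalidation that is address-selective within a PASID, i.e.
 * cache type IOMMU_CACHE_INV_TYPE_IOTLB with granularity
 * IOMMU_INV_GRANU_ADDR:
 *
 *	granu = to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB,
 *				   IOMMU_INV_GRANU_ADDR);
 *
 * yields QI_GRAN_PSI_PASID (page-selective-within-PASID), while every
 * granularity in the PASID-cache row maps to -EINVAL and is rejected.
 */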
4994
4995static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4996{
4997        u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4998
4999        /* VT-d encodes the size as 2^size 4K pages: 0 for 4K, 9 for 2MB, etc.
5000         * The IOMMU cache invalidate API passes granu_size in bytes and the
5001         * number of contiguous granules of that size.
5002         */
5003        return order_base_2(nr_pages);
5004}
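
/*
 * Worked example: for an invalidation of a single 2MB granule the guest
 * passes granu_size == SZ_2M and nr_granules == 1, so
 *
 *	nr_pages = (SZ_2M * 1) >> VTD_PAGE_SHIFT = 512
 *	to_vtd_size() = order_base_2(512) = 9
 *
 * which matches the VT-d address-mask encoding for a 2MB range.
 */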
5005
5006static int
5007intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5008                           struct iommu_cache_invalidate_info *inv_info)
5009{
5010        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5011        struct device_domain_info *info;
5012        struct intel_iommu *iommu;
5013        unsigned long flags;
5014        int cache_type;
5015        u8 bus, devfn;
5016        u16 did, sid;
5017        int ret = 0;
5018        u64 size = 0;
5019
5020        if (!inv_info || !dmar_domain)
5021                return -EINVAL;
5022
5023        if (!dev || !dev_is_pci(dev))
5024                return -ENODEV;
5025
5026        iommu = device_to_iommu(dev, &bus, &devfn);
5027        if (!iommu)
5028                return -ENODEV;
5029
5030        if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5031                return -EINVAL;
5032
5033        spin_lock_irqsave(&device_domain_lock, flags);
5034        spin_lock(&iommu->lock);
5035        info = get_domain_info(dev);
5036        if (!info) {
5037                ret = -EINVAL;
5038                goto out_unlock;
5039        }
5040        did = dmar_domain->iommu_did[iommu->seq_id];
5041        sid = PCI_DEVID(bus, devfn);
5042
5043        /* Size is only valid in address selective invalidation */
5044        if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5045                size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5046                                   inv_info->granu.addr_info.nb_granules);
5047
5048        for_each_set_bit(cache_type,
5049                         (unsigned long *)&inv_info->cache,
5050                         IOMMU_CACHE_INV_TYPE_NR) {
5051                int granu = 0;
5052                u64 pasid = 0;
5053                u64 addr = 0;
5054
5055                granu = to_vtd_granularity(cache_type, inv_info->granularity);
5056                if (granu == -EINVAL) {
5057                        pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5058                                           cache_type, inv_info->granularity);
5059                        break;
5060                }
5061
5062                /*
5063                 * PASID is stored in different locations based on the
5064                 * granularity.
5065                 */
5066                if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5067                    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5068                        pasid = inv_info->granu.pasid_info.pasid;
5069                else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5070                         (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5071                        pasid = inv_info->granu.addr_info.pasid;
5072
5073                switch (BIT(cache_type)) {
5074                case IOMMU_CACHE_INV_TYPE_IOTLB:
5075                        /* HW will ignore the low-order bits covered by the address mask */
5076                        if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5077                            size &&
5078                            (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5079                                pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5080                                                   inv_info->granu.addr_info.addr, size);
5081                        }
5082
5083                        /*
5084                         * If granu is PASID-selective, address is ignored.
5085                         * We use npages = -1 to indicate that.
5086                         */
5087                        qi_flush_piotlb(iommu, did, pasid,
5088                                        mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5089                                        (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5090                                        inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5091
5092                        if (!info->ats_enabled)
5093                                break;
5094                        /*
5095                         * Always flush device IOTLB if ATS is enabled. vIOMMU
5096                         * in the guest may assume IOTLB flush is inclusive,
5097                         * which is more efficient.
5098                         */
5099                        fallthrough;
5100                case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5101                        /*
5102                         * PASID based device TLB invalidation does not support
5103                         * IOMMU_INV_GRANU_PASID granularity; it only supports
5104                         * IOMMU_INV_GRANU_ADDR. To get an equivalent effect,
5105                         * set the size to cover the entire 64-bit address
5106                         * range. The user provides only PASID info, with no
5107                         * address info, so addr is set to 0.
5108                         */
5109                        if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5110                                size = 64 - VTD_PAGE_SHIFT;
5111                                addr = 0;
5112                        } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5113                                addr = inv_info->granu.addr_info.addr;
5114                        }
5115
5116                        if (info->ats_enabled)
5117                                qi_flush_dev_iotlb_pasid(iommu, sid,
5118                                                info->pfsid, pasid,
5119                                                info->ats_qdep, addr,
5120                                                size);
5121                        else
5122                                pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5123                        break;
5124                default:
5125                        dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5126                                            cache_type);
5127                        ret = -EINVAL;
5128                }
5129        }
5130out_unlock:
5131        spin_unlock(&iommu->lock);
5132        spin_unlock_irqrestore(&device_domain_lock, flags);
5133
5134        return ret;
5135}
5136#endif
5137
5138static int intel_iommu_map(struct iommu_domain *domain,
5139                           unsigned long iova, phys_addr_t hpa,
5140                           size_t size, int iommu_prot, gfp_t gfp)
5141{
5142        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5143        u64 max_addr;
5144        int prot = 0;
5145
5146        if (iommu_prot & IOMMU_READ)
5147                prot |= DMA_PTE_READ;
5148        if (iommu_prot & IOMMU_WRITE)
5149                prot |= DMA_PTE_WRITE;
5150        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5151                prot |= DMA_PTE_SNP;
5152
5153        max_addr = iova + size;
5154        if (dmar_domain->max_addr < max_addr) {
5155                u64 end;
5156
5157                /* check if minimum agaw is sufficient for mapped address */
5158                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5159                if (end < max_addr) {
5160                        pr_err("%s: iommu width (%d) is not "
5161                               "sufficient for the mapped address (%llx)\n",
5162                               __func__, dmar_domain->gaw, max_addr);
5163                        return -EFAULT;
5164                }
5165                dmar_domain->max_addr = max_addr;
5166        }
5167        /* Convert size to a count of VTD pages, rounding up if it and
5168           the low bits of hpa would take us onto an extra page */
5169        size = aligned_nrpages(hpa, size);
5170        return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5171                                hpa >> VTD_PAGE_SHIFT, size, prot);
5172}
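
/*
 * Illustrative sketch (not part of this driver): callers map through the
 * generic API, which reaches intel_iommu_map() via the map_pages op
 * below. "dom" and "paddr" are assumed to come from the caller; note
 * that IOMMU_CACHE only yields DMA_PTE_SNP when the domain supports
 * snooping.
 *
 *	int ret = iommu_map(dom, iova, paddr, SZ_2M,
 *			    IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE);
 */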
5173
5174static int intel_iommu_map_pages(struct iommu_domain *domain,
5175                                 unsigned long iova, phys_addr_t paddr,
5176                                 size_t pgsize, size_t pgcount,
5177                                 int prot, gfp_t gfp, size_t *mapped)
5178{
5179        unsigned long pgshift = __ffs(pgsize);
5180        size_t size = pgcount << pgshift;
5181        int ret;
5182
5183        if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
5184                return -EINVAL;
5185
5186        if (!IS_ALIGNED(iova | paddr, pgsize))
5187                return -EINVAL;
5188
5189        ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
5190        if (!ret && mapped)
5191                *mapped = size;
5192
5193        return ret;
5194}
5195
5196static size_t intel_iommu_unmap(struct iommu_domain *domain,
5197                                unsigned long iova, size_t size,
5198                                struct iommu_iotlb_gather *gather)
5199{
5200        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5201        unsigned long start_pfn, last_pfn;
5202        int level = 0;
5203
5204        /* Cope with horrid API which requires us to unmap more than the
5205           size argument if it happens to be a large-page mapping. */
5206        BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5207
5208        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5209                size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5210
5211        start_pfn = iova >> VTD_PAGE_SHIFT;
5212        last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5213
5214        gather->freelist = domain_unmap(dmar_domain, start_pfn,
5215                                        last_pfn, gather->freelist);
5216
5217        if (dmar_domain->max_addr == iova + size)
5218                dmar_domain->max_addr = iova;
5219
5220        iommu_iotlb_gather_add_page(domain, gather, iova, size);
5221
5222        return size;
5223}
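
/*
 * Worked example of the size rounding above (values assumed): if IOVA
 * 0x200000 was mapped with a single 2MB superpage PTE (level 2), an
 * unmap request for just 4KB inside that range is widened to
 * VTD_PAGE_SIZE << 9 == 2MB, and the returned size tells the caller how
 * much was really unmapped.
 */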
5224
5225static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
5226                                      unsigned long iova,
5227                                      size_t pgsize, size_t pgcount,
5228                                      struct iommu_iotlb_gather *gather)
5229{
5230        unsigned long pgshift = __ffs(pgsize);
5231        size_t size = pgcount << pgshift;
5232
5233        return intel_iommu_unmap(domain, iova, size, gather);
5234}
5235
5236static void intel_iommu_tlb_sync(struct iommu_domain *domain,
5237                                 struct iommu_iotlb_gather *gather)
5238{
5239        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5240        unsigned long iova_pfn = IOVA_PFN(gather->start);
5241        size_t size = gather->end - gather->start;
5242        unsigned long start_pfn;
5243        unsigned long nrpages;
5244        int iommu_id;
5245
5246        nrpages = aligned_nrpages(gather->start, size);
5247        start_pfn = mm_to_dma_pfn(iova_pfn);
5248
5249        for_each_domain_iommu(iommu_id, dmar_domain)
5250                iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5251                                      start_pfn, nrpages, !gather->freelist, 0);
5252
5253        dma_free_pagelist(gather->freelist);
5254}
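
/*
 * Illustrative sketch (not part of this driver): callers that batch
 * unmaps use an iotlb gather so that the flush above runs once at sync
 * time instead of per page. "dom" is an assumed attached domain.
 *
 *	struct iommu_iotlb_gather gather;
 *
 *	iommu_iotlb_gather_init(&gather);
 *	iommu_unmap_fast(dom, iova, SZ_4K, &gather);
 *	iommu_unmap_fast(dom, iova + SZ_4K, SZ_4K, &gather);
 *	iommu_iotlb_sync(dom, &gather);
 */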
5255
5256static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5257                                            dma_addr_t iova)
5258{
5259        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5260        struct dma_pte *pte;
5261        int level = 0;
5262        u64 phys = 0;
5263
5264        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5265        if (pte && dma_pte_present(pte))
5266                phys = dma_pte_addr(pte) +
5267                        (iova & (BIT_MASK(level_to_offset_bits(level) +
5268                                                VTD_PAGE_SHIFT) - 1));
5269
5270        return phys;
5271}
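
/*
 * Worked example (values assumed): looking up IOVA 0x201234 that hits a
 * 2MB superpage PTE (level 2) returns
 *
 *	phys = dma_pte_addr(pte) + (0x201234 & (BIT_MASK(9 + 12) - 1))
 *	     = dma_pte_addr(pte) + 0x1234
 *
 * i.e. the 2MB-aligned physical base plus the offset inside the
 * superpage; for a 4KB PTE (level 1) only the low 12 bits are kept.
 */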
5272
5273static bool intel_iommu_capable(enum iommu_cap cap)
5274{
5275        if (cap == IOMMU_CAP_CACHE_COHERENCY)
5276                return domain_update_iommu_snooping(NULL);
5277        if (cap == IOMMU_CAP_INTR_REMAP)
5278                return irq_remapping_enabled == 1;
5279
5280        return false;
5281}
5282
5283static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5284{
5285        struct intel_iommu *iommu;
5286
5287        iommu = device_to_iommu(dev, NULL, NULL);
5288        if (!iommu)
5289                return ERR_PTR(-ENODEV);
5290
5291        if (translation_pre_enabled(iommu))
5292                dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5293
5294        return &iommu->iommu;
5295}
5296
5297static void intel_iommu_release_device(struct device *dev)
5298{
5299        struct intel_iommu *iommu;
5300
5301        iommu = device_to_iommu(dev, NULL, NULL);
5302        if (!iommu)
5303                return;
5304
5305        dmar_remove_one_dev_info(dev);
5306
5307        set_dma_ops(dev, NULL);
5308}
5309
5310static void intel_iommu_probe_finalize(struct device *dev)
5311{
5312        set_dma_ops(dev, NULL);
5313        iommu_setup_dma_ops(dev, 0, U64_MAX);
5314}
5315
5316static void intel_iommu_get_resv_regions(struct device *device,
5317                                         struct list_head *head)
5318{
5319        int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5320        struct iommu_resv_region *reg;
5321        struct dmar_rmrr_unit *rmrr;
5322        struct device *i_dev;
5323        int i;
5324
5325        down_read(&dmar_global_lock);
5326        for_each_rmrr_units(rmrr) {
5327                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5328                                          i, i_dev) {
5329                        struct iommu_resv_region *resv;
5330                        enum iommu_resv_type type;
5331                        size_t length;
5332
5333                        if (i_dev != device &&
5334                            !is_downstream_to_pci_bridge(device, i_dev))
5335                                continue;
5336
5337                        length = rmrr->end_address - rmrr->base_address + 1;
5338
5339                        type = device_rmrr_is_relaxable(device) ?
5340                                IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5341
5342                        resv = iommu_alloc_resv_region(rmrr->base_address,
5343                                                       length, prot, type);
5344                        if (!resv)
5345                                break;
5346
5347                        list_add_tail(&resv->list, head);
5348                }
5349        }
5350        up_read(&dmar_global_lock);
5351
5352#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5353        if (dev_is_pci(device)) {
5354                struct pci_dev *pdev = to_pci_dev(device);
5355
5356                if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5357                        reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5358                                                   IOMMU_RESV_DIRECT_RELAXABLE);
5359                        if (reg)
5360                                list_add_tail(&reg->list, head);
5361                }
5362        }
5363#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5364
5365        reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5366                                      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5367                                      0, IOMMU_RESV_MSI);
5368        if (!reg)
5369                return;
5370        list_add_tail(&reg->list, head);
5371}
5372
5373int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5374{
5375        struct device_domain_info *info;
5376        struct context_entry *context;
5377        struct dmar_domain *domain;
5378        unsigned long flags;
5379        u64 ctx_lo;
5380        int ret;
5381
5382        domain = find_domain(dev);
5383        if (!domain)
5384                return -EINVAL;
5385
5386        spin_lock_irqsave(&device_domain_lock, flags);
5387        spin_lock(&iommu->lock);
5388
5389        ret = -EINVAL;
5390        info = get_domain_info(dev);
5391        if (!info || !info->pasid_supported)
5392                goto out;
5393
5394        context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5395        if (WARN_ON(!context))
5396                goto out;
5397
5398        ctx_lo = context[0].lo;
5399
5400        if (!(ctx_lo & CONTEXT_PASIDE)) {
5401                ctx_lo |= CONTEXT_PASIDE;
5402                context[0].lo = ctx_lo;
5403                wmb();
5404                iommu->flush.flush_context(iommu,
5405                                           domain->iommu_did[iommu->seq_id],
5406                                           PCI_DEVID(info->bus, info->devfn),
5407                                           DMA_CCMD_MASK_NOBIT,
5408                                           DMA_CCMD_DEVICE_INVL);
5409        }
5410
5411        /* Enable PASID support in the device, if it wasn't already */
5412        if (!info->pasid_enabled)
5413                iommu_enable_dev_iotlb(info);
5414
5415        ret = 0;
5416
5417 out:
5418        spin_unlock(&iommu->lock);
5419        spin_unlock_irqrestore(&device_domain_lock, flags);
5420
5421        return ret;
5422}
5423
5424static struct iommu_group *intel_iommu_device_group(struct device *dev)
5425{
5426        if (dev_is_pci(dev))
5427                return pci_device_group(dev);
5428        return generic_device_group(dev);
5429}
5430
5431static int intel_iommu_enable_auxd(struct device *dev)
5432{
5433        struct device_domain_info *info;
5434        struct intel_iommu *iommu;
5435        unsigned long flags;
5436        int ret;
5437
5438        iommu = device_to_iommu(dev, NULL, NULL);
5439        if (!iommu || dmar_disabled)
5440                return -EINVAL;
5441
5442        if (!sm_supported(iommu) || !pasid_supported(iommu))
5443                return -EINVAL;
5444
5445        ret = intel_iommu_enable_pasid(iommu, dev);
5446        if (ret)
5447                return -ENODEV;
5448
5449        spin_lock_irqsave(&device_domain_lock, flags);
5450        info = get_domain_info(dev);
5451        info->auxd_enabled = 1;
5452        spin_unlock_irqrestore(&device_domain_lock, flags);
5453
5454        return 0;
5455}
5456
5457static int intel_iommu_disable_auxd(struct device *dev)
5458{
5459        struct device_domain_info *info;
5460        unsigned long flags;
5461
5462        spin_lock_irqsave(&device_domain_lock, flags);
5463        info = get_domain_info(dev);
5464        if (!WARN_ON(!info))
5465                info->auxd_enabled = 0;
5466        spin_unlock_irqrestore(&device_domain_lock, flags);
5467
5468        return 0;
5469}
5470
5471static int intel_iommu_enable_sva(struct device *dev)
5472{
5473        struct device_domain_info *info = get_domain_info(dev);
5474        struct intel_iommu *iommu;
5475        int ret;
5476
5477        if (!info || dmar_disabled)
5478                return -EINVAL;
5479
5480        iommu = info->iommu;
5481        if (!iommu)
5482                return -EINVAL;
5483
5484        if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
5485                return -ENODEV;
5486
5487        if (intel_iommu_enable_pasid(iommu, dev))
5488                return -ENODEV;
5489
5490        if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
5491                return -EINVAL;
5492
5493        ret = iopf_queue_add_device(iommu->iopf_queue, dev);
5494        if (!ret)
5495                ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
5496
5497        return ret;
5498}
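
/*
 * Illustrative sketch (not part of this driver): a driver that wants
 * shared virtual addressing enables the feature and binds an mm; the
 * returned PASID is then programmed into the device so its DMA uses the
 * CPU page tables. "dev" is an assumed PASID/PRI/ATS-capable device.
 *
 *	struct iommu_sva *handle;
 *
 *	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA))
 *		return -ENODEV;
 *	handle = iommu_sva_bind_device(dev, current->mm, NULL);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	... program iommu_sva_get_pasid(handle) into the device ...
 *	iommu_sva_unbind_device(handle);
 */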
5499
5500static int intel_iommu_disable_sva(struct device *dev)
5501{
5502        struct device_domain_info *info = get_domain_info(dev);
5503        struct intel_iommu *iommu = info->iommu;
5504        int ret;
5505
5506        ret = iommu_unregister_device_fault_handler(dev);
5507        if (!ret)
5508                ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
5509
5510        return ret;
5511}
5512
5513static int intel_iommu_enable_iopf(struct device *dev)
5514{
5515        struct device_domain_info *info = get_domain_info(dev);
5516
5517        if (info && info->pri_supported)
5518                return 0;
5519
5520        return -ENODEV;
5521}
5522
5523static int
5524intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5525{
5526        switch (feat) {
5527        case IOMMU_DEV_FEAT_AUX:
5528                return intel_iommu_enable_auxd(dev);
5529
5530        case IOMMU_DEV_FEAT_IOPF:
5531                return intel_iommu_enable_iopf(dev);
5532
5533        case IOMMU_DEV_FEAT_SVA:
5534                return intel_iommu_enable_sva(dev);
5535
5536        default:
5537                return -ENODEV;
5538        }
5539}
5540
5541static int
5542intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5543{
5544        switch (feat) {
5545        case IOMMU_DEV_FEAT_AUX:
5546                return intel_iommu_disable_auxd(dev);
5547
5548        case IOMMU_DEV_FEAT_IOPF:
5549                return 0;
5550
5551        case IOMMU_DEV_FEAT_SVA:
5552                return intel_iommu_disable_sva(dev);
5553
5554        default:
5555                return -ENODEV;
5556        }
5557}
5558
5559static bool
5560intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5561{
5562        struct device_domain_info *info = get_domain_info(dev);
5563
5564        if (feat == IOMMU_DEV_FEAT_AUX)
5565                return scalable_mode_support() && info && info->auxd_enabled;
5566
5567        return false;
5568}
5569
5570static int
5571intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5572{
5573        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5574
5575        return dmar_domain->default_pasid > 0 ?
5576                        dmar_domain->default_pasid : -EINVAL;
5577}
5578
5579static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5580                                           struct device *dev)
5581{
5582        return attach_deferred(dev);
5583}
5584
5585static int
5586intel_iommu_enable_nesting(struct iommu_domain *domain)
5587{
5588        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5589        unsigned long flags;
5590        int ret = -ENODEV;
5591
5592        spin_lock_irqsave(&device_domain_lock, flags);
5593        if (list_empty(&dmar_domain->devices)) {
5594                dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
5595                dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
5596                ret = 0;
5597        }
5598        spin_unlock_irqrestore(&device_domain_lock, flags);
5599
5600        return ret;
5601}
5602
5603/*
5604 * Check that the device does not live on an external facing PCI port that is
5605 * marked as untrusted. Such devices should not be able to apply quirks and
5606 * thus not be able to bypass the IOMMU restrictions.
5607 */
5608static bool risky_device(struct pci_dev *pdev)
5609{
5610        if (pdev->untrusted) {
5611                pci_info(pdev,
5612                         "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5613                         pdev->vendor, pdev->device);
5614                pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5615                return true;
5616        }
5617        return false;
5618}
5619
5620static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
5621                                       unsigned long iova, size_t size)
5622{
5623        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5624        unsigned long pages = aligned_nrpages(iova, size);
5625        unsigned long pfn = iova >> VTD_PAGE_SHIFT;
5626        struct intel_iommu *iommu;
5627        int iommu_id;
5628
5629        for_each_domain_iommu(iommu_id, dmar_domain) {
5630                iommu = g_iommus[iommu_id];
5631                __mapping_notify_one(iommu, dmar_domain, pfn, pages);
5632        }
5633}
5634
5635const struct iommu_ops intel_iommu_ops = {
5636        .capable                = intel_iommu_capable,
5637        .domain_alloc           = intel_iommu_domain_alloc,
5638        .domain_free            = intel_iommu_domain_free,
5639        .enable_nesting         = intel_iommu_enable_nesting,
5640        .attach_dev             = intel_iommu_attach_device,
5641        .detach_dev             = intel_iommu_detach_device,
5642        .aux_attach_dev         = intel_iommu_aux_attach_device,
5643        .aux_detach_dev         = intel_iommu_aux_detach_device,
5644        .aux_get_pasid          = intel_iommu_aux_get_pasid,
5645        .map_pages              = intel_iommu_map_pages,
5646        .unmap_pages            = intel_iommu_unmap_pages,
5647        .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
5648        .flush_iotlb_all        = intel_flush_iotlb_all,
5649        .iotlb_sync             = intel_iommu_tlb_sync,
5650        .iova_to_phys           = intel_iommu_iova_to_phys,
5651        .probe_device           = intel_iommu_probe_device,
5652        .probe_finalize         = intel_iommu_probe_finalize,
5653        .release_device         = intel_iommu_release_device,
5654        .get_resv_regions       = intel_iommu_get_resv_regions,
5655        .put_resv_regions       = generic_iommu_put_resv_regions,
5656        .device_group           = intel_iommu_device_group,
5657        .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5658        .dev_enable_feat        = intel_iommu_dev_enable_feat,
5659        .dev_disable_feat       = intel_iommu_dev_disable_feat,
5660        .is_attach_deferred     = intel_iommu_is_attach_deferred,
5661        .def_domain_type        = device_def_domain_type,
5662        .pgsize_bitmap          = SZ_4K,
5663#ifdef CONFIG_INTEL_IOMMU_SVM
5664        .cache_invalidate       = intel_iommu_sva_invalidate,
5665        .sva_bind_gpasid        = intel_svm_bind_gpasid,
5666        .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
5667        .sva_bind               = intel_svm_bind,
5668        .sva_unbind             = intel_svm_unbind,
5669        .sva_get_pasid          = intel_svm_get_pasid,
5670        .page_response          = intel_svm_page_response,
5671#endif
5672};
5673
5674static void quirk_iommu_igfx(struct pci_dev *dev)
5675{
5676        if (risky_device(dev))
5677                return;
5678
5679        pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5680        dmar_map_gfx = 0;
5681}
5682
5683/* G4x/GM45 integrated gfx dmar support is totally busted. */
5684DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5685DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5686DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5687DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5688DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5689DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5690DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5691
5692/* Broadwell igfx malfunctions with dmar */
5693DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5694DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5695DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5696DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5697DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5698DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
5699DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
5700DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
5701DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
5702DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
5703DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
5704DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
5705DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
5706DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
5707DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
5708DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
5709DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
5710DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
5711DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
5712DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
5713DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
5714DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
5715DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
5716DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
5717
5718static void quirk_iommu_rwbf(struct pci_dev *dev)
5719{
5720        if (risky_device(dev))
5721                return;
5722
5723        /*
5724         * Mobile 4 Series Chipset neglects to set RWBF capability,
5725         * but needs it. Same seems to hold for the desktop versions.
5726         */
5727        pci_info(dev, "Forcing write-buffer flush capability\n");
5728        rwbf_quirk = 1;
5729}
5730
5731DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5732DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5733DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5734DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5735DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5736DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5737DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5738
5739#define GGC 0x52
5740#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5741#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5742#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5743#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5744#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5745#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5746#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5747#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5748
5749static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5750{
5751        unsigned short ggc;
5752
5753        if (risky_device(dev))
5754                return;
5755
5756        if (pci_read_config_word(dev, GGC, &ggc))
5757                return;
5758
5759        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5760                pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5761                dmar_map_gfx = 0;
5762        } else if (dmar_map_gfx) {
5763                /* we have to ensure the gfx device is idle before we flush */
5764                pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5765                iommu_set_dma_strict();
5766        }
5767}
5768DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5769DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5770DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5771DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5772
5773static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
5774{
5775        unsigned short ver;
5776
5777        if (!IS_GFX_DEVICE(dev))
5778                return;
5779
5780        ver = (dev->device >> 8) & 0xff;
5781        if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
5782            ver != 0x4e && ver != 0x8a && ver != 0x98 &&
5783            ver != 0x9a)
5784                return;
5785
5786        if (risky_device(dev))
5787                return;
5788
5789        pci_info(dev, "Skip IOMMU disabling for graphics\n");
5790        iommu_skip_te_disable = 1;
5791}
5792DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5793
5794/* On Tylersburg chipsets, some BIOSes have been known to enable the
5795   ISOCH DMAR unit for the Azalia sound device, but not give it any
5796   TLB entries, which causes it to deadlock. Check for that.  We do
5797   this in a function called from init_dmars(), instead of in a PCI
5798   quirk, because we don't want to print the obnoxious "BIOS broken"
5799   message if VT-d is actually disabled.
5800*/
5801static void __init check_tylersburg_isoch(void)
5802{
5803        struct pci_dev *pdev;
5804        uint32_t vtisochctrl;
5805
5806        /* If there's no Azalia in the system anyway, forget it. */
5807        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5808        if (!pdev)
5809                return;
5810
5811        if (risky_device(pdev)) {
5812                pci_dev_put(pdev);
5813                return;
5814        }
5815
5816        pci_dev_put(pdev);
5817
5818        /* System Management Registers. Might be hidden, in which case
5819           we can't do the sanity check. But that's OK, because the
5820           known-broken BIOSes _don't_ actually hide it, so far. */
5821        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5822        if (!pdev)
5823                return;
5824
5825        if (risky_device(pdev)) {
5826                pci_dev_put(pdev);
5827                return;
5828        }
5829
5830        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5831                pci_dev_put(pdev);
5832                return;
5833        }
5834
5835        pci_dev_put(pdev);
5836
5837        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5838        if (vtisochctrl & 1)
5839                return;
5840
5841        /* Drop all bits other than the number of TLB entries */
5842        vtisochctrl &= 0x1c;
5843
5844        /* If we have the recommended number of TLB entries (16), fine. */
5845        if (vtisochctrl == 0x10)
5846                return;
5847
5848        /* Zero TLB entries? You get to ride the short bus to school. */
5849        if (!vtisochctrl) {
5850                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5851                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5852                     dmi_get_system_info(DMI_BIOS_VENDOR),
5853                     dmi_get_system_info(DMI_BIOS_VERSION),
5854                     dmi_get_system_info(DMI_PRODUCT_VERSION));
5855                iommu_identity_mapping |= IDENTMAP_AZALIA;
5856                return;
5857        }
5858
5859        pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5860               vtisochctrl);
5861}
5862