linux/drivers/iommu/intel-iommu.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/init.h>
  17#include <linux/bitmap.h>
  18#include <linux/debugfs.h>
  19#include <linux/export.h>
  20#include <linux/slab.h>
  21#include <linux/irq.h>
  22#include <linux/interrupt.h>
  23#include <linux/spinlock.h>
  24#include <linux/pci.h>
  25#include <linux/dmar.h>
  26#include <linux/dma-mapping.h>
  27#include <linux/mempool.h>
  28#include <linux/memory.h>
  29#include <linux/cpu.h>
  30#include <linux/timer.h>
  31#include <linux/io.h>
  32#include <linux/iova.h>
  33#include <linux/iommu.h>
  34#include <linux/intel-iommu.h>
  35#include <linux/syscore_ops.h>
  36#include <linux/tboot.h>
  37#include <linux/dmi.h>
  38#include <linux/pci-ats.h>
  39#include <linux/memblock.h>
  40#include <linux/dma-contiguous.h>
  41#include <linux/dma-direct.h>
  42#include <linux/crash_dump.h>
  43#include <linux/numa.h>
  44#include <linux/swiotlb.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48#include <trace/events/intel_iommu.h>
  49
  50#include "irq_remapping.h"
  51#include "intel-pasid.h"
  52
  53#define ROOT_SIZE               VTD_PAGE_SIZE
  54#define CONTEXT_SIZE            VTD_PAGE_SIZE
  55
  56#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  57#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  58#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  59#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  60
  61#define IOAPIC_RANGE_START      (0xfee00000)
  62#define IOAPIC_RANGE_END        (0xfeefffff)
  63#define IOVA_START_ADDR         (0x1000)
  64
  65#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  66
  67#define MAX_AGAW_WIDTH 64
  68#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  69
  70#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  71#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  72
  73/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  74   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  75#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  76                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  77#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  78
  79/* IO virtual address start page frame number */
  80#define IOVA_START_PFN          (1)
  81
  82#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  83
  84/* page table handling */
  85#define LEVEL_STRIDE            (9)
  86#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  87
  88/*
   89 * This bitmap is used to advertise the page sizes our hardware supports
  90 * to the IOMMU core, which will then use this information to split
  91 * physically contiguous memory regions it is mapping into page sizes
  92 * that we support.
  93 *
  94 * Traditionally the IOMMU core just handed us the mappings directly,
  95 * after making sure the size is an order of a 4KiB page and that the
  96 * mapping has natural alignment.
  97 *
  98 * To retain this behavior, we currently advertise that we support
  99 * all page sizes that are an order of 4KiB.
 100 *
 101 * If at some point we'd like to utilize the IOMMU core's new behavior,
 102 * we could change this to advertise the real page sizes we support.
 103 */
 104#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 105
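/*
 * AGAW (adjusted guest address width) helpers: an AGAW value of N selects a
 * page table with N + 2 levels covering a 30 + 9*N bit address width, e.g.
 * AGAW 2 is 48 bits / 4 levels.
 */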
 106static inline int agaw_to_level(int agaw)
 107{
 108        return agaw + 2;
 109}
 110
 111static inline int agaw_to_width(int agaw)
 112{
 113        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 114}
 115
 116static inline int width_to_agaw(int width)
 117{
 118        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 119}
 120
 121static inline unsigned int level_to_offset_bits(int level)
 122{
 123        return (level - 1) * LEVEL_STRIDE;
 124}
 125
 126static inline int pfn_level_offset(unsigned long pfn, int level)
 127{
 128        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 129}
 130
 131static inline unsigned long level_mask(int level)
 132{
 133        return -1UL << level_to_offset_bits(level);
 134}
 135
 136static inline unsigned long level_size(int level)
 137{
 138        return 1UL << level_to_offset_bits(level);
 139}
 140
 141static inline unsigned long align_to_level(unsigned long pfn, int level)
 142{
 143        return (pfn + level_size(level) - 1) & level_mask(level);
 144}
 145
 146static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 147{
 148        return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 149}
 150
  151/* VT-d pages must never be _larger_ than MM pages. Otherwise things
  152   are never going to work. */
 153static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 154{
 155        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 156}
 157
 158static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 159{
 160        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 161}
 162static inline unsigned long page_to_dma_pfn(struct page *pg)
 163{
 164        return mm_to_dma_pfn(page_to_pfn(pg));
 165}
 166static inline unsigned long virt_to_dma_pfn(void *p)
 167{
 168        return page_to_dma_pfn(virt_to_page(p));
 169}
 170
 171/* global iommu list, set NULL for ignored DMAR units */
 172static struct intel_iommu **g_iommus;
 173
 174static void __init check_tylersburg_isoch(void);
 175static int rwbf_quirk;
 176
 177/*
  178 * Set to 1 to panic the kernel if VT-d can't be enabled successfully
  179 * (used when the kernel is launched with TXT)
 180 */
 181static int force_on = 0;
 182int intel_iommu_tboot_noforce;
 183static int no_platform_optin;
 184
 185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 186
 187/*
 188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 189 * if marked present.
 190 */
 191static phys_addr_t root_entry_lctp(struct root_entry *re)
 192{
 193        if (!(re->lo & 1))
 194                return 0;
 195
 196        return re->lo & VTD_PAGE_MASK;
 197}
 198
 199/*
 200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 201 * if marked present.
 202 */
 203static phys_addr_t root_entry_uctp(struct root_entry *re)
 204{
 205        if (!(re->hi & 1))
 206                return 0;
 207
 208        return re->hi & VTD_PAGE_MASK;
 209}
 210
 211static inline void context_clear_pasid_enable(struct context_entry *context)
 212{
 213        context->lo &= ~(1ULL << 11);
 214}
 215
 216static inline bool context_pasid_enabled(struct context_entry *context)
 217{
 218        return !!(context->lo & (1ULL << 11));
 219}
 220
 221static inline void context_set_copied(struct context_entry *context)
 222{
 223        context->hi |= (1ull << 3);
 224}
 225
 226static inline bool context_copied(struct context_entry *context)
 227{
 228        return !!(context->hi & (1ULL << 3));
 229}
 230
 231static inline bool __context_present(struct context_entry *context)
 232{
 233        return (context->lo & 1);
 234}
 235
 236bool context_present(struct context_entry *context)
 237{
 238        return context_pasid_enabled(context) ?
 239             __context_present(context) :
 240             __context_present(context) && !context_copied(context);
 241}
 242
 243static inline void context_set_present(struct context_entry *context)
 244{
 245        context->lo |= 1;
 246}
 247
 248static inline void context_set_fault_enable(struct context_entry *context)
 249{
 250        context->lo &= (((u64)-1) << 2) | 1;
 251}
 252
 253static inline void context_set_translation_type(struct context_entry *context,
 254                                                unsigned long value)
 255{
 256        context->lo &= (((u64)-1) << 4) | 3;
 257        context->lo |= (value & 3) << 2;
 258}
 259
 260static inline void context_set_address_root(struct context_entry *context,
 261                                            unsigned long value)
 262{
 263        context->lo &= ~VTD_PAGE_MASK;
 264        context->lo |= value & VTD_PAGE_MASK;
 265}
 266
 267static inline void context_set_address_width(struct context_entry *context,
 268                                             unsigned long value)
 269{
 270        context->hi |= value & 7;
 271}
 272
 273static inline void context_set_domain_id(struct context_entry *context,
 274                                         unsigned long value)
 275{
 276        context->hi |= (value & ((1 << 16) - 1)) << 8;
 277}
 278
 279static inline int context_domain_id(struct context_entry *c)
 280{
 281        return((c->hi >> 8) & 0xffff);
 282}
 283
 284static inline void context_clear_entry(struct context_entry *context)
 285{
 286        context->lo = 0;
 287        context->hi = 0;
 288}
 289
 290/*
  291 * This domain is a static identity mapping domain.
  292 *      1. This domain creates a static 1:1 mapping to all usable memory.
  293 *      2. It maps to each iommu if successful.
  294 *      3. Each iommu maps to this domain if successful.
 295 */
 296static struct dmar_domain *si_domain;
 297static int hw_pass_through = 1;
 298
  299/* si_domain contains multiple devices */
 300#define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
 301
 302/*
 303 * This is a DMA domain allocated through the iommu domain allocation
 304 * interface. But one or more devices belonging to this domain have
  305 * been chosen to use a private domain. We should avoid using the
 306 * map/unmap/iova_to_phys APIs on it.
 307 */
 308#define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
 309
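/* Iterate over the indexes of all iommus holding a reference on @domain */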
 310#define for_each_domain_iommu(idx, domain)                      \
 311        for (idx = 0; idx < g_num_of_iommus; idx++)             \
 312                if (domain->iommu_refcnt[idx])
 313
 314struct dmar_rmrr_unit {
 315        struct list_head list;          /* list of rmrr units   */
 316        struct acpi_dmar_header *hdr;   /* ACPI header          */
 317        u64     base_address;           /* reserved base address*/
 318        u64     end_address;            /* reserved end address */
 319        struct dmar_dev_scope *devices; /* target devices */
 320        int     devices_cnt;            /* target device count */
 321};
 322
 323struct dmar_atsr_unit {
 324        struct list_head list;          /* list of ATSR units */
 325        struct acpi_dmar_header *hdr;   /* ACPI header */
 326        struct dmar_dev_scope *devices; /* target devices */
 327        int devices_cnt;                /* target device count */
 328        u8 include_all:1;               /* include all ports */
 329};
 330
 331static LIST_HEAD(dmar_atsr_units);
 332static LIST_HEAD(dmar_rmrr_units);
 333
 334#define for_each_rmrr_units(rmrr) \
 335        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 336
  337/* number of registered intel_iommus; used as the bound when indexing g_iommus */
 338static int g_num_of_iommus;
 339
 340static void domain_exit(struct dmar_domain *domain);
 341static void domain_remove_dev_info(struct dmar_domain *domain);
 342static void dmar_remove_one_dev_info(struct device *dev);
 343static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 344static void domain_context_clear(struct intel_iommu *iommu,
 345                                 struct device *dev);
 346static int domain_detach_iommu(struct dmar_domain *domain,
 347                               struct intel_iommu *iommu);
 348static bool device_is_rmrr_locked(struct device *dev);
 349static int intel_iommu_attach_device(struct iommu_domain *domain,
 350                                     struct device *dev);
 351static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 352                                            dma_addr_t iova);
 353
 354#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 355int dmar_disabled = 0;
 356#else
 357int dmar_disabled = 1;
 358#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 359
 360int intel_iommu_sm;
 361int intel_iommu_enabled = 0;
 362EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 363
 364static int dmar_map_gfx = 1;
 365static int dmar_forcedac;
 366static int intel_iommu_strict;
 367static int intel_iommu_superpage = 1;
 368static int iommu_identity_mapping;
 369static int intel_no_bounce;
 370
 371#define IDENTMAP_ALL            1
 372#define IDENTMAP_GFX            2
 373#define IDENTMAP_AZALIA         4
 374
 375int intel_iommu_gfx_mapped;
 376EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 377
 378#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 379#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
 380static DEFINE_SPINLOCK(device_domain_lock);
 381static LIST_HEAD(device_domain_list);
 382
 383#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
 384                                to_pci_dev(d)->untrusted)
 385
 386/*
 387 * Iterate over elements in device_domain_list and call the specified
 388 * callback @fn against each element.
 389 */
 390int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 391                                     void *data), void *data)
 392{
 393        int ret = 0;
 394        unsigned long flags;
 395        struct device_domain_info *info;
 396
 397        spin_lock_irqsave(&device_domain_lock, flags);
 398        list_for_each_entry(info, &device_domain_list, global) {
 399                ret = fn(info, data);
 400                if (ret) {
 401                        spin_unlock_irqrestore(&device_domain_lock, flags);
 402                        return ret;
 403                }
 404        }
 405        spin_unlock_irqrestore(&device_domain_lock, flags);
 406
 407        return 0;
 408}
 409
 410const struct iommu_ops intel_iommu_ops;
 411
 412static bool translation_pre_enabled(struct intel_iommu *iommu)
 413{
 414        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 415}
 416
 417static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 418{
 419        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 420}
 421
 422static void init_translation_status(struct intel_iommu *iommu)
 423{
 424        u32 gsts;
 425
 426        gsts = readl(iommu->reg + DMAR_GSTS_REG);
 427        if (gsts & DMA_GSTS_TES)
 428                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 429}
 430
  431/* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
 432static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 433{
 434        return container_of(dom, struct dmar_domain, domain);
 435}
 436
 437static int __init intel_iommu_setup(char *str)
 438{
 439        if (!str)
 440                return -EINVAL;
 441        while (*str) {
 442                if (!strncmp(str, "on", 2)) {
 443                        dmar_disabled = 0;
 444                        pr_info("IOMMU enabled\n");
 445                } else if (!strncmp(str, "off", 3)) {
 446                        dmar_disabled = 1;
 447                        no_platform_optin = 1;
 448                        pr_info("IOMMU disabled\n");
 449                } else if (!strncmp(str, "igfx_off", 8)) {
 450                        dmar_map_gfx = 0;
 451                        pr_info("Disable GFX device mapping\n");
 452                } else if (!strncmp(str, "forcedac", 8)) {
 453                        pr_info("Forcing DAC for PCI devices\n");
 454                        dmar_forcedac = 1;
 455                } else if (!strncmp(str, "strict", 6)) {
 456                        pr_info("Disable batched IOTLB flush\n");
 457                        intel_iommu_strict = 1;
 458                } else if (!strncmp(str, "sp_off", 6)) {
 459                        pr_info("Disable supported super page\n");
 460                        intel_iommu_superpage = 0;
 461                } else if (!strncmp(str, "sm_on", 5)) {
 462                        pr_info("Intel-IOMMU: scalable mode supported\n");
 463                        intel_iommu_sm = 1;
 464                } else if (!strncmp(str, "tboot_noforce", 13)) {
 465                        printk(KERN_INFO
 466                                "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 467                        intel_iommu_tboot_noforce = 1;
 468                } else if (!strncmp(str, "nobounce", 8)) {
 469                        pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
 470                        intel_no_bounce = 1;
 471                }
 472
 473                str += strcspn(str, ",");
 474                while (*str == ',')
 475                        str++;
 476        }
 477        return 0;
 478}
 479__setup("intel_iommu=", intel_iommu_setup);
 480
 481static struct kmem_cache *iommu_domain_cache;
 482static struct kmem_cache *iommu_devinfo_cache;
 483
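/*
 * Domain pointers are kept per-iommu in a two-level table:
 * iommu->domains[did >> 8] points to a 256-entry array indexed by the low
 * eight bits of the domain ID.
 */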
  484static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
 485{
 486        struct dmar_domain **domains;
 487        int idx = did >> 8;
 488
 489        domains = iommu->domains[idx];
 490        if (!domains)
 491                return NULL;
 492
 493        return domains[did & 0xff];
 494}
 495
 496static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 497                             struct dmar_domain *domain)
 498{
 499        struct dmar_domain **domains;
 500        int idx = did >> 8;
 501
 502        if (!iommu->domains[idx]) {
 503                size_t size = 256 * sizeof(struct dmar_domain *);
 504                iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 505        }
 506
 507        domains = iommu->domains[idx];
  508        if (WARN_ON(!domains))
  509                return;
  510
  511        domains[did & 0xff] = domain;
 512}
 513
 514void *alloc_pgtable_page(int node)
 515{
 516        struct page *page;
 517        void *vaddr = NULL;
 518
 519        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 520        if (page)
 521                vaddr = page_address(page);
 522        return vaddr;
 523}
 524
 525void free_pgtable_page(void *vaddr)
 526{
 527        free_page((unsigned long)vaddr);
 528}
 529
 530static inline void *alloc_domain_mem(void)
 531{
 532        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 533}
 534
 535static void free_domain_mem(void *vaddr)
 536{
 537        kmem_cache_free(iommu_domain_cache, vaddr);
 538}
 539
  540static inline void *alloc_devinfo_mem(void)
 541{
 542        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 543}
 544
 545static inline void free_devinfo_mem(void *vaddr)
 546{
 547        kmem_cache_free(iommu_devinfo_cache, vaddr);
 548}
 549
 550static inline int domain_type_is_si(struct dmar_domain *domain)
 551{
 552        return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 553}
 554
 555static inline int domain_pfn_supported(struct dmar_domain *domain,
 556                                       unsigned long pfn)
 557{
 558        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 559
 560        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 561}
 562
 563static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 564{
 565        unsigned long sagaw;
 566        int agaw = -1;
 567
 568        sagaw = cap_sagaw(iommu->cap);
 569        for (agaw = width_to_agaw(max_gaw);
 570             agaw >= 0; agaw--) {
 571                if (test_bit(agaw, &sagaw))
 572                        break;
 573        }
 574
 575        return agaw;
 576}
 577
 578/*
 579 * Calculate max SAGAW for each iommu.
 580 */
 581int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 582{
 583        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 584}
 585
 586/*
  587 * Calculate agaw for each iommu.
  588 * "SAGAW" may be different across iommus; use a default agaw, and
  589 * fall back to a lower supported agaw for iommus that don't support the default.
 590 */
 591int iommu_calculate_agaw(struct intel_iommu *iommu)
 592{
 593        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 594}
 595
  596/* This function only returns a single iommu in a domain */
 597struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 598{
 599        int iommu_id;
 600
 601        /* si_domain and vm domain should not get here. */
 602        if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
 603                return NULL;
 604
 605        for_each_domain_iommu(iommu_id, domain)
 606                break;
 607
 608        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 609                return NULL;
 610
 611        return g_iommus[iommu_id];
 612}
 613
 614static void domain_update_iommu_coherency(struct dmar_domain *domain)
 615{
 616        struct dmar_drhd_unit *drhd;
 617        struct intel_iommu *iommu;
 618        bool found = false;
 619        int i;
 620
 621        domain->iommu_coherency = 1;
 622
 623        for_each_domain_iommu(i, domain) {
 624                found = true;
 625                if (!ecap_coherent(g_iommus[i]->ecap)) {
 626                        domain->iommu_coherency = 0;
 627                        break;
 628                }
 629        }
 630        if (found)
 631                return;
 632
  633        /* No iommu attached to this domain yet; use the lowest common denominator */
 634        rcu_read_lock();
 635        for_each_active_iommu(iommu, drhd) {
 636                if (!ecap_coherent(iommu->ecap)) {
 637                        domain->iommu_coherency = 0;
 638                        break;
 639                }
 640        }
 641        rcu_read_unlock();
 642}
 643
 644static int domain_update_iommu_snooping(struct intel_iommu *skip)
 645{
 646        struct dmar_drhd_unit *drhd;
 647        struct intel_iommu *iommu;
 648        int ret = 1;
 649
 650        rcu_read_lock();
 651        for_each_active_iommu(iommu, drhd) {
 652                if (iommu != skip) {
 653                        if (!ecap_sc_support(iommu->ecap)) {
 654                                ret = 0;
 655                                break;
 656                        }
 657                }
 658        }
 659        rcu_read_unlock();
 660
 661        return ret;
 662}
 663
 664static int domain_update_iommu_superpage(struct intel_iommu *skip)
 665{
 666        struct dmar_drhd_unit *drhd;
 667        struct intel_iommu *iommu;
 668        int mask = 0xf;
 669
 670        if (!intel_iommu_superpage) {
 671                return 0;
 672        }
 673
 674        /* set iommu_superpage to the smallest common denominator */
 675        rcu_read_lock();
 676        for_each_active_iommu(iommu, drhd) {
 677                if (iommu != skip) {
 678                        mask &= cap_super_page_val(iommu->cap);
 679                        if (!mask)
 680                                break;
 681                }
 682        }
 683        rcu_read_unlock();
 684
 685        return fls(mask);
 686}
 687
 688/* Some capabilities may be different across iommus */
 689static void domain_update_iommu_cap(struct dmar_domain *domain)
 690{
 691        domain_update_iommu_coherency(domain);
 692        domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 693        domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 694}
 695
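/*
 * Return the context entry for (@bus, @devfn), optionally allocating the
 * context table. In scalable mode the root entry is split in two halves
 * (devfn 0-127 in the low half, 128-255 in the high half) and each context
 * entry is twice the legacy size, hence the devfn adjustment below.
 */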
 696struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 697                                         u8 devfn, int alloc)
 698{
 699        struct root_entry *root = &iommu->root_entry[bus];
 700        struct context_entry *context;
 701        u64 *entry;
 702
 703        entry = &root->lo;
 704        if (sm_supported(iommu)) {
 705                if (devfn >= 0x80) {
 706                        devfn -= 0x80;
 707                        entry = &root->hi;
 708                }
 709                devfn *= 2;
 710        }
 711        if (*entry & 1)
 712                context = phys_to_virt(*entry & VTD_PAGE_MASK);
 713        else {
 714                unsigned long phy_addr;
 715                if (!alloc)
 716                        return NULL;
 717
 718                context = alloc_pgtable_page(iommu->node);
 719                if (!context)
 720                        return NULL;
 721
 722                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 723                phy_addr = virt_to_phys((void *)context);
 724                *entry = phy_addr | 1;
 725                __iommu_flush_cache(iommu, entry, sizeof(*entry));
 726        }
 727        return &context[devfn];
 728}
 729
 730static int iommu_dummy(struct device *dev)
 731{
 732        return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 733}
 734
 735/**
 736 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 737 *                               sub-hierarchy of a candidate PCI-PCI bridge
 738 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 739 * @bridge: the candidate PCI-PCI bridge
 740 *
 741 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 742 */
 743static bool
 744is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 745{
 746        struct pci_dev *pdev, *pbridge;
 747
 748        if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 749                return false;
 750
 751        pdev = to_pci_dev(dev);
 752        pbridge = to_pci_dev(bridge);
 753
 754        if (pbridge->subordinate &&
 755            pbridge->subordinate->number <= pdev->bus->number &&
 756            pbridge->subordinate->busn_res.end >= pdev->bus->number)
 757                return true;
 758
 759        return false;
 760}
 761
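/*
 * Map @dev to the IOMMU that translates it by walking the DMAR device scope
 * tables, and report the (bus, devfn) to use for its context entry. VFs are
 * looked up via their PF, ACPI devices via their companion device.
 */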
 762static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 763{
 764        struct dmar_drhd_unit *drhd = NULL;
 765        struct intel_iommu *iommu;
 766        struct device *tmp;
 767        struct pci_dev *pdev = NULL;
 768        u16 segment = 0;
 769        int i;
 770
 771        if (iommu_dummy(dev))
 772                return NULL;
 773
 774        if (dev_is_pci(dev)) {
 775                struct pci_dev *pf_pdev;
 776
 777                pdev = to_pci_dev(dev);
 778
 779#ifdef CONFIG_X86
 780                /* VMD child devices currently cannot be handled individually */
 781                if (is_vmd(pdev->bus))
 782                        return NULL;
 783#endif
 784
 785                /* VFs aren't listed in scope tables; we need to look up
 786                 * the PF instead to find the IOMMU. */
 787                pf_pdev = pci_physfn(pdev);
 788                dev = &pf_pdev->dev;
 789                segment = pci_domain_nr(pdev->bus);
 790        } else if (has_acpi_companion(dev))
 791                dev = &ACPI_COMPANION(dev)->dev;
 792
 793        rcu_read_lock();
 794        for_each_active_iommu(iommu, drhd) {
 795                if (pdev && segment != drhd->segment)
 796                        continue;
 797
 798                for_each_active_dev_scope(drhd->devices,
 799                                          drhd->devices_cnt, i, tmp) {
 800                        if (tmp == dev) {
 801                                /* For a VF use its original BDF# not that of the PF
 802                                 * which we used for the IOMMU lookup. Strictly speaking
 803                                 * we could do this for all PCI devices; we only need to
 804                                 * get the BDF# from the scope table for ACPI matches. */
 805                                if (pdev && pdev->is_virtfn)
 806                                        goto got_pdev;
 807
 808                                *bus = drhd->devices[i].bus;
 809                                *devfn = drhd->devices[i].devfn;
 810                                goto out;
 811                        }
 812
 813                        if (is_downstream_to_pci_bridge(dev, tmp))
 814                                goto got_pdev;
 815                }
 816
 817                if (pdev && drhd->include_all) {
 818                got_pdev:
 819                        *bus = pdev->bus->number;
 820                        *devfn = pdev->devfn;
 821                        goto out;
 822                }
 823        }
 824        iommu = NULL;
 825 out:
 826        rcu_read_unlock();
 827
 828        return iommu;
 829}
 830
 831static void domain_flush_cache(struct dmar_domain *domain,
 832                               void *addr, int size)
 833{
 834        if (!domain->iommu_coherency)
 835                clflush_cache_range(addr, size);
 836}
 837
 838static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 839{
 840        struct context_entry *context;
 841        int ret = 0;
 842        unsigned long flags;
 843
 844        spin_lock_irqsave(&iommu->lock, flags);
 845        context = iommu_context_addr(iommu, bus, devfn, 0);
 846        if (context)
 847                ret = context_present(context);
 848        spin_unlock_irqrestore(&iommu->lock, flags);
 849        return ret;
 850}
 851
 852static void free_context_table(struct intel_iommu *iommu)
 853{
 854        int i;
 855        unsigned long flags;
 856        struct context_entry *context;
 857
 858        spin_lock_irqsave(&iommu->lock, flags);
 859        if (!iommu->root_entry) {
 860                goto out;
 861        }
 862        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 863                context = iommu_context_addr(iommu, i, 0, 0);
 864                if (context)
 865                        free_pgtable_page(context);
 866
 867                if (!sm_supported(iommu))
 868                        continue;
 869
 870                context = iommu_context_addr(iommu, i, 0x80, 0);
 871                if (context)
 872                        free_pgtable_page(context);
 873
 874        }
 875        free_pgtable_page(iommu->root_entry);
 876        iommu->root_entry = NULL;
 877out:
 878        spin_unlock_irqrestore(&iommu->lock, flags);
 879}
 880
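/*
 * Walk the page table down to the entry mapping @pfn at *target_level,
 * allocating missing intermediate tables along the way. A *target_level of 0
 * stops at the first superpage or non-present entry and reports the level
 * that was reached.
 */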
 881static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 882                                      unsigned long pfn, int *target_level)
 883{
 884        struct dma_pte *parent, *pte;
 885        int level = agaw_to_level(domain->agaw);
 886        int offset;
 887
 888        BUG_ON(!domain->pgd);
 889
 890        if (!domain_pfn_supported(domain, pfn))
 891                /* Address beyond IOMMU's addressing capabilities. */
 892                return NULL;
 893
 894        parent = domain->pgd;
 895
 896        while (1) {
 897                void *tmp_page;
 898
 899                offset = pfn_level_offset(pfn, level);
 900                pte = &parent[offset];
 901                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 902                        break;
 903                if (level == *target_level)
 904                        break;
 905
 906                if (!dma_pte_present(pte)) {
 907                        uint64_t pteval;
 908
 909                        tmp_page = alloc_pgtable_page(domain->nid);
 910
 911                        if (!tmp_page)
 912                                return NULL;
 913
 914                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 915                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 916                        if (cmpxchg64(&pte->val, 0ULL, pteval))
 917                                /* Someone else set it while we were thinking; use theirs. */
 918                                free_pgtable_page(tmp_page);
 919                        else
 920                                domain_flush_cache(domain, pte, sizeof(*pte));
 921                }
 922                if (level == 1)
 923                        break;
 924
 925                parent = phys_to_virt(dma_pte_addr(pte));
 926                level--;
 927        }
 928
 929        if (!*target_level)
 930                *target_level = level;
 931
 932        return pte;
 933}
 934
  935/* Return the PTE for a given pfn at the specified level */
 936static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 937                                         unsigned long pfn,
 938                                         int level, int *large_page)
 939{
 940        struct dma_pte *parent, *pte;
 941        int total = agaw_to_level(domain->agaw);
 942        int offset;
 943
 944        parent = domain->pgd;
 945        while (level <= total) {
 946                offset = pfn_level_offset(pfn, total);
 947                pte = &parent[offset];
 948                if (level == total)
 949                        return pte;
 950
 951                if (!dma_pte_present(pte)) {
 952                        *large_page = total;
 953                        break;
 954                }
 955
 956                if (dma_pte_superpage(pte)) {
 957                        *large_page = total;
 958                        return pte;
 959                }
 960
 961                parent = phys_to_virt(dma_pte_addr(pte));
 962                total--;
 963        }
 964        return NULL;
 965}
 966
  967/* Clear last-level (leaf) PTEs; a TLB flush should follow */
 968static void dma_pte_clear_range(struct dmar_domain *domain,
 969                                unsigned long start_pfn,
 970                                unsigned long last_pfn)
 971{
 972        unsigned int large_page;
 973        struct dma_pte *first_pte, *pte;
 974
 975        BUG_ON(!domain_pfn_supported(domain, start_pfn));
 976        BUG_ON(!domain_pfn_supported(domain, last_pfn));
 977        BUG_ON(start_pfn > last_pfn);
 978
 979        /* we don't need lock here; nobody else touches the iova range */
 980        do {
 981                large_page = 1;
 982                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 983                if (!pte) {
 984                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 985                        continue;
 986                }
 987                do {
 988                        dma_clear_pte(pte);
 989                        start_pfn += lvl_to_nr_pages(large_page);
 990                        pte++;
 991                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 992
 993                domain_flush_cache(domain, first_pte,
 994                                   (void *)pte - (void *)first_pte);
 995
 996        } while (start_pfn && start_pfn <= last_pfn);
 997}
 998
 999static void dma_pte_free_level(struct dmar_domain *domain, int level,
1000                               int retain_level, struct dma_pte *pte,
1001                               unsigned long pfn, unsigned long start_pfn,
1002                               unsigned long last_pfn)
1003{
1004        pfn = max(start_pfn, pfn);
1005        pte = &pte[pfn_level_offset(pfn, level)];
1006
1007        do {
1008                unsigned long level_pfn;
1009                struct dma_pte *level_pte;
1010
1011                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1012                        goto next;
1013
1014                level_pfn = pfn & level_mask(level);
1015                level_pte = phys_to_virt(dma_pte_addr(pte));
1016
1017                if (level > 2) {
1018                        dma_pte_free_level(domain, level - 1, retain_level,
1019                                           level_pte, level_pfn, start_pfn,
1020                                           last_pfn);
1021                }
1022
1023                /*
1024                 * Free the page table if we're below the level we want to
1025                 * retain and the range covers the entire table.
1026                 */
1027                if (level < retain_level && !(start_pfn > level_pfn ||
1028                      last_pfn < level_pfn + level_size(level) - 1)) {
1029                        dma_clear_pte(pte);
1030                        domain_flush_cache(domain, pte, sizeof(*pte));
1031                        free_pgtable_page(level_pte);
1032                }
1033next:
1034                pfn += level_size(level);
1035        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1036}
1037
1038/*
1039 * clear last level (leaf) ptes and free page table pages below the
1040 * level we wish to keep intact.
1041 */
1042static void dma_pte_free_pagetable(struct dmar_domain *domain,
1043                                   unsigned long start_pfn,
1044                                   unsigned long last_pfn,
1045                                   int retain_level)
1046{
1047        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1048        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1049        BUG_ON(start_pfn > last_pfn);
1050
1051        dma_pte_clear_range(domain, start_pfn, last_pfn);
1052
1053        /* We don't need lock here; nobody else touches the iova range */
1054        dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1055                           domain->pgd, 0, start_pfn, last_pfn);
1056
1057        /* free pgd */
1058        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1059                free_pgtable_page(domain->pgd);
1060                domain->pgd = NULL;
1061        }
1062}
1063
1064/* When a page at a given level is being unlinked from its parent, we don't
1065   need to *modify* it at all. All we need to do is make a list of all the
1066   pages which can be freed just as soon as we've flushed the IOTLB and we
1067   know the hardware page-walk will no longer touch them.
1068   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1069   be freed. */
1070static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1071                                            int level, struct dma_pte *pte,
1072                                            struct page *freelist)
1073{
1074        struct page *pg;
1075
1076        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1077        pg->freelist = freelist;
1078        freelist = pg;
1079
1080        if (level == 1)
1081                return freelist;
1082
1083        pte = page_address(pg);
1084        do {
1085                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1086                        freelist = dma_pte_list_pagetables(domain, level - 1,
1087                                                           pte, freelist);
1088                pte++;
1089        } while (!first_pte_in_page(pte));
1090
1091        return freelist;
1092}
1093
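/*
 * Clear the PTEs covering [start_pfn, last_pfn] at @level. Page tables that
 * become entirely unused are chained onto @freelist so they can be freed
 * once the IOTLB has been flushed; partially covered tables are recursed
 * into.
 */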
1094static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1095                                        struct dma_pte *pte, unsigned long pfn,
1096                                        unsigned long start_pfn,
1097                                        unsigned long last_pfn,
1098                                        struct page *freelist)
1099{
1100        struct dma_pte *first_pte = NULL, *last_pte = NULL;
1101
1102        pfn = max(start_pfn, pfn);
1103        pte = &pte[pfn_level_offset(pfn, level)];
1104
1105        do {
1106                unsigned long level_pfn;
1107
1108                if (!dma_pte_present(pte))
1109                        goto next;
1110
1111                level_pfn = pfn & level_mask(level);
1112
1113                /* If range covers entire pagetable, free it */
1114                if (start_pfn <= level_pfn &&
1115                    last_pfn >= level_pfn + level_size(level) - 1) {
 1116                        /* These subordinate page tables are going away entirely. Don't
1117                           bother to clear them; we're just going to *free* them. */
1118                        if (level > 1 && !dma_pte_superpage(pte))
1119                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1120
1121                        dma_clear_pte(pte);
1122                        if (!first_pte)
1123                                first_pte = pte;
1124                        last_pte = pte;
1125                } else if (level > 1) {
1126                        /* Recurse down into a level that isn't *entirely* obsolete */
1127                        freelist = dma_pte_clear_level(domain, level - 1,
1128                                                       phys_to_virt(dma_pte_addr(pte)),
1129                                                       level_pfn, start_pfn, last_pfn,
1130                                                       freelist);
1131                }
1132next:
1133                pfn += level_size(level);
1134        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1135
1136        if (first_pte)
1137                domain_flush_cache(domain, first_pte,
1138                                   (void *)++last_pte - (void *)first_pte);
1139
1140        return freelist;
1141}
1142
1143/* We can't just free the pages because the IOMMU may still be walking
1144   the page tables, and may have cached the intermediate levels. The
1145   pages can only be freed after the IOTLB flush has been done. */
1146static struct page *domain_unmap(struct dmar_domain *domain,
1147                                 unsigned long start_pfn,
1148                                 unsigned long last_pfn)
1149{
1150        struct page *freelist;
1151
1152        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1153        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1154        BUG_ON(start_pfn > last_pfn);
1155
1156        /* we don't need lock here; nobody else touches the iova range */
1157        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1158                                       domain->pgd, 0, start_pfn, last_pfn, NULL);
1159
1160        /* free pgd */
1161        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1162                struct page *pgd_page = virt_to_page(domain->pgd);
1163                pgd_page->freelist = freelist;
1164                freelist = pgd_page;
1165
1166                domain->pgd = NULL;
1167        }
1168
1169        return freelist;
1170}
1171
1172static void dma_free_pagelist(struct page *freelist)
1173{
1174        struct page *pg;
1175
1176        while ((pg = freelist)) {
1177                freelist = pg->freelist;
1178                free_pgtable_page(page_address(pg));
1179        }
1180}
1181
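/* Destructor for deferred freelists queued behind an IOTLB flush */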
1182static void iova_entry_free(unsigned long data)
1183{
1184        struct page *freelist = (struct page *)data;
1185
1186        dma_free_pagelist(freelist);
1187}
1188
1189/* iommu handling */
1190static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1191{
1192        struct root_entry *root;
1193        unsigned long flags;
1194
1195        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1196        if (!root) {
1197                pr_err("Allocating root entry for %s failed\n",
1198                        iommu->name);
1199                return -ENOMEM;
1200        }
1201
1202        __iommu_flush_cache(iommu, root, ROOT_SIZE);
1203
1204        spin_lock_irqsave(&iommu->lock, flags);
1205        iommu->root_entry = root;
1206        spin_unlock_irqrestore(&iommu->lock, flags);
1207
1208        return 0;
1209}
1210
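/*
 * Program the root (or scalable-mode root) table address and issue a
 * Set Root Table Pointer operation, waiting for the hardware to acknowledge
 * it.
 */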
1211static void iommu_set_root_entry(struct intel_iommu *iommu)
1212{
1213        u64 addr;
1214        u32 sts;
1215        unsigned long flag;
1216
1217        addr = virt_to_phys(iommu->root_entry);
1218        if (sm_supported(iommu))
1219                addr |= DMA_RTADDR_SMT;
1220
1221        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1223
1224        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1225
 1226        /* Make sure hardware completes it */
1227        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1228                      readl, (sts & DMA_GSTS_RTPS), sts);
1229
1230        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1231}
1232
1233void iommu_flush_write_buffer(struct intel_iommu *iommu)
1234{
1235        u32 val;
1236        unsigned long flag;
1237
1238        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1239                return;
1240
1241        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1242        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1243
 1244        /* Make sure hardware completes it */
1245        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1246                      readl, (!(val & DMA_GSTS_WBFS)), val);
1247
1248        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1249}
1250
 1251/* return value determines if we need a write buffer flush */
1252static void __iommu_flush_context(struct intel_iommu *iommu,
1253                                  u16 did, u16 source_id, u8 function_mask,
1254                                  u64 type)
1255{
1256        u64 val = 0;
1257        unsigned long flag;
1258
1259        switch (type) {
1260        case DMA_CCMD_GLOBAL_INVL:
1261                val = DMA_CCMD_GLOBAL_INVL;
1262                break;
1263        case DMA_CCMD_DOMAIN_INVL:
1264                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1265                break;
1266        case DMA_CCMD_DEVICE_INVL:
1267                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1268                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1269                break;
1270        default:
1271                BUG();
1272        }
1273        val |= DMA_CCMD_ICC;
1274
1275        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1276        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1277
 1278        /* Make sure hardware completes it */
1279        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1280                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1281
1282        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1283}
1284
 1285/* return value determines if we need a write buffer flush */
1286static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1287                                u64 addr, unsigned int size_order, u64 type)
1288{
1289        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1290        u64 val = 0, val_iva = 0;
1291        unsigned long flag;
1292
1293        switch (type) {
1294        case DMA_TLB_GLOBAL_FLUSH:
 1295                /* global flush doesn't need to set IVA_REG */
1296                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1297                break;
1298        case DMA_TLB_DSI_FLUSH:
1299                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1300                break;
1301        case DMA_TLB_PSI_FLUSH:
1302                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303                /* IH bit is passed in as part of address */
1304                val_iva = size_order | addr;
1305                break;
1306        default:
1307                BUG();
1308        }
1309        /* Note: set drain read/write */
1310#if 0
1311        /*
 1312         * This is probably just to be extra safe. It looks like we can
 1313         * ignore it without any impact.
1314         */
1315        if (cap_read_drain(iommu->cap))
1316                val |= DMA_TLB_READ_DRAIN;
1317#endif
1318        if (cap_write_drain(iommu->cap))
1319                val |= DMA_TLB_WRITE_DRAIN;
1320
1321        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1322        /* Note: Only uses first TLB reg currently */
1323        if (val_iva)
1324                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1325        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1326
 1327        /* Make sure hardware completes it */
1328        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1329                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1330
1331        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1332
1333        /* check IOTLB invalidation granularity */
1334        if (DMA_TLB_IAIG(val) == 0)
1335                pr_err("Flush IOTLB failed\n");
1336        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1337                pr_debug("TLB flush request %Lx, actual %Lx\n",
1338                        (unsigned long long)DMA_TLB_IIRG(type),
1339                        (unsigned long long)DMA_TLB_IAIG(val));
1340}
1341
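/*
 * Find the device_domain_info for (@bus, @devfn) in @domain if the device
 * supports ATS and queued invalidation is available, i.e. if device IOTLB
 * invalidation can be used for it.
 */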
1342static struct device_domain_info *
 1343iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1344                         u8 bus, u8 devfn)
1345{
1346        struct device_domain_info *info;
1347
1348        assert_spin_locked(&device_domain_lock);
1349
1350        if (!iommu->qi)
1351                return NULL;
1352
1353        list_for_each_entry(info, &domain->devices, link)
1354                if (info->iommu == iommu && info->bus == bus &&
1355                    info->devfn == devfn) {
1356                        if (info->ats_supported && info->dev)
1357                                return info;
1358                        break;
1359                }
1360
1361        return NULL;
1362}
1363
1364static void domain_update_iotlb(struct dmar_domain *domain)
1365{
1366        struct device_domain_info *info;
1367        bool has_iotlb_device = false;
1368
1369        assert_spin_locked(&device_domain_lock);
1370
1371        list_for_each_entry(info, &domain->devices, link) {
1372                struct pci_dev *pdev;
1373
1374                if (!info->dev || !dev_is_pci(info->dev))
1375                        continue;
1376
1377                pdev = to_pci_dev(info->dev);
1378                if (pdev->ats_enabled) {
1379                        has_iotlb_device = true;
1380                        break;
1381                }
1382        }
1383
1384        domain->has_iotlb_device = has_iotlb_device;
1385}
1386
1387static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1388{
1389        struct pci_dev *pdev;
1390
1391        assert_spin_locked(&device_domain_lock);
1392
1393        if (!info || !dev_is_pci(info->dev))
1394                return;
1395
1396        pdev = to_pci_dev(info->dev);
 1397        /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1398         * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1399         * queue depth at PF level. If DIT is not set, PFSID will be treated as
1400         * reserved, which should be set to 0.
1401         */
1402        if (!ecap_dit(info->iommu->ecap))
1403                info->pfsid = 0;
1404        else {
1405                struct pci_dev *pf_pdev;
1406
1407                /* pdev will be returned if device is not a vf */
1408                pf_pdev = pci_physfn(pdev);
1409                info->pfsid = pci_dev_id(pf_pdev);
1410        }
1411
1412#ifdef CONFIG_INTEL_IOMMU_SVM
1413        /* The PCIe spec, in its wisdom, declares that the behaviour of
1414           the device if you enable PASID support after ATS support is
1415           undefined. So always enable PASID support on devices which
1416           have it, even if we can't yet know if we're ever going to
1417           use it. */
1418        if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1419                info->pasid_enabled = 1;
1420
1421        if (info->pri_supported &&
1422            (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1423            !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1424                info->pri_enabled = 1;
1425#endif
1426        if (!pdev->untrusted && info->ats_supported &&
1427            pci_ats_page_aligned(pdev) &&
1428            !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1429                info->ats_enabled = 1;
1430                domain_update_iotlb(info->domain);
1431                info->ats_qdep = pci_ats_queue_depth(pdev);
1432        }
1433}
1434
1435static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1436{
1437        struct pci_dev *pdev;
1438
1439        assert_spin_locked(&device_domain_lock);
1440
1441        if (!dev_is_pci(info->dev))
1442                return;
1443
1444        pdev = to_pci_dev(info->dev);
1445
1446        if (info->ats_enabled) {
1447                pci_disable_ats(pdev);
1448                info->ats_enabled = 0;
1449                domain_update_iotlb(info->domain);
1450        }
1451#ifdef CONFIG_INTEL_IOMMU_SVM
1452        if (info->pri_enabled) {
1453                pci_disable_pri(pdev);
1454                info->pri_enabled = 0;
1455        }
1456        if (info->pasid_enabled) {
1457                pci_disable_pasid(pdev);
1458                info->pasid_enabled = 0;
1459        }
1460#endif
1461}
1462
1463static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1464                                  u64 addr, unsigned mask)
1465{
1466        u16 sid, qdep;
1467        unsigned long flags;
1468        struct device_domain_info *info;
1469
1470        if (!domain->has_iotlb_device)
1471                return;
1472
1473        spin_lock_irqsave(&device_domain_lock, flags);
1474        list_for_each_entry(info, &domain->devices, link) {
1475                if (!info->ats_enabled)
1476                        continue;
1477
1478                sid = info->bus << 8 | info->devfn;
1479                qdep = info->ats_qdep;
1480                qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481                                qdep, addr, mask);
1482        }
1483        spin_unlock_irqrestore(&device_domain_lock, flags);
1484}
1485
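/*
 * Flush the IOTLB for @pages pages starting at @pfn in @domain. @ih is
 * folded into the address as the invalidation hint bit; @map indicates a
 * non-present to present change, for which the device IOTLB does not need
 * to be flushed.
 */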
1486static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1487                                  struct dmar_domain *domain,
1488                                  unsigned long pfn, unsigned int pages,
1489                                  int ih, int map)
1490{
1491        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1492        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1493        u16 did = domain->iommu_did[iommu->seq_id];
1494
1495        BUG_ON(pages == 0);
1496
1497        if (ih)
1498                ih = 1 << 6;
1499        /*
 1500         * Fall back to domain-selective flush if there is no PSI support or the
 1501         * size is too big.
 1502         * PSI requires the page size to be 2 ^ x, and the base address to be
 1503         * naturally aligned to the size.
1504         */
1505        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1506                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1507                                                DMA_TLB_DSI_FLUSH);
1508        else
1509                iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1510                                                DMA_TLB_PSI_FLUSH);
1511
1512        /*
 1513         * In caching mode, changes of pages from non-present to present require
 1514         * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1515         */
1516        if (!cap_caching_mode(iommu->cap) || !map)
1517                iommu_flush_dev_iotlb(domain, addr, mask);
1518}
1519
1520/* Notification for newly created mappings */
1521static inline void __mapping_notify_one(struct intel_iommu *iommu,
1522                                        struct dmar_domain *domain,
1523                                        unsigned long pfn, unsigned int pages)
1524{
 1525        /* It's a non-present to present mapping. Only flush in caching mode */
1526        if (cap_caching_mode(iommu->cap))
1527                iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1528        else
1529                iommu_flush_write_buffer(iommu);
1530}
1531
1532static void iommu_flush_iova(struct iova_domain *iovad)
1533{
1534        struct dmar_domain *domain;
1535        int idx;
1536
1537        domain = container_of(iovad, struct dmar_domain, iovad);
1538
1539        for_each_domain_iommu(idx, domain) {
1540                struct intel_iommu *iommu = g_iommus[idx];
1541                u16 did = domain->iommu_did[iommu->seq_id];
1542
1543                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1544
1545                if (!cap_caching_mode(iommu->cap))
1546                        iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1547                                              0, MAX_AGAW_PFN_WIDTH);
1548        }
1549}
1550
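/*
 * Disable the protected low/high memory regions by clearing the EPM bit and
 * waiting for the protected region status bit to clear.
 */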
1551static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1552{
1553        u32 pmen;
1554        unsigned long flags;
1555
1556        if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1557                return;
1558
1559        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1560        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1561        pmen &= ~DMA_PMEN_EPM;
1562        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1563
1564        /* wait for the protected region status bit to clear */
1565        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1566                readl, !(pmen & DMA_PMEN_PRS), pmen);
1567
1568        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1569}
1570
1571static void iommu_enable_translation(struct intel_iommu *iommu)
1572{
1573        u32 sts;
1574        unsigned long flags;
1575
1576        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1577        iommu->gcmd |= DMA_GCMD_TE;
1578        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1579
1580        /* Make sure hardware completes it */
1581        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1582                      readl, (sts & DMA_GSTS_TES), sts);
1583
1584        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1585}
1586
1587static void iommu_disable_translation(struct intel_iommu *iommu)
1588{
1589        u32 sts;
1590        unsigned long flag;
1591
1592        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1593        iommu->gcmd &= ~DMA_GCMD_TE;
1594        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1595
1596        /* Make sure hardware completes it */
1597        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1598                      readl, (!(sts & DMA_GSTS_TES)), sts);
1599
1600        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1601}
1602
1603static int iommu_init_domains(struct intel_iommu *iommu)
1604{
1605        u32 ndomains, nlongs;
1606        size_t size;
1607
1608        ndomains = cap_ndoms(iommu->cap);
1609        pr_debug("%s: Number of Domains supported <%d>\n",
1610                 iommu->name, ndomains);
1611        nlongs = BITS_TO_LONGS(ndomains);
1612
1613        spin_lock_init(&iommu->lock);
1614
1615        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1616        if (!iommu->domain_ids) {
1617                pr_err("%s: Allocating domain id array failed\n",
1618                       iommu->name);
1619                return -ENOMEM;
1620        }
1621
1622        size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1623        iommu->domains = kzalloc(size, GFP_KERNEL);
1624
1625        if (iommu->domains) {
1626                size = 256 * sizeof(struct dmar_domain *);
1627                iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1628        }
1629
1630        if (!iommu->domains || !iommu->domains[0]) {
1631                pr_err("%s: Allocating domain array failed\n",
1632                       iommu->name);
1633                kfree(iommu->domain_ids);
1634                kfree(iommu->domains);
1635                iommu->domain_ids = NULL;
1636                iommu->domains    = NULL;
1637                return -ENOMEM;
1638        }
1639
1640        /*
1641         * If Caching mode is set, then invalid translations are tagged
1642         * with domain-id 0, hence we need to pre-allocate it. We also
1643         * use domain-id 0 as a marker for non-allocated domain-id, so
1644         * make sure it is not used for a real domain.
1645         */
1646        set_bit(0, iommu->domain_ids);
1647
1648        /*
1649         * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1650         * entry for first-level or pass-through translation modes be
1651         * programmed with a domain id different from those used for
1652         * second-level or nested translation. We reserve a domain id for
1653         * this purpose.
1654         */
1655        if (sm_supported(iommu))
1656                set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1657
1658        return 0;
1659}
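
/*
 * Sizing example for the allocations above (illustrative, assuming a 64-bit
 * kernel): with cap_ndoms() = 65536, the domain-id bitmap needs
 * BITS_TO_LONGS(65536) = 1024 longs (8KiB), and the two-level domain pointer
 * array gets ALIGN(65536, 256) >> 8 = 256 top-level slots, each eventually
 * pointing at an array of 256 struct dmar_domain pointers.  Only slot 0 is
 * allocated here; the remaining chunks are allocated on demand as higher
 * domain ids come into use.
 */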
1660
1661static void disable_dmar_iommu(struct intel_iommu *iommu)
1662{
1663        struct device_domain_info *info, *tmp;
1664        unsigned long flags;
1665
1666        if (!iommu->domains || !iommu->domain_ids)
1667                return;
1668
1669        spin_lock_irqsave(&device_domain_lock, flags);
1670        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1671                if (info->iommu != iommu)
1672                        continue;
1673
1674                if (!info->dev || !info->domain)
1675                        continue;
1676
1677                __dmar_remove_one_dev_info(info);
1678        }
1679        spin_unlock_irqrestore(&device_domain_lock, flags);
1680
1681        if (iommu->gcmd & DMA_GCMD_TE)
1682                iommu_disable_translation(iommu);
1683}
1684
1685static void free_dmar_iommu(struct intel_iommu *iommu)
1686{
1687        if ((iommu->domains) && (iommu->domain_ids)) {
1688                int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1689                int i;
1690
1691                for (i = 0; i < elems; i++)
1692                        kfree(iommu->domains[i]);
1693                kfree(iommu->domains);
1694                kfree(iommu->domain_ids);
1695                iommu->domains = NULL;
1696                iommu->domain_ids = NULL;
1697        }
1698
1699        g_iommus[iommu->seq_id] = NULL;
1700
1701        /* free context mapping */
1702        free_context_table(iommu);
1703
1704#ifdef CONFIG_INTEL_IOMMU_SVM
1705        if (pasid_supported(iommu)) {
1706                if (ecap_prs(iommu->ecap))
1707                        intel_svm_finish_prq(iommu);
1708        }
1709#endif
1710}
1711
1712static struct dmar_domain *alloc_domain(int flags)
1713{
1714        struct dmar_domain *domain;
1715
1716        domain = alloc_domain_mem();
1717        if (!domain)
1718                return NULL;
1719
1720        memset(domain, 0, sizeof(*domain));
1721        domain->nid = NUMA_NO_NODE;
1722        domain->flags = flags;
1723        domain->has_iotlb_device = false;
1724        INIT_LIST_HEAD(&domain->devices);
1725
1726        return domain;
1727}
1728
1729/* Must be called with iommu->lock */
1730static int domain_attach_iommu(struct dmar_domain *domain,
1731                               struct intel_iommu *iommu)
1732{
1733        unsigned long ndomains;
1734        int num;
1735
1736        assert_spin_locked(&device_domain_lock);
1737        assert_spin_locked(&iommu->lock);
1738
1739        domain->iommu_refcnt[iommu->seq_id] += 1;
1740        domain->iommu_count += 1;
1741        if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1742                ndomains = cap_ndoms(iommu->cap);
1743                num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1744
1745                if (num >= ndomains) {
1746                        pr_err("%s: No free domain ids\n", iommu->name);
1747                        domain->iommu_refcnt[iommu->seq_id] -= 1;
1748                        domain->iommu_count -= 1;
1749                        return -ENOSPC;
1750                }
1751
1752                set_bit(num, iommu->domain_ids);
1753                set_iommu_domain(iommu, num, domain);
1754
1755                domain->iommu_did[iommu->seq_id] = num;
1756                domain->nid                      = iommu->node;
1757
1758                domain_update_iommu_cap(domain);
1759        }
1760
1761        return 0;
1762}
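
/*
 * Example of the domain-id allocation above (illustrative): the first time a
 * domain is attached to a given IOMMU, find_first_zero_bit() hands out the
 * lowest domain id whose bit is still clear -- id 0 is always reserved by
 * iommu_init_domains(), and FLPT_DEFAULT_DID is reserved as well on
 * scalable-mode hardware.  Later attaches of the same domain to the same
 * IOMMU only bump iommu_refcnt[] and reuse the id already recorded in
 * domain->iommu_did[iommu->seq_id].
 */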
1763
1764static int domain_detach_iommu(struct dmar_domain *domain,
1765                               struct intel_iommu *iommu)
1766{
1767        int num, count;
1768
1769        assert_spin_locked(&device_domain_lock);
1770        assert_spin_locked(&iommu->lock);
1771
1772        domain->iommu_refcnt[iommu->seq_id] -= 1;
1773        count = --domain->iommu_count;
1774        if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1775                num = domain->iommu_did[iommu->seq_id];
1776                clear_bit(num, iommu->domain_ids);
1777                set_iommu_domain(iommu, num, NULL);
1778
1779                domain_update_iommu_cap(domain);
1780                domain->iommu_did[iommu->seq_id] = 0;
1781        }
1782
1783        return count;
1784}
1785
1786static struct iova_domain reserved_iova_list;
1787static struct lock_class_key reserved_rbtree_key;
1788
1789static int dmar_init_reserved_ranges(void)
1790{
1791        struct pci_dev *pdev = NULL;
1792        struct iova *iova;
1793        int i;
1794
1795        init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1796
1797        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1798                &reserved_rbtree_key);
1799
1800        /* IOAPIC ranges shouldn't be accessed by DMA */
1801        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1802                IOVA_PFN(IOAPIC_RANGE_END));
1803        if (!iova) {
1804                pr_err("Reserve IOAPIC range failed\n");
1805                return -ENODEV;
1806        }
1807
1808        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1809        for_each_pci_dev(pdev) {
1810                struct resource *r;
1811
1812                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1813                        r = &pdev->resource[i];
1814                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1815                                continue;
1816                        iova = reserve_iova(&reserved_iova_list,
1817                                            IOVA_PFN(r->start),
1818                                            IOVA_PFN(r->end));
1819                        if (!iova) {
1820                                pci_err(pdev, "Reserve iova for %pR failed\n", r);
1821                                return -ENODEV;
1822                        }
1823                }
1824        }
1825        return 0;
1826}
1827
1828static void domain_reserve_special_ranges(struct dmar_domain *domain)
1829{
1830        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1831}
1832
1833static inline int guestwidth_to_adjustwidth(int gaw)
1834{
1835        int agaw;
1836        int r = (gaw - 12) % 9;
1837
1838        if (r == 0)
1839                agaw = gaw;
1840        else
1841                agaw = gaw + 9 - r;
1842        if (agaw > 64)
1843                agaw = 64;
1844        return agaw;
1845}
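
/*
 * Worked examples for the width adjustment above: the adjusted width must be
 * 12 + 9 * n, i.e. a whole number of 9-bit page-table levels on top of the
 * 12-bit page offset.  gaw = 39 or 48 is already aligned and comes back
 * unchanged; gaw = 40 gives r = (40 - 12) % 9 = 1 and is rounded up to
 * 40 + 9 - 1 = 48; anything that would exceed 64 is clamped to 64.
 */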
1846
1847static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1848                       int guest_width)
1849{
1850        int adjust_width, agaw;
1851        unsigned long sagaw;
1852        int err;
1853
1854        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1855
1856        err = init_iova_flush_queue(&domain->iovad,
1857                                    iommu_flush_iova, iova_entry_free);
1858        if (err)
1859                return err;
1860
1861        domain_reserve_special_ranges(domain);
1862
1863        /* calculate AGAW */
1864        if (guest_width > cap_mgaw(iommu->cap))
1865                guest_width = cap_mgaw(iommu->cap);
1866        domain->gaw = guest_width;
1867        adjust_width = guestwidth_to_adjustwidth(guest_width);
1868        agaw = width_to_agaw(adjust_width);
1869        sagaw = cap_sagaw(iommu->cap);
1870        if (!test_bit(agaw, &sagaw)) {
1871                /* hardware doesn't support it, choose a bigger one */
1872                pr_debug("Hardware doesn't support agaw %d\n", agaw);
1873                agaw = find_next_bit(&sagaw, 5, agaw);
1874                if (agaw >= 5)
1875                        return -ENODEV;
1876        }
1877        domain->agaw = agaw;
1878
1879        if (ecap_coherent(iommu->ecap))
1880                domain->iommu_coherency = 1;
1881        else
1882                domain->iommu_coherency = 0;
1883
1884        if (ecap_sc_support(iommu->ecap))
1885                domain->iommu_snooping = 1;
1886        else
1887                domain->iommu_snooping = 0;
1888
1889        if (intel_iommu_superpage)
1890                domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1891        else
1892                domain->iommu_superpage = 0;
1893
1894        domain->nid = iommu->node;
1895
1896        /* always allocate the top pgd */
1897        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1898        if (!domain->pgd)
1899                return -ENOMEM;
1900        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1901        return 0;
1902}
1903
1904static void domain_exit(struct dmar_domain *domain)
1905{
1906
1907        /* Remove associated devices and clear attached or cached domains */
1908        domain_remove_dev_info(domain);
1909
1910        /* destroy iovas */
1911        put_iova_domain(&domain->iovad);
1912
1913        if (domain->pgd) {
1914                struct page *freelist;
1915
1916                freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1917                dma_free_pagelist(freelist);
1918        }
1919
1920        free_domain_mem(domain);
1921}
1922
1923/*
1924 * Get the PASID directory size for scalable mode context entry.
1925 * Value of X in the PDTS field of a scalable mode context entry
1926 * indicates PASID directory with 2^(X + 7) entries.
1927 */
1928static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1929{
1930        int pds, max_pde;
1931
1932        max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1933        pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1934        if (pds < 7)
1935                return 0;
1936
1937        return pds - 7;
1938}
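
/*
 * Worked example for the PDTS computation above (illustrative, assuming
 * PASID_PDE_SHIFT is 6, i.e. 64 PASID-table entries per directory entry):
 * with table->max_pasid = 1 << 20, max_pde = 1 << 14, the first set bit is
 * bit 14 and pds = 14 - 7 = 7, which encodes a directory of 2^(7 + 7) = 2^14
 * entries -- enough to cover all 2^20 PASIDs.  Anything smaller than 2^7
 * directory entries falls back to the minimum encoding, pds = 0.
 */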
1939
1940/*
1941 * Set the RID_PASID field of a scalable mode context entry. The
1942 * IOMMU hardware will use the PASID value set in this field for
1943 * DMA translations of DMA requests without PASID.
1944 */
1945static inline void
1946context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1947{
1948        context->hi |= pasid & ((1 << 20) - 1);
1949        context->hi |= (1 << 20);
1950}
1951
1952/*
1953 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1954 * entry.
1955 */
1956static inline void context_set_sm_dte(struct context_entry *context)
1957{
1958        context->lo |= (1 << 2);
1959}
1960
1961/*
1962 * Set the PRE(Page Request Enable) field of a scalable mode context
1963 * entry.
1964 */
1965static inline void context_set_sm_pre(struct context_entry *context)
1966{
1967        context->lo |= (1 << 4);
1968}
1969
1970/* Convert value to context PASID directory size field coding. */
1971#define context_pdts(pds)       (((pds) & 0x7) << 9)
1972
1973static int domain_context_mapping_one(struct dmar_domain *domain,
1974                                      struct intel_iommu *iommu,
1975                                      struct pasid_table *table,
1976                                      u8 bus, u8 devfn)
1977{
1978        u16 did = domain->iommu_did[iommu->seq_id];
1979        int translation = CONTEXT_TT_MULTI_LEVEL;
1980        struct device_domain_info *info = NULL;
1981        struct context_entry *context;
1982        unsigned long flags;
1983        int ret;
1984
1985        WARN_ON(did == 0);
1986
1987        if (hw_pass_through && domain_type_is_si(domain))
1988                translation = CONTEXT_TT_PASS_THROUGH;
1989
1990        pr_debug("Set context mapping for %02x:%02x.%d\n",
1991                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1992
1993        BUG_ON(!domain->pgd);
1994
1995        spin_lock_irqsave(&device_domain_lock, flags);
1996        spin_lock(&iommu->lock);
1997
1998        ret = -ENOMEM;
1999        context = iommu_context_addr(iommu, bus, devfn, 1);
2000        if (!context)
2001                goto out_unlock;
2002
2003        ret = 0;
2004        if (context_present(context))
2005                goto out_unlock;
2006
2007        /*
2008         * For kdump cases, old valid entries may be cached due to
2009         * in-flight DMA and the copied pgtable, but there is no unmapping
2010         * behaviour for them, so we need an explicit cache flush for the
2011         * newly-mapped device. For kdump, at this point the device is
2012         * supposed to have finished its reset at driver probe time, so no
2013         * in-flight DMA will exist and we don't need to worry about it
2014         * hereafter.
2015         */
2016        if (context_copied(context)) {
2017                u16 did_old = context_domain_id(context);
2018
2019                if (did_old < cap_ndoms(iommu->cap)) {
2020                        iommu->flush.flush_context(iommu, did_old,
2021                                                   (((u16)bus) << 8) | devfn,
2022                                                   DMA_CCMD_MASK_NOBIT,
2023                                                   DMA_CCMD_DEVICE_INVL);
2024                        iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2025                                                 DMA_TLB_DSI_FLUSH);
2026                }
2027        }
2028
2029        context_clear_entry(context);
2030
2031        if (sm_supported(iommu)) {
2032                unsigned long pds;
2033
2034                WARN_ON(!table);
2035
2036                /* Setup the PASID DIR pointer: */
2037                pds = context_get_sm_pds(table);
2038                context->lo = (u64)virt_to_phys(table->table) |
2039                                context_pdts(pds);
2040
2041                /* Setup the RID_PASID field: */
2042                context_set_sm_rid2pasid(context, PASID_RID2PASID);
2043
2044                /*
2045                 * Setup the Device-TLB enable bit and Page request
2046                 * Enable bit:
2047                 */
2048                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2049                if (info && info->ats_supported)
2050                        context_set_sm_dte(context);
2051                if (info && info->pri_supported)
2052                        context_set_sm_pre(context);
2053        } else {
2054                struct dma_pte *pgd = domain->pgd;
2055                int agaw;
2056
2057                context_set_domain_id(context, did);
2058
2059                if (translation != CONTEXT_TT_PASS_THROUGH) {
2060                        /*
2061                         * Skip top levels of page tables for iommu which has
2062                         * less agaw than default. Unnecessary for PT mode.
2063                         */
2064                        for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2065                                ret = -ENOMEM;
2066                                pgd = phys_to_virt(dma_pte_addr(pgd));
2067                                if (!dma_pte_present(pgd))
2068                                        goto out_unlock;
2069                        }
2070
2071                        info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2072                        if (info && info->ats_supported)
2073                                translation = CONTEXT_TT_DEV_IOTLB;
2074                        else
2075                                translation = CONTEXT_TT_MULTI_LEVEL;
2076
2077                        context_set_address_root(context, virt_to_phys(pgd));
2078                        context_set_address_width(context, agaw);
2079                } else {
2080                        /*
2081                         * In pass through mode, AW must be programmed to
2082                         * indicate the largest AGAW value supported by
2083                         * hardware. And ASR is ignored by hardware.
2084                         */
2085                        context_set_address_width(context, iommu->msagaw);
2086                }
2087
2088                context_set_translation_type(context, translation);
2089        }
2090
2091        context_set_fault_enable(context);
2092        context_set_present(context);
2093        domain_flush_cache(domain, context, sizeof(*context));
2094
2095        /*
2096         * It's a non-present to present mapping. If the hardware doesn't cache
2097         * non-present entries we only need to flush the write buffer. If it
2098         * _does_ cache non-present entries, then it does so in the special
2099         * domain #0, which we have to flush:
2100         */
2101        if (cap_caching_mode(iommu->cap)) {
2102                iommu->flush.flush_context(iommu, 0,
2103                                           (((u16)bus) << 8) | devfn,
2104                                           DMA_CCMD_MASK_NOBIT,
2105                                           DMA_CCMD_DEVICE_INVL);
2106                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2107        } else {
2108                iommu_flush_write_buffer(iommu);
2109        }
2110        iommu_enable_dev_iotlb(info);
2111
2112        ret = 0;
2113
2114out_unlock:
2115        spin_unlock(&iommu->lock);
2116        spin_unlock_irqrestore(&device_domain_lock, flags);
2117
2118        return ret;
2119}
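
/*
 * Example of the AGAW adjustment in the legacy (non-scalable) path above
 * (illustrative): if the domain was built with a 48-bit, 4-level page table
 * but this IOMMU only supports a 39-bit, 3-level walk, the loop descends one
 * level through entry 0 of the top table and programs the context entry with
 * that 3-level table and the smaller address width, so in effect only the low
 * 512GiB of the domain's address space is reachable through this IOMMU.  In
 * pass-through mode no second-level table is walked at all, so ASR is ignored
 * and AW is simply set to the largest value the hardware supports
 * (iommu->msagaw).
 */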
2120
2121struct domain_context_mapping_data {
2122        struct dmar_domain *domain;
2123        struct intel_iommu *iommu;
2124        struct pasid_table *table;
2125};
2126
2127static int domain_context_mapping_cb(struct pci_dev *pdev,
2128                                     u16 alias, void *opaque)
2129{
2130        struct domain_context_mapping_data *data = opaque;
2131
2132        return domain_context_mapping_one(data->domain, data->iommu,
2133                                          data->table, PCI_BUS_NUM(alias),
2134                                          alias & 0xff);
2135}
2136
2137static int
2138domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2139{
2140        struct domain_context_mapping_data data;
2141        struct pasid_table *table;
2142        struct intel_iommu *iommu;
2143        u8 bus, devfn;
2144
2145        iommu = device_to_iommu(dev, &bus, &devfn);
2146        if (!iommu)
2147                return -ENODEV;
2148
2149        table = intel_pasid_get_table(dev);
2150
2151        if (!dev_is_pci(dev))
2152                return domain_context_mapping_one(domain, iommu, table,
2153                                                  bus, devfn);
2154
2155        data.domain = domain;
2156        data.iommu = iommu;
2157        data.table = table;
2158
2159        return pci_for_each_dma_alias(to_pci_dev(dev),
2160                                      &domain_context_mapping_cb, &data);
2161}
2162
2163static int domain_context_mapped_cb(struct pci_dev *pdev,
2164                                    u16 alias, void *opaque)
2165{
2166        struct intel_iommu *iommu = opaque;
2167
2168        return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2169}
2170
2171static int domain_context_mapped(struct device *dev)
2172{
2173        struct intel_iommu *iommu;
2174        u8 bus, devfn;
2175
2176        iommu = device_to_iommu(dev, &bus, &devfn);
2177        if (!iommu)
2178                return -ENODEV;
2179
2180        if (!dev_is_pci(dev))
2181                return device_context_mapped(iommu, bus, devfn);
2182
2183        return !pci_for_each_dma_alias(to_pci_dev(dev),
2184                                       domain_context_mapped_cb, iommu);
2185}
2186
2187/* Returns a number of VTD pages, but aligned to MM page size */
2188static inline unsigned long aligned_nrpages(unsigned long host_addr,
2189                                            size_t size)
2190{
2191        host_addr &= ~PAGE_MASK;
2192        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2193}
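
/*
 * Worked example (assuming 4KiB MM and VTD pages): host_addr = 0x1234 and
 * size = 0x2000 leave an in-page offset of 0x234, so
 * PAGE_ALIGN(0x234 + 0x2000) = 0x3000 and the mapping needs 0x3000 >> 12 = 3
 * VTD pages even though the size itself is only two pages long.
 */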
2194
2195/* Return largest possible superpage level for a given mapping */
2196static inline int hardware_largepage_caps(struct dmar_domain *domain,
2197                                          unsigned long iov_pfn,
2198                                          unsigned long phy_pfn,
2199                                          unsigned long pages)
2200{
2201        int support, level = 1;
2202        unsigned long pfnmerge;
2203
2204        support = domain->iommu_superpage;
2205
2206        /* To use a large page, the virtual *and* physical addresses
2207           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2208           of them will mean we have to use smaller pages. So just
2209           merge them and check both at once. */
2210        pfnmerge = iov_pfn | phy_pfn;
2211
2212        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2213                pages >>= VTD_STRIDE_SHIFT;
2214                if (!pages)
2215                        break;
2216                pfnmerge >>= VTD_STRIDE_SHIFT;
2217                level++;
2218                support--;
2219        }
2220        return level;
2221}
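
/*
 * Worked example for the superpage-level check above (illustrative): with
 * domain->iommu_superpage = 1 (2MiB pages supported), iov_pfn = 0x200,
 * phy_pfn = 0x400 and pages = 1024, pfnmerge = 0x600 has its low 9 bits
 * clear, so one loop iteration is taken and level 2 (2MiB) is returned.  If
 * either pfn were misaligned (say iov_pfn = 0x201), the low bits of pfnmerge
 * would be non-zero and the mapping would stay at level 1 (4KiB pages).
 */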
2222
2223static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2224                            struct scatterlist *sg, unsigned long phys_pfn,
2225                            unsigned long nr_pages, int prot)
2226{
2227        struct dma_pte *first_pte = NULL, *pte = NULL;
2228        phys_addr_t uninitialized_var(pteval);
2229        unsigned long sg_res = 0;
2230        unsigned int largepage_lvl = 0;
2231        unsigned long lvl_pages = 0;
2232
2233        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2234
2235        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2236                return -EINVAL;
2237
2238        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2239
2240        if (!sg) {
2241                sg_res = nr_pages;
2242                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2243        }
2244
2245        while (nr_pages > 0) {
2246                uint64_t tmp;
2247
2248                if (!sg_res) {
2249                        unsigned int pgoff = sg->offset & ~PAGE_MASK;
2250
2251                        sg_res = aligned_nrpages(sg->offset, sg->length);
2252                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2253                        sg->dma_length = sg->length;
2254                        pteval = (sg_phys(sg) - pgoff) | prot;
2255                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
2256                }
2257
2258                if (!pte) {
2259                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2260
2261                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2262                        if (!pte)
2263                                return -ENOMEM;
2264                        /* It is a large page */
2265                        if (largepage_lvl > 1) {
2266                                unsigned long nr_superpages, end_pfn;
2267
2268                                pteval |= DMA_PTE_LARGE_PAGE;
2269                                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2270
2271                                nr_superpages = sg_res / lvl_pages;
2272                                end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2273
2274                                /*
2275                                 * Ensure that old small page tables are
2276                                 * removed to make room for superpage(s).
2277                                 * We're adding new large pages, so make sure
2278                                 * we don't remove their parent tables.
2279                                 */
2280                                dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2281                                                       largepage_lvl + 1);
2282                        } else {
2283                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2284                        }
2285
2286                }
2287                /* We don't need a lock here, nobody else
2288                 * touches the iova range
2289                 */
2290                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2291                if (tmp) {
2292                        static int dumps = 5;
2293                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2294                                iov_pfn, tmp, (unsigned long long)pteval);
2295                        if (dumps) {
2296                                dumps--;
2297                                debug_dma_dump_mappings(NULL);
2298                        }
2299                        WARN_ON(1);
2300                }
2301
2302                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2303
2304                BUG_ON(nr_pages < lvl_pages);
2305                BUG_ON(sg_res < lvl_pages);
2306
2307                nr_pages -= lvl_pages;
2308                iov_pfn += lvl_pages;
2309                phys_pfn += lvl_pages;
2310                pteval += lvl_pages * VTD_PAGE_SIZE;
2311                sg_res -= lvl_pages;
2312
2313                /* If the next PTE would be the first in a new page, then we
2314                   need to flush the cache on the entries we've just written.
2315                   And then we'll need to recalculate 'pte', so clear it and
2316                   let it get set again in the if (!pte) block above.
2317
2318                   If we're done (!nr_pages) we need to flush the cache too.
2319
2320                   Also if we've been setting superpages, we may need to
2321                   recalculate 'pte' and switch back to smaller pages for the
2322                   end of the mapping, if the trailing size is not enough to
2323                   use another superpage (i.e. sg_res < lvl_pages). */
2324                pte++;
2325                if (!nr_pages || first_pte_in_page(pte) ||
2326                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2327                        domain_flush_cache(domain, first_pte,
2328                                           (void *)pte - (void *)first_pte);
2329                        pte = NULL;
2330                }
2331
2332                if (!sg_res && nr_pages)
2333                        sg = sg_next(sg);
2334        }
2335        return 0;
2336}
2337
2338static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2339                          struct scatterlist *sg, unsigned long phys_pfn,
2340                          unsigned long nr_pages, int prot)
2341{
2342        int iommu_id, ret;
2343        struct intel_iommu *iommu;
2344
2345        /* Do the real mapping first */
2346        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2347        if (ret)
2348                return ret;
2349
2350        for_each_domain_iommu(iommu_id, domain) {
2351                iommu = g_iommus[iommu_id];
2352                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2353        }
2354
2355        return 0;
2356}
2357
2358static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2359                                    struct scatterlist *sg, unsigned long nr_pages,
2360                                    int prot)
2361{
2362        return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2363}
2364
2365static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366                                     unsigned long phys_pfn, unsigned long nr_pages,
2367                                     int prot)
2368{
2369        return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2370}
2371
2372static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2373{
2374        unsigned long flags;
2375        struct context_entry *context;
2376        u16 did_old;
2377
2378        if (!iommu)
2379                return;
2380
2381        spin_lock_irqsave(&iommu->lock, flags);
2382        context = iommu_context_addr(iommu, bus, devfn, 0);
2383        if (!context) {
2384                spin_unlock_irqrestore(&iommu->lock, flags);
2385                return;
2386        }
2387        did_old = context_domain_id(context);
2388        context_clear_entry(context);
2389        __iommu_flush_cache(iommu, context, sizeof(*context));
2390        spin_unlock_irqrestore(&iommu->lock, flags);
2391        iommu->flush.flush_context(iommu,
2392                                   did_old,
2393                                   (((u16)bus) << 8) | devfn,
2394                                   DMA_CCMD_MASK_NOBIT,
2395                                   DMA_CCMD_DEVICE_INVL);
2396        iommu->flush.flush_iotlb(iommu,
2397                                 did_old,
2398                                 0,
2399                                 0,
2400                                 DMA_TLB_DSI_FLUSH);
2401}
2402
2403static inline void unlink_domain_info(struct device_domain_info *info)
2404{
2405        assert_spin_locked(&device_domain_lock);
2406        list_del(&info->link);
2407        list_del(&info->global);
2408        if (info->dev)
2409                info->dev->archdata.iommu = NULL;
2410}
2411
2412static void domain_remove_dev_info(struct dmar_domain *domain)
2413{
2414        struct device_domain_info *info, *tmp;
2415        unsigned long flags;
2416
2417        spin_lock_irqsave(&device_domain_lock, flags);
2418        list_for_each_entry_safe(info, tmp, &domain->devices, link)
2419                __dmar_remove_one_dev_info(info);
2420        spin_unlock_irqrestore(&device_domain_lock, flags);
2421}
2422
2423/*
2424 * find_domain
2425 * Note: we use struct device->archdata.iommu to store the info
2426 */
2427static struct dmar_domain *find_domain(struct device *dev)
2428{
2429        struct device_domain_info *info;
2430
2431        if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2432                struct iommu_domain *domain;
2433
2434                dev->archdata.iommu = NULL;
2435                domain = iommu_get_domain_for_dev(dev);
2436                if (domain)
2437                        intel_iommu_attach_device(domain, dev);
2438        }
2439
2440        /* No lock here; we assume no domain exits in the normal case */
2441        info = dev->archdata.iommu;
2442
2443        if (likely(info))
2444                return info->domain;
2445        return NULL;
2446}
2447
2448static inline struct device_domain_info *
2449dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2450{
2451        struct device_domain_info *info;
2452
2453        list_for_each_entry(info, &device_domain_list, global)
2454                if (info->iommu->segment == segment && info->bus == bus &&
2455                    info->devfn == devfn)
2456                        return info;
2457
2458        return NULL;
2459}
2460
2461static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2462                                                    int bus, int devfn,
2463                                                    struct device *dev,
2464                                                    struct dmar_domain *domain)
2465{
2466        struct dmar_domain *found = NULL;
2467        struct device_domain_info *info;
2468        unsigned long flags;
2469        int ret;
2470
2471        info = alloc_devinfo_mem();
2472        if (!info)
2473                return NULL;
2474
2475        info->bus = bus;
2476        info->devfn = devfn;
2477        info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2478        info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2479        info->ats_qdep = 0;
2480        info->dev = dev;
2481        info->domain = domain;
2482        info->iommu = iommu;
2483        info->pasid_table = NULL;
2484        info->auxd_enabled = 0;
2485        INIT_LIST_HEAD(&info->auxiliary_domains);
2486
2487        if (dev && dev_is_pci(dev)) {
2488                struct pci_dev *pdev = to_pci_dev(info->dev);
2489
2490                if (!pdev->untrusted &&
2491                    !pci_ats_disabled() &&
2492                    ecap_dev_iotlb_support(iommu->ecap) &&
2493                    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2494                    dmar_find_matched_atsr_unit(pdev))
2495                        info->ats_supported = 1;
2496
2497                if (sm_supported(iommu)) {
2498                        if (pasid_supported(iommu)) {
2499                                int features = pci_pasid_features(pdev);
2500                                if (features >= 0)
2501                                        info->pasid_supported = features | 1;
2502                        }
2503
2504                        if (info->ats_supported && ecap_prs(iommu->ecap) &&
2505                            pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2506                                info->pri_supported = 1;
2507                }
2508        }
2509
2510        spin_lock_irqsave(&device_domain_lock, flags);
2511        if (dev)
2512                found = find_domain(dev);
2513
2514        if (!found) {
2515                struct device_domain_info *info2;
2516                info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2517                if (info2) {
2518                        found      = info2->domain;
2519                        info2->dev = dev;
2520                }
2521        }
2522
2523        if (found) {
2524                spin_unlock_irqrestore(&device_domain_lock, flags);
2525                free_devinfo_mem(info);
2526                /* Caller must free the original domain */
2527                return found;
2528        }
2529
2530        spin_lock(&iommu->lock);
2531        ret = domain_attach_iommu(domain, iommu);
2532        spin_unlock(&iommu->lock);
2533
2534        if (ret) {
2535                spin_unlock_irqrestore(&device_domain_lock, flags);
2536                free_devinfo_mem(info);
2537                return NULL;
2538        }
2539
2540        list_add(&info->link, &domain->devices);
2541        list_add(&info->global, &device_domain_list);
2542        if (dev)
2543                dev->archdata.iommu = info;
2544        spin_unlock_irqrestore(&device_domain_lock, flags);
2545
2546        /* PASID table is mandatory for a PCI device in scalable mode. */
2547        if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2548                ret = intel_pasid_alloc_table(dev);
2549                if (ret) {
2550                        dev_err(dev, "PASID table allocation failed\n");
2551                        dmar_remove_one_dev_info(dev);
2552                        return NULL;
2553                }
2554
2555                /* Setup the PASID entry for requests without PASID: */
2556                spin_lock(&iommu->lock);
2557                if (hw_pass_through && domain_type_is_si(domain))
2558                        ret = intel_pasid_setup_pass_through(iommu, domain,
2559                                        dev, PASID_RID2PASID);
2560                else
2561                        ret = intel_pasid_setup_second_level(iommu, domain,
2562                                        dev, PASID_RID2PASID);
2563                spin_unlock(&iommu->lock);
2564                if (ret) {
2565                        dev_err(dev, "Setup RID2PASID failed\n");
2566                        dmar_remove_one_dev_info(dev);
2567                        return NULL;
2568                }
2569        }
2570
2571        if (dev && domain_context_mapping(domain, dev)) {
2572                dev_err(dev, "Domain context map failed\n");
2573                dmar_remove_one_dev_info(dev);
2574                return NULL;
2575        }
2576
2577        return domain;
2578}
2579
2580static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2581{
2582        *(u16 *)opaque = alias;
2583        return 0;
2584}
2585
2586static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2587{
2588        struct device_domain_info *info;
2589        struct dmar_domain *domain = NULL;
2590        struct intel_iommu *iommu;
2591        u16 dma_alias;
2592        unsigned long flags;
2593        u8 bus, devfn;
2594
2595        iommu = device_to_iommu(dev, &bus, &devfn);
2596        if (!iommu)
2597                return NULL;
2598
2599        if (dev_is_pci(dev)) {
2600                struct pci_dev *pdev = to_pci_dev(dev);
2601
2602                pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2603
2604                spin_lock_irqsave(&device_domain_lock, flags);
2605                info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2606                                                      PCI_BUS_NUM(dma_alias),
2607                                                      dma_alias & 0xff);
2608                if (info) {
2609                        iommu = info->iommu;
2610                        domain = info->domain;
2611                }
2612                spin_unlock_irqrestore(&device_domain_lock, flags);
2613
2614                /* DMA alias already has a domain, use it */
2615                if (info)
2616                        goto out;
2617        }
2618
2619        /* Allocate and initialize new domain for the device */
2620        domain = alloc_domain(0);
2621        if (!domain)
2622                return NULL;
2623        if (domain_init(domain, iommu, gaw)) {
2624                domain_exit(domain);
2625                return NULL;
2626        }
2627
2628out:
2629        return domain;
2630}
2631
2632static struct dmar_domain *set_domain_for_dev(struct device *dev,
2633                                              struct dmar_domain *domain)
2634{
2635        struct intel_iommu *iommu;
2636        struct dmar_domain *tmp;
2637        u16 req_id, dma_alias;
2638        u8 bus, devfn;
2639
2640        iommu = device_to_iommu(dev, &bus, &devfn);
2641        if (!iommu)
2642                return NULL;
2643
2644        req_id = ((u16)bus << 8) | devfn;
2645
2646        if (dev_is_pci(dev)) {
2647                struct pci_dev *pdev = to_pci_dev(dev);
2648
2649                pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2650
2651                /* register PCI DMA alias device */
2652                if (req_id != dma_alias) {
2653                        tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2654                                        dma_alias & 0xff, NULL, domain);
2655
2656                        if (!tmp || tmp != domain)
2657                                return tmp;
2658                }
2659        }
2660
2661        tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2662        if (!tmp || tmp != domain)
2663                return tmp;
2664
2665        return domain;
2666}
2667
2668static int iommu_domain_identity_map(struct dmar_domain *domain,
2669                                     unsigned long long start,
2670                                     unsigned long long end)
2671{
2672        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2673        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2674
2675        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2676                          dma_to_mm_pfn(last_vpfn))) {
2677                pr_err("Reserving iova failed\n");
2678                return -ENOMEM;
2679        }
2680
2681        pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2682        /*
2683         * RMRR range might have overlap with physical memory range,
2684         * clear it first
2685         */
2686        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2687
2688        return __domain_mapping(domain, first_vpfn, NULL,
2689                                first_vpfn, last_vpfn - first_vpfn + 1,
2690                                DMA_PTE_READ|DMA_PTE_WRITE);
2691}
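
/*
 * Worked example (illustrative): an RMRR covering 0xdf800000 - 0xdf9fffff
 * gives first_vpfn = 0xdf800 and last_vpfn = 0xdf9ff, so 0x200 pages (2MiB)
 * are reserved in the domain's iova allocator and then mapped 1:1
 * (IOVA == physical address) with read and write permission.
 */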
2692
2693static int domain_prepare_identity_map(struct device *dev,
2694                                       struct dmar_domain *domain,
2695                                       unsigned long long start,
2696                                       unsigned long long end)
2697{
2698        /* For _hardware_ passthrough, don't bother. But for software
2699           passthrough, we do it anyway -- it may indicate a memory
2700           range which is reserved in E820 and so didn't get set
2701           up to start with in si_domain */
2702        if (domain == si_domain && hw_pass_through) {
2703                dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2704                         start, end);
2705                return 0;
2706        }
2707
2708        dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2709
2710        if (end < start) {
2711                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2712                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2713                        dmi_get_system_info(DMI_BIOS_VENDOR),
2714                        dmi_get_system_info(DMI_BIOS_VERSION),
2715                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2716                return -EIO;
2717        }
2718
2719        if (end >> agaw_to_width(domain->agaw)) {
2720                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2721                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2722                     agaw_to_width(domain->agaw),
2723                     dmi_get_system_info(DMI_BIOS_VENDOR),
2724                     dmi_get_system_info(DMI_BIOS_VERSION),
2725                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2726                return -EIO;
2727        }
2728
2729        return iommu_domain_identity_map(domain, start, end);
2730}
2731
2732static int md_domain_init(struct dmar_domain *domain, int guest_width);
2733
2734static int __init si_domain_init(int hw)
2735{
2736        struct dmar_rmrr_unit *rmrr;
2737        struct device *dev;
2738        int i, nid, ret;
2739
2740        si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2741        if (!si_domain)
2742                return -EFAULT;
2743
2744        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2745                domain_exit(si_domain);
2746                return -EFAULT;
2747        }
2748
2749        if (hw)
2750                return 0;
2751
2752        for_each_online_node(nid) {
2753                unsigned long start_pfn, end_pfn;
2754                int i;
2755
2756                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2757                        ret = iommu_domain_identity_map(si_domain,
2758                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2759                        if (ret)
2760                                return ret;
2761                }
2762        }
2763
2764        /*
2765         * Normally we use DMA domains for devices which have RMRRs. But we
2766         * relax this requirement for graphics and USB devices. Identity-map
2767         * the RMRRs for graphics and USB devices so that they can use the
2768         * si_domain.
2769         */
2770        for_each_rmrr_units(rmrr) {
2771                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2772                                          i, dev) {
2773                        unsigned long long start = rmrr->base_address;
2774                        unsigned long long end = rmrr->end_address;
2775
2776                        if (device_is_rmrr_locked(dev))
2777                                continue;
2778
2779                        if (WARN_ON(end < start ||
2780                                    end >> agaw_to_width(si_domain->agaw)))
2781                                continue;
2782
2783                        ret = iommu_domain_identity_map(si_domain, start, end);
2784                        if (ret)
2785                                return ret;
2786                }
2787        }
2788
2789        return 0;
2790}
2791
2792static int identity_mapping(struct device *dev)
2793{
2794        struct device_domain_info *info;
2795
2796        info = dev->archdata.iommu;
2797        if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2798                return (info->domain == si_domain);
2799
2800        return 0;
2801}
2802
2803static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2804{
2805        struct dmar_domain *ndomain;
2806        struct intel_iommu *iommu;
2807        u8 bus, devfn;
2808
2809        iommu = device_to_iommu(dev, &bus, &devfn);
2810        if (!iommu)
2811                return -ENODEV;
2812
2813        ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2814        if (ndomain != domain)
2815                return -EBUSY;
2816
2817        return 0;
2818}
2819
2820static bool device_has_rmrr(struct device *dev)
2821{
2822        struct dmar_rmrr_unit *rmrr;
2823        struct device *tmp;
2824        int i;
2825
2826        rcu_read_lock();
2827        for_each_rmrr_units(rmrr) {
2828                /*
2829                 * Return TRUE if this RMRR contains the device that
2830                 * is passed in.
2831                 */
2832                for_each_active_dev_scope(rmrr->devices,
2833                                          rmrr->devices_cnt, i, tmp)
2834                        if (tmp == dev ||
2835                            is_downstream_to_pci_bridge(dev, tmp)) {
2836                                rcu_read_unlock();
2837                                return true;
2838                        }
2839        }
2840        rcu_read_unlock();
2841        return false;
2842}
2843
2844/**
2845 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2846 * is relaxable (ie. is allowed to be not enforced under some conditions)
2847 * @dev: device handle
2848 *
2849 * We assume that PCI USB devices with RMRRs have them largely
2850 * for historical reasons and that the RMRR space is not actively used post
2851 * boot.  This exclusion may change if vendors begin to abuse it.
2852 *
2853 * The same exception is made for graphics devices, with the requirement that
2854 * any use of the RMRR regions will be torn down before assigning the device
2855 * to a guest.
2856 *
2857 * Return: true if the RMRR is relaxable, false otherwise
2858 */
2859static bool device_rmrr_is_relaxable(struct device *dev)
2860{
2861        struct pci_dev *pdev;
2862
2863        if (!dev_is_pci(dev))
2864                return false;
2865
2866        pdev = to_pci_dev(dev);
2867        if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2868                return true;
2869        else
2870                return false;
2871}
2872
2873/*
2874 * There are a couple cases where we need to restrict the functionality of
2875 * devices associated with RMRRs.  The first is when evaluating a device for
2876 * identity mapping because problems exist when devices are moved in and out
2877 * of domains and their respective RMRR information is lost.  This means that
2878 * a device with associated RMRRs will never be in a "passthrough" domain.
2879 * The second is use of the device through the IOMMU API.  This interface
2880 * expects to have full control of the IOVA space for the device.  We cannot
2881 * satisfy both the requirement that RMRR access is maintained and have an
2882 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2883 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2884 * We therefore prevent devices associated with an RMRR from participating in
2885 * the IOMMU API, which eliminates them from device assignment.
2886 *
2887 * In both cases, devices which have relaxable RMRRs are not concerned by this
2888 * restriction. See device_rmrr_is_relaxable comment.
2889 */
2890static bool device_is_rmrr_locked(struct device *dev)
2891{
2892        if (!device_has_rmrr(dev))
2893                return false;
2894
2895        if (device_rmrr_is_relaxable(dev))
2896                return false;
2897
2898        return true;
2899}
2900
2901/*
2902 * Return the required default domain type for a specific device.
2903 *
2904 * @dev: the device in question
2906 *
2907 * Returns:
2908 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2909 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2910 *  - 0: both identity and dynamic domains work for this device
2911 */
2912static int device_def_domain_type(struct device *dev)
2913{
2914        if (dev_is_pci(dev)) {
2915                struct pci_dev *pdev = to_pci_dev(dev);
2916
2917                if (device_is_rmrr_locked(dev))
2918                        return IOMMU_DOMAIN_DMA;
2919
2920                /*
2921                 * Prevent any device marked as untrusted from getting
2922                 * placed into the statically identity mapping domain.
2923                 */
2924                if (pdev->untrusted)
2925                        return IOMMU_DOMAIN_DMA;
2926
2927                if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2928                        return IOMMU_DOMAIN_IDENTITY;
2929
2930                if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2931                        return IOMMU_DOMAIN_IDENTITY;
2932
2933                /*
2934                 * We want to start off with all devices in the 1:1 domain, and
2935                 * take them out later if we find they can't access all of memory.
2936                 *
2937                 * However, we can't do this for PCI devices behind bridges,
2938                 * because all PCI devices behind the same bridge will end up
2939                 * with the same source-id on their transactions.
2940                 *
2941                 * Practically speaking, we can't change things around for these
2942                 * devices at run-time, because we can't be sure there'll be no
2943                 * DMA transactions in flight for any of their siblings.
2944                 *
2945                 * So PCI devices (unless they're on the root bus) as well as
2946                 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2947                 * the 1:1 domain, just in _case_ one of their siblings turns out
2948                 * not to be able to map all of memory.
2949                 */
2950                if (!pci_is_pcie(pdev)) {
2951                        if (!pci_is_root_bus(pdev->bus))
2952                                return IOMMU_DOMAIN_DMA;
2953                        if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2954                                return IOMMU_DOMAIN_DMA;
2955                } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2956                        return IOMMU_DOMAIN_DMA;
2957        } else {
2958                if (device_has_rmrr(dev))
2959                        return IOMMU_DOMAIN_DMA;
2960        }
2961
2962        return (iommu_identity_mapping & IDENTMAP_ALL) ?
2963                        IOMMU_DOMAIN_IDENTITY : 0;
2964}
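
/*
 * Example outcomes of the policy above (illustrative): a device with a
 * non-relaxable RMRR, a device marked untrusted, or a conventional-PCI
 * device behind a PCI-PCI bridge gets IOMMU_DOMAIN_DMA; a graphics or
 * Azalia device gets IOMMU_DOMAIN_IDENTITY when the corresponding
 * IDENTMAP_GFX/IDENTMAP_AZALIA flag is set; everything else returns
 * IOMMU_DOMAIN_IDENTITY with IDENTMAP_ALL, or 0 to let the core choose.
 */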
2965
2966static void intel_iommu_init_qi(struct intel_iommu *iommu)
2967{
2968        /*
2969         * Start from a sane IOMMU hardware state.
2970         * If queued invalidation was already initialized by us
2971         * (for example, while enabling interrupt remapping) then
2972         * things are already rolling from a sane state.
2973         */
2974        if (!iommu->qi) {
2975                /*
2976                 * Clear any previous faults.
2977                 */
2978                dmar_fault(-1, iommu);
2979                /*
2980                 * Disable queued invalidation if supported and already enabled
2981                 * before OS handover.
2982                 */
2983                dmar_disable_qi(iommu);
2984        }
2985
2986        if (dmar_enable_qi(iommu)) {
2987                /*
2988                 * Queued invalidation is not enabled; use register-based invalidation.
2989                 */
2990                iommu->flush.flush_context = __iommu_flush_context;
2991                iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2992                pr_info("%s: Using Register based invalidation\n",
2993                        iommu->name);
2994        } else {
2995                iommu->flush.flush_context = qi_flush_context;
2996                iommu->flush.flush_iotlb = qi_flush_iotlb;
2997                pr_info("%s: Using Queued invalidation\n", iommu->name);
2998        }
2999}
3000
3001static int copy_context_table(struct intel_iommu *iommu,
3002                              struct root_entry *old_re,
3003                              struct context_entry **tbl,
3004                              int bus, bool ext)
3005{
3006        int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3007        struct context_entry *new_ce = NULL, ce;
3008        struct context_entry *old_ce = NULL;
3009        struct root_entry re;
3010        phys_addr_t old_ce_phys;
3011
3012        tbl_idx = ext ? bus * 2 : bus;
3013        memcpy(&re, old_re, sizeof(re));
3014
3015        for (devfn = 0; devfn < 256; devfn++) {
3016                /* First calculate the correct index */
3017                idx = (ext ? devfn * 2 : devfn) % 256;
3018
3019                if (idx == 0) {
3020                        /* First save what we may have and clean up */
3021                        if (new_ce) {
3022                                tbl[tbl_idx] = new_ce;
3023                                __iommu_flush_cache(iommu, new_ce,
3024                                                    VTD_PAGE_SIZE);
3025                                pos = 1;
3026                        }
3027
3028                        if (old_ce)
3029                                memunmap(old_ce);
3030
3031                        ret = 0;
3032                        if (devfn < 0x80)
3033                                old_ce_phys = root_entry_lctp(&re);
3034                        else
3035                                old_ce_phys = root_entry_uctp(&re);
3036
3037                        if (!old_ce_phys) {
3038                                if (ext && devfn == 0) {
3039                                        /* No LCTP, try UCTP */
3040                                        devfn = 0x7f;
3041                                        continue;
3042                                } else {
3043                                        goto out;
3044                                }
3045                        }
3046
3047                        ret = -ENOMEM;
3048                        old_ce = memremap(old_ce_phys, PAGE_SIZE,
3049                                        MEMREMAP_WB);
3050                        if (!old_ce)
3051                                goto out;
3052
3053                        new_ce = alloc_pgtable_page(iommu->node);
3054                        if (!new_ce)
3055                                goto out_unmap;
3056
3057                        ret = 0;
3058                }
3059
3060                /* Now copy the context entry */
3061                memcpy(&ce, old_ce + idx, sizeof(ce));
3062
3063                if (!__context_present(&ce))
3064                        continue;
3065
3066                did = context_domain_id(&ce);
3067                if (did >= 0 && did < cap_ndoms(iommu->cap))
3068                        set_bit(did, iommu->domain_ids);
3069
3070                /*
3071                 * We need a marker for copied context entries. This
3072                 * marker needs to work for the old format as well as
3073                 * for extended context entries.
3074                 *
3075                 * Bit 67 of the context entry is used. In the old
3076                 * format this bit is available to software, in the
3077                 * extended format it is the PGE bit, but PGE is ignored
3078                 * by HW if PASIDs are disabled (and thus still
3079                 * available).
3080                 *
3081                 * So disable PASIDs first and then mark the entry
3082                 * copied. This means that we don't copy PASID
3083                 * translations from the old kernel, but this is fine as
3084                 * faults there are not fatal.
3085                 */
3086                context_clear_pasid_enable(&ce);
3087                context_set_copied(&ce);
3088
3089                new_ce[idx] = ce;
3090        }
3091
3092        tbl[tbl_idx + pos] = new_ce;
3093
3094        __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3095
3096out_unmap:
3097        memunmap(old_ce);
3098
3099out:
3100        return ret;
3101}
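/*
 * Illustrative sketch (not part of this file): the table-index math used by
 * copy_context_table() above.  With extended root entries every bus owns two
 * context tables, one for devfn 0x00-0x7f and one for devfn 0x80-0xff, hence
 * the "* 2" factor and the "% 256" wrap on the per-entry index.
 */
static inline int example_ctx_table_index(int bus, int devfn, bool ext)
{
        if (!ext)
                return bus;                     /* one table per bus */
        return bus * 2 + (devfn >= 0x80);       /* lower or upper half */
}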
3102
3103static int copy_translation_tables(struct intel_iommu *iommu)
3104{
3105        struct context_entry **ctxt_tbls;
3106        struct root_entry *old_rt;
3107        phys_addr_t old_rt_phys;
3108        int ctxt_table_entries;
3109        unsigned long flags;
3110        u64 rtaddr_reg;
3111        int bus, ret;
3112        bool new_ext, ext;
3113
3114        rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3115        ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3116        new_ext    = !!ecap_ecs(iommu->ecap);
3117
3118        /*
3119         * The RTT bit can only be changed when translation is disabled,
3120         * but disabling translation would open a window for data
3121         * corruption. So bail out and don't copy anything if we would
3122         * have to change the bit.
3123         */
3124        if (new_ext != ext)
3125                return -EINVAL;
3126
3127        old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3128        if (!old_rt_phys)
3129                return -EINVAL;
3130
3131        old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3132        if (!old_rt)
3133                return -ENOMEM;
3134
3135        /* This is too big for the stack - allocate it from slab */
3136        ctxt_table_entries = ext ? 512 : 256;
3137        ret = -ENOMEM;
3138        ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3139        if (!ctxt_tbls)
3140                goto out_unmap;
3141
3142        for (bus = 0; bus < 256; bus++) {
3143                ret = copy_context_table(iommu, &old_rt[bus],
3144                                         ctxt_tbls, bus, ext);
3145                if (ret) {
3146                        pr_err("%s: Failed to copy context table for bus %d\n",
3147                                iommu->name, bus);
3148                        continue;
3149                }
3150        }
3151
3152        spin_lock_irqsave(&iommu->lock, flags);
3153
3154        /* Context tables are copied, now write them to the root_entry table */
3155        for (bus = 0; bus < 256; bus++) {
3156                int idx = ext ? bus * 2 : bus;
3157                u64 val;
3158
3159                if (ctxt_tbls[idx]) {
3160                        val = virt_to_phys(ctxt_tbls[idx]) | 1;
3161                        iommu->root_entry[bus].lo = val;
3162                }
3163
3164                if (!ext || !ctxt_tbls[idx + 1])
3165                        continue;
3166
3167                val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3168                iommu->root_entry[bus].hi = val;
3169        }
3170
3171        spin_unlock_irqrestore(&iommu->lock, flags);
3172
3173        kfree(ctxt_tbls);
3174
3175        __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3176
3177        ret = 0;
3178
3179out_unmap:
3180        memunmap(old_rt);
3181
3182        return ret;
3183}
3184
3185static int __init init_dmars(void)
3186{
3187        struct dmar_drhd_unit *drhd;
3188        struct intel_iommu *iommu;
3189        int ret;
3190
3191        /*
3192         * for each drhd
3193         *    allocate root
3194         *    initialize and program root entry to not present
3195         * endfor
3196         */
3197        for_each_drhd_unit(drhd) {
3198                /*
3199                 * No lock needed: this is only incremented in the
3200                 * single-threaded kernel __init code path; all other
3201                 * accesses are read-only.
3202                 */
3203                if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3204                        g_num_of_iommus++;
3205                        continue;
3206                }
3207                pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3208        }
3209
3210        /* Preallocate enough resources for IOMMU hot-addition */
3211        if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3212                g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3213
3214        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3215                        GFP_KERNEL);
3216        if (!g_iommus) {
3217                pr_err("Allocating global iommu array failed\n");
3218                ret = -ENOMEM;
3219                goto error;
3220        }
3221
3222        for_each_iommu(iommu, drhd) {
3223                if (drhd->ignored) {
3224                        iommu_disable_translation(iommu);
3225                        continue;
3226                }
3227
3228                /*
3229                 * Find the maximum PASID table size supported by each
3230                 * IOMMU; the system-wide PASID table must be no bigger
3231                 * than the smallest size supported by any of them.
3232                 */
3233                if (pasid_supported(iommu)) {
3234                        u32 temp = 2 << ecap_pss(iommu->ecap);
3235
3236                        intel_pasid_max_id = min_t(u32, temp,
3237                                                   intel_pasid_max_id);
3238                }
3239
3240                g_iommus[iommu->seq_id] = iommu;
3241
3242                intel_iommu_init_qi(iommu);
3243
3244                ret = iommu_init_domains(iommu);
3245                if (ret)
3246                        goto free_iommu;
3247
3248                init_translation_status(iommu);
3249
3250                if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3251                        iommu_disable_translation(iommu);
3252                        clear_translation_pre_enabled(iommu);
3253                        pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3254                                iommu->name);
3255                }
3256
3257                /*
3258                 * TBD:
3259                 * we could share the same root & context tables
3260                 * among all IOMMUs. This needs to be split out later.
3261                 */
3262                ret = iommu_alloc_root_entry(iommu);
3263                if (ret)
3264                        goto free_iommu;
3265
3266                if (translation_pre_enabled(iommu)) {
3267                        pr_info("Translation already enabled - trying to copy translation structures\n");
3268
3269                        ret = copy_translation_tables(iommu);
3270                        if (ret) {
3271                                /*
3272                                 * We found the IOMMU with translation
3273                                 * enabled - but failed to copy over the
3274                                 * old root-entry table. Try to proceed
3275                                 * by disabling translation now and
3276                                 * allocating a clean root-entry table.
3277                                 * This might cause DMAR faults, but
3278                                 * probably the dump will still succeed.
3279                                 */
3280                                pr_err("Failed to copy translation tables from previous kernel for %s\n",
3281                                       iommu->name);
3282                                iommu_disable_translation(iommu);
3283                                clear_translation_pre_enabled(iommu);
3284                        } else {
3285                                pr_info("Copied translation tables from previous kernel for %s\n",
3286                                        iommu->name);
3287                        }
3288                }
3289
3290                if (!ecap_pass_through(iommu->ecap))
3291                        hw_pass_through = 0;
3292#ifdef CONFIG_INTEL_IOMMU_SVM
3293                if (pasid_supported(iommu))
3294                        intel_svm_init(iommu);
3295#endif
3296        }
3297
3298        /*
3299         * Now that qi is enabled on all iommus, set the root entry and flush
3300         * caches. This is required on some Intel X58 chipsets, otherwise the
3301         * flush_context function will loop forever and the boot hangs.
3302         */
3303        for_each_active_iommu(iommu, drhd) {
3304                iommu_flush_write_buffer(iommu);
3305                iommu_set_root_entry(iommu);
3306                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3307                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3308        }
3309
3310        if (iommu_default_passthrough())
3311                iommu_identity_mapping |= IDENTMAP_ALL;
3312
3313#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3314        dmar_map_gfx = 0;
3315#endif
3316
3317        if (!dmar_map_gfx)
3318                iommu_identity_mapping |= IDENTMAP_GFX;
3319
3320        check_tylersburg_isoch();
3321
3322        ret = si_domain_init(hw_pass_through);
3323        if (ret)
3324                goto free_iommu;
3325
3326        /*
3327         * for each drhd
3328         *   enable fault log
3329         *   global invalidate context cache
3330         *   global invalidate iotlb
3331         *   enable translation
3332         */
3333        for_each_iommu(iommu, drhd) {
3334                if (drhd->ignored) {
3335                        /*
3336                         * we always have to disable PMRs or DMA may fail on
3337                         * this device
3338                         */
3339                        if (force_on)
3340                                iommu_disable_protect_mem_regions(iommu);
3341                        continue;
3342                }
3343
3344                iommu_flush_write_buffer(iommu);
3345
3346#ifdef CONFIG_INTEL_IOMMU_SVM
3347                if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3348                        /*
3349                         * Calling dmar_alloc_hwirq() with dmar_global_lock held
3350                         * could cause a lock ordering problem, so drop the lock.
3351                         */
3352                        up_write(&dmar_global_lock);
3353                        ret = intel_svm_enable_prq(iommu);
3354                        down_write(&dmar_global_lock);
3355                        if (ret)
3356                                goto free_iommu;
3357                }
3358#endif
3359                ret = dmar_set_interrupt(iommu);
3360                if (ret)
3361                        goto free_iommu;
3362        }
3363
3364        return 0;
3365
3366free_iommu:
3367        for_each_active_iommu(iommu, drhd) {
3368                disable_dmar_iommu(iommu);
3369                free_dmar_iommu(iommu);
3370        }
3371
3372        kfree(g_iommus);
3373
3374error:
3375        return ret;
3376}
3377
3378/* This takes a number of _MM_ pages, not VTD pages */
3379static unsigned long intel_alloc_iova(struct device *dev,
3380                                     struct dmar_domain *domain,
3381                                     unsigned long nrpages, uint64_t dma_mask)
3382{
3383        unsigned long iova_pfn;
3384
3385        /* Restrict dma_mask to the width that the iommu can handle */
3386        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3387        /* Ensure we reserve the whole size-aligned region */
3388        nrpages = __roundup_pow_of_two(nrpages);
3389
3390        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3391                /*
3392                 * First try to allocate an I/O virtual address below
3393                 * DMA_BIT_MASK(32); if that fails, try allocating from
3394                 * the higher range.
3395                 */
3396                iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3397                                           IOVA_PFN(DMA_BIT_MASK(32)), false);
3398                if (iova_pfn)
3399                        return iova_pfn;
3400        }
3401        iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3402                                   IOVA_PFN(dma_mask), true);
3403        if (unlikely(!iova_pfn)) {
3404                dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3405                return 0;
3406        }
3407
3408        return iova_pfn;
3409}
3410
3411static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3412{
3413        struct dmar_domain *domain, *tmp;
3414        struct dmar_rmrr_unit *rmrr;
3415        struct device *i_dev;
3416        int i, ret;
3417
3418        /* The device shouldn't be attached to any domain yet. */
3419        domain = find_domain(dev);
3420        if (domain)
3421                return NULL;
3422
3423        domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3424        if (!domain)
3425                goto out;
3426
3427        /* We have a new domain - setup possible RMRRs for the device */
3428        rcu_read_lock();
3429        for_each_rmrr_units(rmrr) {
3430                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3431                                          i, i_dev) {
3432                        if (i_dev != dev)
3433                                continue;
3434
3435                        ret = domain_prepare_identity_map(dev, domain,
3436                                                          rmrr->base_address,
3437                                                          rmrr->end_address);
3438                        if (ret)
3439                                dev_err(dev, "Mapping reserved region failed\n");
3440                }
3441        }
3442        rcu_read_unlock();
3443
3444        tmp = set_domain_for_dev(dev, domain);
3445        if (!tmp || domain != tmp) {
3446                domain_exit(domain);
3447                domain = tmp;
3448        }
3449
3450out:
3451        if (!domain)
3452                dev_err(dev, "Allocating domain failed\n");
3453        else
3454                domain->domain.type = IOMMU_DOMAIN_DMA;
3455
3456        return domain;
3457}
3458
3459/* Check if the device needs to go through the non-identity map/unmap path. */
3460static bool iommu_need_mapping(struct device *dev)
3461{
3462        int ret;
3463
3464        if (iommu_dummy(dev))
3465                return false;
3466
3467        ret = identity_mapping(dev);
3468        if (ret) {
3469                u64 dma_mask = *dev->dma_mask;
3470
3471                if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3472                        dma_mask = dev->coherent_dma_mask;
3473
3474                if (dma_mask >= dma_direct_get_required_mask(dev))
3475                        return false;
3476
3477                /*
3478                 * 32-bit DMA devices are removed from si_domain and fall
3479                 * back to non-identity mapping.
3480                 */
3481                dmar_remove_one_dev_info(dev);
3482                ret = iommu_request_dma_domain_for_dev(dev);
3483                if (ret) {
3484                        struct iommu_domain *domain;
3485                        struct dmar_domain *dmar_domain;
3486
3487                        domain = iommu_get_domain_for_dev(dev);
3488                        if (domain) {
3489                                dmar_domain = to_dmar_domain(domain);
3490                                dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3491                        }
3492                        dmar_remove_one_dev_info(dev);
3493                        get_private_domain_for_dev(dev);
3494                }
3495
3496                dev_info(dev, "32bit DMA uses non-identity mapping\n");
3497        }
3498
3499        return true;
3500}
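/*
 * Illustrative sketch (not part of this file): the mask check applied above.
 * An identity-mapped device may stay in the passthrough domain only if its
 * effective DMA mask covers every physical address the kernel could hand it
 * (dma_direct_get_required_mask()); otherwise it is moved to a DMA domain.
 */
static inline bool example_fits_identity_map(struct device *dev)
{
        u64 dma_mask = *dev->dma_mask;

        if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
                dma_mask = dev->coherent_dma_mask;

        return dma_mask >= dma_direct_get_required_mask(dev);
}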
3501
3502static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3503                                     size_t size, int dir, u64 dma_mask)
3504{
3505        struct dmar_domain *domain;
3506        phys_addr_t start_paddr;
3507        unsigned long iova_pfn;
3508        int prot = 0;
3509        int ret;
3510        struct intel_iommu *iommu;
3511        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3512
3513        BUG_ON(dir == DMA_NONE);
3514
3515        domain = find_domain(dev);
3516        if (!domain)
3517                return DMA_MAPPING_ERROR;
3518
3519        iommu = domain_get_iommu(domain);
3520        size = aligned_nrpages(paddr, size);
3521
3522        iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3523        if (!iova_pfn)
3524                goto error;
3525
3526        /*
3527         * Check if the DMAR supports zero-length reads on write-only
3528         * mappings.
3529         */
3530        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3531                        !cap_zlr(iommu->cap))
3532                prot |= DMA_PTE_READ;
3533        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3534                prot |= DMA_PTE_WRITE;
3535        /*
3536         * paddr .. (paddr + size) might span partial pages; map whole
3537         * pages.  Note: if two parts of one page are mapped separately,
3538         * we might end up with two IOVAs mapping to the same host paddr,
3539         * but this is not a big problem.
3540         */
3541        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3542                                 mm_to_dma_pfn(paddr_pfn), size, prot);
3543        if (ret)
3544                goto error;
3545
3546        start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3547        start_paddr += paddr & ~PAGE_MASK;
3548
3549        trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3550
3551        return start_paddr;
3552
3553error:
3554        if (iova_pfn)
3555                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3556        dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3557                size, (unsigned long long)paddr, dir);
3558        return DMA_MAPPING_ERROR;
3559}
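/*
 * Illustrative sketch (not part of this file): the direction-to-protection
 * translation repeated in __intel_map_single(), intel_map_sg() and
 * bounce_map_single().  DMA_PTE_READ is also forced for device-to-memory
 * transfers when the IOMMU cannot do zero-length reads on write-only
 * mappings (no cap_zlr).
 */
static inline int example_dir_to_prot(struct intel_iommu *iommu,
                                      enum dma_data_direction dir)
{
        int prot = 0;

        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
            !cap_zlr(iommu->cap))
                prot |= DMA_PTE_READ;
        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
                prot |= DMA_PTE_WRITE;

        return prot;
}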
3560
3561static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3562                                 unsigned long offset, size_t size,
3563                                 enum dma_data_direction dir,
3564                                 unsigned long attrs)
3565{
3566        if (iommu_need_mapping(dev))
3567                return __intel_map_single(dev, page_to_phys(page) + offset,
3568                                size, dir, *dev->dma_mask);
3569        return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3570}
3571
3572static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3573                                     size_t size, enum dma_data_direction dir,
3574                                     unsigned long attrs)
3575{
3576        if (iommu_need_mapping(dev))
3577                return __intel_map_single(dev, phys_addr, size, dir,
3578                                *dev->dma_mask);
3579        return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3580}
3581
3582static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3583{
3584        struct dmar_domain *domain;
3585        unsigned long start_pfn, last_pfn;
3586        unsigned long nrpages;
3587        unsigned long iova_pfn;
3588        struct intel_iommu *iommu;
3589        struct page *freelist;
3590        struct pci_dev *pdev = NULL;
3591
3592        domain = find_domain(dev);
3593        BUG_ON(!domain);
3594
3595        iommu = domain_get_iommu(domain);
3596
3597        iova_pfn = IOVA_PFN(dev_addr);
3598
3599        nrpages = aligned_nrpages(dev_addr, size);
3600        start_pfn = mm_to_dma_pfn(iova_pfn);
3601        last_pfn = start_pfn + nrpages - 1;
3602
3603        if (dev_is_pci(dev))
3604                pdev = to_pci_dev(dev);
3605
3606        freelist = domain_unmap(domain, start_pfn, last_pfn);
3607        if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3608                        !has_iova_flush_queue(&domain->iovad)) {
3609                iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3610                                      nrpages, !freelist, 0);
3611                /* free iova */
3612                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3613                dma_free_pagelist(freelist);
3614        } else {
3615                queue_iova(&domain->iovad, iova_pfn, nrpages,
3616                           (unsigned long)freelist);
3617                /*
3618                 * Queue up the release of the unmap to save roughly 1/6th
3619                 * of the CPU time otherwise spent in the IOTLB flush.
3620                 */
3621        }
3622
3623        trace_unmap_single(dev, dev_addr, size);
3624}
3625
3626static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3627                             size_t size, enum dma_data_direction dir,
3628                             unsigned long attrs)
3629{
3630        if (iommu_need_mapping(dev))
3631                intel_unmap(dev, dev_addr, size);
3632        else
3633                dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3634}
3635
3636static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3637                size_t size, enum dma_data_direction dir, unsigned long attrs)
3638{
3639        if (iommu_need_mapping(dev))
3640                intel_unmap(dev, dev_addr, size);
3641}
3642
3643static void *intel_alloc_coherent(struct device *dev, size_t size,
3644                                  dma_addr_t *dma_handle, gfp_t flags,
3645                                  unsigned long attrs)
3646{
3647        struct page *page = NULL;
3648        int order;
3649
3650        if (!iommu_need_mapping(dev))
3651                return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3652
3653        size = PAGE_ALIGN(size);
3654        order = get_order(size);
3655
3656        if (gfpflags_allow_blocking(flags)) {
3657                unsigned int count = size >> PAGE_SHIFT;
3658
3659                page = dma_alloc_from_contiguous(dev, count, order,
3660                                                 flags & __GFP_NOWARN);
3661        }
3662
3663        if (!page)
3664                page = alloc_pages(flags, order);
3665        if (!page)
3666                return NULL;
3667        memset(page_address(page), 0, size);
3668
3669        *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3670                                         DMA_BIDIRECTIONAL,
3671                                         dev->coherent_dma_mask);
3672        if (*dma_handle != DMA_MAPPING_ERROR)
3673                return page_address(page);
3674        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3675                __free_pages(page, order);
3676
3677        return NULL;
3678}
3679
3680static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3681                                dma_addr_t dma_handle, unsigned long attrs)
3682{
3683        int order;
3684        struct page *page = virt_to_page(vaddr);
3685
3686        if (!iommu_need_mapping(dev))
3687                return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3688
3689        size = PAGE_ALIGN(size);
3690        order = get_order(size);
3691
3692        intel_unmap(dev, dma_handle, size);
3693        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3694                __free_pages(page, order);
3695}
3696
3697static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3698                           int nelems, enum dma_data_direction dir,
3699                           unsigned long attrs)
3700{
3701        dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3702        unsigned long nrpages = 0;
3703        struct scatterlist *sg;
3704        int i;
3705
3706        if (!iommu_need_mapping(dev))
3707                return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3708
3709        for_each_sg(sglist, sg, nelems, i) {
3710                nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3711        }
3712
3713        intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3714
3715        trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3716}
3717
3718static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3719                        enum dma_data_direction dir, unsigned long attrs)
3720{
3721        int i;
3722        struct dmar_domain *domain;
3723        size_t size = 0;
3724        int prot = 0;
3725        unsigned long iova_pfn;
3726        int ret;
3727        struct scatterlist *sg;
3728        unsigned long start_vpfn;
3729        struct intel_iommu *iommu;
3730
3731        BUG_ON(dir == DMA_NONE);
3732        if (!iommu_need_mapping(dev))
3733                return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3734
3735        domain = find_domain(dev);
3736        if (!domain)
3737                return 0;
3738
3739        iommu = domain_get_iommu(domain);
3740
3741        for_each_sg(sglist, sg, nelems, i)
3742                size += aligned_nrpages(sg->offset, sg->length);
3743
3744        iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3745                                *dev->dma_mask);
3746        if (!iova_pfn) {
3747                sglist->dma_length = 0;
3748                return 0;
3749        }
3750
3751        /*
3752         * Check if the DMAR supports zero-length reads on write-only
3753         * mappings.
3754         */
3755        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3756                        !cap_zlr(iommu->cap))
3757                prot |= DMA_PTE_READ;
3758        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3759                prot |= DMA_PTE_WRITE;
3760
3761        start_vpfn = mm_to_dma_pfn(iova_pfn);
3762
3763        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3764        if (unlikely(ret)) {
3765                dma_pte_free_pagetable(domain, start_vpfn,
3766                                       start_vpfn + size - 1,
3767                                       agaw_to_level(domain->agaw) + 1);
3768                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3769                return 0;
3770        }
3771
3772        trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3773                     sg_phys(sglist), size << VTD_PAGE_SHIFT);
3774
3775        return nelems;
3776}
3777
3778static u64 intel_get_required_mask(struct device *dev)
3779{
3780        if (!iommu_need_mapping(dev))
3781                return dma_direct_get_required_mask(dev);
3782        return DMA_BIT_MASK(32);
3783}
3784
3785static const struct dma_map_ops intel_dma_ops = {
3786        .alloc = intel_alloc_coherent,
3787        .free = intel_free_coherent,
3788        .map_sg = intel_map_sg,
3789        .unmap_sg = intel_unmap_sg,
3790        .map_page = intel_map_page,
3791        .unmap_page = intel_unmap_page,
3792        .map_resource = intel_map_resource,
3793        .unmap_resource = intel_unmap_resource,
3794        .dma_supported = dma_direct_supported,
3795        .mmap = dma_common_mmap,
3796        .get_sgtable = dma_common_get_sgtable,
3797        .get_required_mask = intel_get_required_mask,
3798};
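/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * drivers never call the intel_* helpers directly.  They use the generic DMA
 * API, which dispatches through the dma_map_ops installed for the device -
 * intel_dma_ops here, or the bounce_dma_ops defined below.
 */
static int example_driver_dma(struct device *dev, struct page *page)
{
        dma_addr_t handle;

        handle = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_TO_DEVICE);
        if (dma_mapping_error(dev, handle))
                return -ENOMEM;

        /* ... hand 'handle' to the device and wait for the transfer ... */

        dma_unmap_page(dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
        return 0;
}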
3799
3800static void
3801bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3802                   enum dma_data_direction dir, enum dma_sync_target target)
3803{
3804        struct dmar_domain *domain;
3805        phys_addr_t tlb_addr;
3806
3807        domain = find_domain(dev);
3808        if (WARN_ON(!domain))
3809                return;
3810
3811        tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3812        if (is_swiotlb_buffer(tlb_addr))
3813                swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3814}
3815
3816static dma_addr_t
3817bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3818                  enum dma_data_direction dir, unsigned long attrs,
3819                  u64 dma_mask)
3820{
3821        size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3822        struct dmar_domain *domain;
3823        struct intel_iommu *iommu;
3824        unsigned long iova_pfn;
3825        unsigned long nrpages;
3826        phys_addr_t tlb_addr;
3827        int prot = 0;
3828        int ret;
3829
3830        domain = find_domain(dev);
3831        if (WARN_ON(dir == DMA_NONE || !domain))
3832                return DMA_MAPPING_ERROR;
3833
3834        iommu = domain_get_iommu(domain);
3835        if (WARN_ON(!iommu))
3836                return DMA_MAPPING_ERROR;
3837
3838        nrpages = aligned_nrpages(0, size);
3839        iova_pfn = intel_alloc_iova(dev, domain,
3840                                    dma_to_mm_pfn(nrpages), dma_mask);
3841        if (!iova_pfn)
3842                return DMA_MAPPING_ERROR;
3843
3844        /*
3845         * Check if the DMAR supports zero-length reads on write-only
3846         * mappings.
3847         */
3848        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3849                        !cap_zlr(iommu->cap))
3850                prot |= DMA_PTE_READ;
3851        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3852                prot |= DMA_PTE_WRITE;
3853
3854        /*
3855         * If both the physical buffer start address and size are
3856         * page aligned, we don't need to use a bounce page.
3857         */
3858        if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3859                tlb_addr = swiotlb_tbl_map_single(dev,
3860                                __phys_to_dma(dev, io_tlb_start),
3861                                paddr, size, aligned_size, dir, attrs);
3862                if (tlb_addr == DMA_MAPPING_ERROR) {
3863                        goto swiotlb_error;
3864                } else {
3865                        /* Cleanup the padding area. */
3866                        void *padding_start = phys_to_virt(tlb_addr);
3867                        size_t padding_size = aligned_size;
3868
3869                        if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3870                            (dir == DMA_TO_DEVICE ||
3871                             dir == DMA_BIDIRECTIONAL)) {
3872                                padding_start += size;
3873                                padding_size -= size;
3874                        }
3875
3876                        memset(padding_start, 0, padding_size);
3877                }
3878        } else {
3879                tlb_addr = paddr;
3880        }
3881
3882        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3883                                 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3884        if (ret)
3885                goto mapping_error;
3886
3887        trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3888
3889        return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3890
3891mapping_error:
3892        if (is_swiotlb_buffer(tlb_addr))
3893                swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3894                                         aligned_size, dir, attrs);
3895swiotlb_error:
3896        free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3897        dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3898                size, (unsigned long long)paddr, dir);
3899
3900        return DMA_MAPPING_ERROR;
3901}
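/*
 * Illustrative sketch (not part of this file): the predicate used by
 * bounce_map_single() above.  Only buffers that share a VT-d page with other
 * data - i.e. whose start or length is not VTD_PAGE_SIZE aligned - go through
 * the swiotlb bounce buffer; fully aligned buffers are mapped in place.
 */
static inline bool example_needs_bounce_page(phys_addr_t paddr, size_t size)
{
        return !IS_ALIGNED(paddr | size, VTD_PAGE_SIZE);
}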
3902
3903static void
3904bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3905                    enum dma_data_direction dir, unsigned long attrs)
3906{
3907        size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3908        struct dmar_domain *domain;
3909        phys_addr_t tlb_addr;
3910
3911        domain = find_domain(dev);
3912        if (WARN_ON(!domain))
3913                return;
3914
3915        tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3916        if (WARN_ON(!tlb_addr))
3917                return;
3918
3919        intel_unmap(dev, dev_addr, size);
3920        if (is_swiotlb_buffer(tlb_addr))
3921                swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3922                                         aligned_size, dir, attrs);
3923
3924        trace_bounce_unmap_single(dev, dev_addr, size);
3925}
3926
3927static dma_addr_t
3928bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3929                size_t size, enum dma_data_direction dir, unsigned long attrs)
3930{
3931        return bounce_map_single(dev, page_to_phys(page) + offset,
3932                                 size, dir, attrs, *dev->dma_mask);
3933}
3934
3935static dma_addr_t
3936bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3937                    enum dma_data_direction dir, unsigned long attrs)
3938{
3939        return bounce_map_single(dev, phys_addr, size,
3940                                 dir, attrs, *dev->dma_mask);
3941}
3942
3943static void
3944bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3945                  enum dma_data_direction dir, unsigned long attrs)
3946{
3947        bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3948}
3949
3950static void
3951bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3952                      enum dma_data_direction dir, unsigned long attrs)
3953{
3954        bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3955}
3956
3957static void
3958bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3959                enum dma_data_direction dir, unsigned long attrs)
3960{
3961        struct scatterlist *sg;
3962        int i;
3963
3964        for_each_sg(sglist, sg, nelems, i)
3965                bounce_unmap_page(dev, sg->dma_address,
3966                                  sg_dma_len(sg), dir, attrs);
3967}
3968
3969static int
3970bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3971              enum dma_data_direction dir, unsigned long attrs)
3972{
3973        int i;
3974        struct scatterlist *sg;
3975
3976        for_each_sg(sglist, sg, nelems, i) {
3977                sg->dma_address = bounce_map_page(dev, sg_page(sg),
3978                                                  sg->offset, sg->length,
3979                                                  dir, attrs);
3980                if (sg->dma_address == DMA_MAPPING_ERROR)
3981                        goto out_unmap;
3982                sg_dma_len(sg) = sg->length;
3983        }
3984
3985        return nelems;
3986
3987out_unmap:
3988        bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3989        return 0;
3990}
3991
3992static void
3993bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3994                           size_t size, enum dma_data_direction dir)
3995{
3996        bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3997}
3998
3999static void
4000bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4001                              size_t size, enum dma_data_direction dir)
4002{
4003        bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4004}
4005
4006static void
4007bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4008                       int nelems, enum dma_data_direction dir)
4009{
4010        struct scatterlist *sg;
4011        int i;
4012
4013        for_each_sg(sglist, sg, nelems, i)
4014                bounce_sync_single(dev, sg_dma_address(sg),
4015                                   sg_dma_len(sg), dir, SYNC_FOR_CPU);
4016}
4017
4018static void
4019bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4020                          int nelems, enum dma_data_direction dir)
4021{
4022        struct scatterlist *sg;
4023        int i;
4024
4025        for_each_sg(sglist, sg, nelems, i)
4026                bounce_sync_single(dev, sg_dma_address(sg),
4027                                   sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4028}
4029
4030static const struct dma_map_ops bounce_dma_ops = {
4031        .alloc                  = intel_alloc_coherent,
4032        .free                   = intel_free_coherent,
4033        .map_sg                 = bounce_map_sg,
4034        .unmap_sg               = bounce_unmap_sg,
4035        .map_page               = bounce_map_page,
4036        .unmap_page             = bounce_unmap_page,
4037        .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4038        .sync_single_for_device = bounce_sync_single_for_device,
4039        .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4040        .sync_sg_for_device     = bounce_sync_sg_for_device,
4041        .map_resource           = bounce_map_resource,
4042        .unmap_resource         = bounce_unmap_resource,
4043        .dma_supported          = dma_direct_supported,
4044};
4045
4046static inline int iommu_domain_cache_init(void)
4047{
4048        int ret = 0;
4049
4050        iommu_domain_cache = kmem_cache_create("iommu_domain",
4051                                         sizeof(struct dmar_domain),
4052                                         0,
4053                                         SLAB_HWCACHE_ALIGN,
4055                                         NULL);
4056        if (!iommu_domain_cache) {
4057                pr_err("Couldn't create iommu_domain cache\n");
4058                ret = -ENOMEM;
4059        }
4060
4061        return ret;
4062}
4063
4064static inline int iommu_devinfo_cache_init(void)
4065{
4066        int ret = 0;
4067
4068        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4069                                         sizeof(struct device_domain_info),
4070                                         0,
4071                                         SLAB_HWCACHE_ALIGN,
4072                                         NULL);
4073        if (!iommu_devinfo_cache) {
4074                pr_err("Couldn't create devinfo cache\n");
4075                ret = -ENOMEM;
4076        }
4077
4078        return ret;
4079}
4080
4081static int __init iommu_init_mempool(void)
4082{
4083        int ret;
4084        ret = iova_cache_get();
4085        if (ret)
4086                return ret;
4087
4088        ret = iommu_domain_cache_init();
4089        if (ret)
4090                goto domain_error;
4091
4092        ret = iommu_devinfo_cache_init();
4093        if (!ret)
4094                return ret;
4095
4096        kmem_cache_destroy(iommu_domain_cache);
4097domain_error:
4098        iova_cache_put();
4099
4100        return -ENOMEM;
4101}
4102
4103static void __init iommu_exit_mempool(void)
4104{
4105        kmem_cache_destroy(iommu_devinfo_cache);
4106        kmem_cache_destroy(iommu_domain_cache);
4107        iova_cache_put();
4108}
4109
4110static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4111{
4112        struct dmar_drhd_unit *drhd;
4113        u32 vtbar;
4114        int rc;
4115
4116        /* We know that this device on this chipset has its own IOMMU.
4117         * If we find it under a different IOMMU, then the BIOS is lying
4118         * to us. Hope that the IOMMU for this device is actually
4119         * disabled, and it needs no translation...
4120         */
4121        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4122        if (rc) {
4123                /* "can't" happen */
4124                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4125                return;
4126        }
4127        vtbar &= 0xffff0000;
4128
4129        /* we know that this IOMMU should be at offset 0xa000 from vtbar */
4130        drhd = dmar_find_matched_drhd_unit(pdev);
4131        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4132                            TAINT_FIRMWARE_WORKAROUND,
4133                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4134                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4135}
4136DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4137
4138static void __init init_no_remapping_devices(void)
4139{
4140        struct dmar_drhd_unit *drhd;
4141        struct device *dev;
4142        int i;
4143
4144        for_each_drhd_unit(drhd) {
4145                if (!drhd->include_all) {
4146                        for_each_active_dev_scope(drhd->devices,
4147                                                  drhd->devices_cnt, i, dev)
4148                                break;
4149                        /* ignore DMAR unit if no devices exist */
4150                        if (i == drhd->devices_cnt)
4151                                drhd->ignored = 1;
4152                }
4153        }
4154
4155        for_each_active_drhd_unit(drhd) {
4156                if (drhd->include_all)
4157                        continue;
4158
4159                for_each_active_dev_scope(drhd->devices,
4160                                          drhd->devices_cnt, i, dev)
4161                        if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4162                                break;
4163                if (i < drhd->devices_cnt)
4164                        continue;
4165
4166                /* This IOMMU has *only* gfx devices. Either bypass it or
4167                   set the gfx_mapped flag, as appropriate */
4168                if (!dmar_map_gfx) {
4169                        drhd->ignored = 1;
4170                        for_each_active_dev_scope(drhd->devices,
4171                                                  drhd->devices_cnt, i, dev)
4172                                dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4173                }
4174        }
4175}
4176
4177#ifdef CONFIG_SUSPEND
4178static int init_iommu_hw(void)
4179{
4180        struct dmar_drhd_unit *drhd;
4181        struct intel_iommu *iommu = NULL;
4182
4183        for_each_active_iommu(iommu, drhd)
4184                if (iommu->qi)
4185                        dmar_reenable_qi(iommu);
4186
4187        for_each_iommu(iommu, drhd) {
4188                if (drhd->ignored) {
4189                        /*
4190                         * we always have to disable PMRs or DMA may fail on
4191                         * this device
4192                         */
4193                        if (force_on)
4194                                iommu_disable_protect_mem_regions(iommu);
4195                        continue;
4196                }
4197
4198                iommu_flush_write_buffer(iommu);
4199
4200                iommu_set_root_entry(iommu);
4201
4202                iommu->flush.flush_context(iommu, 0, 0, 0,
4203                                           DMA_CCMD_GLOBAL_INVL);
4204                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4205                iommu_enable_translation(iommu);
4206                iommu_disable_protect_mem_regions(iommu);
4207        }
4208
4209        return 0;
4210}
4211
4212static void iommu_flush_all(void)
4213{
4214        struct dmar_drhd_unit *drhd;
4215        struct intel_iommu *iommu;
4216
4217        for_each_active_iommu(iommu, drhd) {
4218                iommu->flush.flush_context(iommu, 0, 0, 0,
4219                                           DMA_CCMD_GLOBAL_INVL);
4220                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4221                                         DMA_TLB_GLOBAL_FLUSH);
4222        }
4223}
4224
4225static int iommu_suspend(void)
4226{
4227        struct dmar_drhd_unit *drhd;
4228        struct intel_iommu *iommu = NULL;
4229        unsigned long flag;
4230
4231        for_each_active_iommu(iommu, drhd) {
4232                iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4233                                                 GFP_ATOMIC);
4234                if (!iommu->iommu_state)
4235                        goto nomem;
4236        }
4237
4238        iommu_flush_all();
4239
4240        for_each_active_iommu(iommu, drhd) {
4241                iommu_disable_translation(iommu);
4242
4243                raw_spin_lock_irqsave(&iommu->register_lock, flag);
4244
4245                iommu->iommu_state[SR_DMAR_FECTL_REG] =
4246                        readl(iommu->reg + DMAR_FECTL_REG);
4247                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4248                        readl(iommu->reg + DMAR_FEDATA_REG);
4249                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4250                        readl(iommu->reg + DMAR_FEADDR_REG);
4251                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4252                        readl(iommu->reg + DMAR_FEUADDR_REG);
4253
4254                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4255        }
4256        return 0;
4257
4258nomem:
4259        for_each_active_iommu(iommu, drhd)
4260                kfree(iommu->iommu_state);
4261
4262        return -ENOMEM;
4263}
4264
4265static void iommu_resume(void)
4266{
4267        struct dmar_drhd_unit *drhd;
4268        struct intel_iommu *iommu = NULL;
4269        unsigned long flag;
4270
4271        if (init_iommu_hw()) {
4272                if (force_on)
4273                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4274                else
4275                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4276                return;
4277        }
4278
4279        for_each_active_iommu(iommu, drhd) {
4280
4281                raw_spin_lock_irqsave(&iommu->register_lock, flag);
4282
4283                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4284                        iommu->reg + DMAR_FECTL_REG);
4285                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4286                        iommu->reg + DMAR_FEDATA_REG);
4287                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4288                        iommu->reg + DMAR_FEADDR_REG);
4289                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4290                        iommu->reg + DMAR_FEUADDR_REG);
4291
4292                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4293        }
4294
4295        for_each_active_iommu(iommu, drhd)
4296                kfree(iommu->iommu_state);
4297}
4298
4299static struct syscore_ops iommu_syscore_ops = {
4300        .resume         = iommu_resume,
4301        .suspend        = iommu_suspend,
4302};
4303
4304static void __init init_iommu_pm_ops(void)
4305{
4306        register_syscore_ops(&iommu_syscore_ops);
4307}
4308
4309#else
4310static inline void init_iommu_pm_ops(void) {}
4311#endif  /* CONFIG_SUSPEND */
4312
4313int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4314{
4315        struct acpi_dmar_reserved_memory *rmrr;
4316        struct dmar_rmrr_unit *rmrru;
4317
4318        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4319        if (!rmrru)
4320                goto out;
4321
4322        rmrru->hdr = header;
4323        rmrr = (struct acpi_dmar_reserved_memory *)header;
4324        rmrru->base_address = rmrr->base_address;
4325        rmrru->end_address = rmrr->end_address;
4326
4327        rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4328                                ((void *)rmrr) + rmrr->header.length,
4329                                &rmrru->devices_cnt);
4330        if (rmrru->devices_cnt && rmrru->devices == NULL)
4331                goto free_rmrru;
4332
4333        list_add(&rmrru->list, &dmar_rmrr_units);
4334
4335        return 0;
4336free_rmrru:
4337        kfree(rmrru);
4338out:
4339        return -ENOMEM;
4340}
4341
4342static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4343{
4344        struct dmar_atsr_unit *atsru;
4345        struct acpi_dmar_atsr *tmp;
4346
4347        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4348                tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4349                if (atsr->segment != tmp->segment)
4350                        continue;
4351                if (atsr->header.length != tmp->header.length)
4352                        continue;
4353                if (memcmp(atsr, tmp, atsr->header.length) == 0)
4354                        return atsru;
4355        }
4356
4357        return NULL;
4358}
4359
4360int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4361{
4362        struct acpi_dmar_atsr *atsr;
4363        struct dmar_atsr_unit *atsru;
4364
4365        if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4366                return 0;
4367
4368        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4369        atsru = dmar_find_atsr(atsr);
4370        if (atsru)
4371                return 0;
4372
4373        atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4374        if (!atsru)
4375                return -ENOMEM;
4376
4377        /*
4378         * If memory is allocated from slab by ACPI _DSM method, we need to
4379         * copy the memory content because the memory buffer will be freed
4380         * on return.
4381         */
4382        atsru->hdr = (void *)(atsru + 1);
4383        memcpy(atsru->hdr, hdr, hdr->length);
4384        atsru->include_all = atsr->flags & 0x1;
4385        if (!atsru->include_all) {
4386                atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4387                                (void *)atsr + atsr->header.length,
4388                                &atsru->devices_cnt);
4389                if (atsru->devices_cnt && atsru->devices == NULL) {
4390                        kfree(atsru);
4391                        return -ENOMEM;
4392                }
4393        }
4394
4395        list_add_rcu(&atsru->list, &dmar_atsr_units);
4396
4397        return 0;
4398}
4399
4400static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4401{
4402        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4403        kfree(atsru);
4404}
4405
4406int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4407{
4408        struct acpi_dmar_atsr *atsr;
4409        struct dmar_atsr_unit *atsru;
4410
4411        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4412        atsru = dmar_find_atsr(atsr);
4413        if (atsru) {
4414                list_del_rcu(&atsru->list);
4415                synchronize_rcu();
4416                intel_iommu_free_atsr(atsru);
4417        }
4418
4419        return 0;
4420}
4421
4422int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4423{
4424        int i;
4425        struct device *dev;
4426        struct acpi_dmar_atsr *atsr;
4427        struct dmar_atsr_unit *atsru;
4428
4429        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4430        atsru = dmar_find_atsr(atsr);
4431        if (!atsru)
4432                return 0;
4433
4434        if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4435                for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4436                                          i, dev)
4437                        return -EBUSY;
4438        }
4439
4440        return 0;
4441}
4442
4443static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4444{
4445        int sp, ret;
4446        struct intel_iommu *iommu = dmaru->iommu;
4447
4448        if (g_iommus[iommu->seq_id])
4449                return 0;
4450
4451        if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4452                pr_warn("%s: Doesn't support hardware pass through.\n",
4453                        iommu->name);
4454                return -ENXIO;
4455        }
4456        if (!ecap_sc_support(iommu->ecap) &&
4457            domain_update_iommu_snooping(iommu)) {
4458                pr_warn("%s: Doesn't support snooping.\n",
4459                        iommu->name);
4460                return -ENXIO;
4461        }
4462        sp = domain_update_iommu_superpage(iommu) - 1;
4463        if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4464                pr_warn("%s: Doesn't support large page.\n",
4465                        iommu->name);
4466                return -ENXIO;
4467        }
4468
4469        /*
4470         * Disable translation if already enabled prior to OS handover.
4471         */
4472        if (iommu->gcmd & DMA_GCMD_TE)
4473                iommu_disable_translation(iommu);
4474
4475        g_iommus[iommu->seq_id] = iommu;
4476        ret = iommu_init_domains(iommu);
4477        if (ret == 0)
4478                ret = iommu_alloc_root_entry(iommu);
4479        if (ret)
4480                goto out;
4481
4482#ifdef CONFIG_INTEL_IOMMU_SVM
4483        if (pasid_supported(iommu))
4484                intel_svm_init(iommu);
4485#endif
4486
4487        if (dmaru->ignored) {
4488                /*
4489                 * we always have to disable PMRs or DMA may fail on this device
4490                 */
4491                if (force_on)
4492                        iommu_disable_protect_mem_regions(iommu);
4493                return 0;
4494        }
4495
4496        intel_iommu_init_qi(iommu);
4497        iommu_flush_write_buffer(iommu);
4498
4499#ifdef CONFIG_INTEL_IOMMU_SVM
4500        if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4501                ret = intel_svm_enable_prq(iommu);
4502                if (ret)
4503                        goto disable_iommu;
4504        }
4505#endif
4506        ret = dmar_set_interrupt(iommu);
4507        if (ret)
4508                goto disable_iommu;
4509
4510        iommu_set_root_entry(iommu);
4511        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4512        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4513        iommu_enable_translation(iommu);
4514
4515        iommu_disable_protect_mem_regions(iommu);
4516        return 0;
4517
4518disable_iommu:
4519        disable_dmar_iommu(iommu);
4520out:
4521        free_dmar_iommu(iommu);
4522        return ret;
4523}
4524
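/*
 * DMAR hotplug entry point: insert (initialize and enable) or remove
 * (disable and free) the IOMMU associated with @dmaru.
 */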
4525int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4526{
4527        int ret = 0;
4528        struct intel_iommu *iommu = dmaru->iommu;
4529
4530        if (!intel_iommu_enabled)
4531                return 0;
4532        if (!iommu)
4533                return -EINVAL;
4534
4535        if (insert) {
4536                ret = intel_iommu_add(dmaru);
4537        } else {
4538                disable_dmar_iommu(iommu);
4539                free_dmar_iommu(iommu);
4540        }
4541
4542        return ret;
4543}
4544
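/* Free all RMRR and ATSR bookkeeping built from the ACPI DMAR table. */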
4545static void intel_iommu_free_dmars(void)
4546{
4547        struct dmar_rmrr_unit *rmrru, *rmrr_n;
4548        struct dmar_atsr_unit *atsru, *atsr_n;
4549
4550        list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4551                list_del(&rmrru->list);
4552                dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4553                kfree(rmrru);
4554        }
4555
4556        list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4557                list_del(&atsru->list);
4558                intel_iommu_free_atsr(atsru);
4559        }
4560}
4561
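/*
 * Decide whether ATS may be used for @dev: root-complex integrated devices
 * are allowed, devices reached through a conventional PCI bridge are not,
 * and devices below a PCIe root port are allowed only if that port is
 * listed in an ATSR for the device's segment (or the ATSR covers all
 * ports).  Returns 1 if ATS is permitted, 0 otherwise.
 */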
4562int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4563{
4564        int i, ret = 1;
4565        struct pci_bus *bus;
4566        struct pci_dev *bridge = NULL;
4567        struct device *tmp;
4568        struct acpi_dmar_atsr *atsr;
4569        struct dmar_atsr_unit *atsru;
4570
4571        dev = pci_physfn(dev);
4572        for (bus = dev->bus; bus; bus = bus->parent) {
4573                bridge = bus->self;
4574                /* If it's an integrated device, allow ATS */
4575                if (!bridge)
4576                        return 1;
4577                /* Connected via non-PCIe: no ATS */
4578                if (!pci_is_pcie(bridge) ||
4579                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4580                        return 0;
4581                /* If we found the root port, look it up in the ATSR */
4582                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4583                        break;
4584        }
4585
4586        rcu_read_lock();
4587        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4588                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4589                if (atsr->segment != pci_domain_nr(dev->bus))
4590                        continue;
4591
4592                for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4593                        if (tmp == &bridge->dev)
4594                                goto out;
4595
4596                if (atsru->include_all)
4597                        goto out;
4598        }
4599        ret = 0;
4600out:
4601        rcu_read_unlock();
4602
4603        return ret;
4604}
4605
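/*
 * Called when the DMAR code is notified of a PCI device being added or
 * removed: keep the device scope lists of all RMRR and ATSR units in sync.
 */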
4606int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4607{
4608        int ret;
4609        struct dmar_rmrr_unit *rmrru;
4610        struct dmar_atsr_unit *atsru;
4611        struct acpi_dmar_atsr *atsr;
4612        struct acpi_dmar_reserved_memory *rmrr;
4613
4614        if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4615                return 0;
4616
4617        list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4618                rmrr = container_of(rmrru->hdr,
4619                                    struct acpi_dmar_reserved_memory, header);
4620                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4621                        ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4622                                ((void *)rmrr) + rmrr->header.length,
4623                                rmrr->segment, rmrru->devices,
4624                                rmrru->devices_cnt);
4625                        if (ret < 0)
4626                                return ret;
4627                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4628                        dmar_remove_dev_scope(info, rmrr->segment,
4629                                rmrru->devices, rmrru->devices_cnt);
4630                }
4631        }
4632
4633        list_for_each_entry(atsru, &dmar_atsr_units, list) {
4634                if (atsru->include_all)
4635                        continue;
4636
4637                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4638                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4639                        ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4640                                        (void *)atsr + atsr->header.length,
4641                                        atsr->segment, atsru->devices,
4642                                        atsru->devices_cnt);
4643                        if (ret > 0)
4644                                break;
4645                        else if (ret < 0)
4646                                return ret;
4647                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4648                        if (dmar_remove_dev_scope(info, atsr->segment,
4649                                        atsru->devices, atsru->devices_cnt))
4650                                break;
4651                }
4652        }
4653
4654        return 0;
4655}
4656
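/*
 * Memory hotplug notifier: extend the si_domain identity mapping when a
 * memory block goes online, and unmap the corresponding IOVA range (with
 * the necessary IOTLB flushes) when it goes offline or onlining fails.
 */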
4657static int intel_iommu_memory_notifier(struct notifier_block *nb,
4658                                       unsigned long val, void *v)
4659{
4660        struct memory_notify *mhp = v;
4661        unsigned long long start, end;
4662        unsigned long start_vpfn, last_vpfn;
4663
4664        switch (val) {
4665        case MEM_GOING_ONLINE:
4666                start = mhp->start_pfn << PAGE_SHIFT;
4667                end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4668                if (iommu_domain_identity_map(si_domain, start, end)) {
4669                        pr_warn("Failed to build identity map for [%llx-%llx]\n",
4670                                start, end);
4671                        return NOTIFY_BAD;
4672                }
4673                break;
4674
4675        case MEM_OFFLINE:
4676        case MEM_CANCEL_ONLINE:
4677                start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4678                last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4679                while (start_vpfn <= last_vpfn) {
4680                        struct iova *iova;
4681                        struct dmar_drhd_unit *drhd;
4682                        struct intel_iommu *iommu;
4683                        struct page *freelist;
4684
4685                        iova = find_iova(&si_domain->iovad, start_vpfn);
4686                        if (!iova) {
4687                                pr_debug("Failed to get IOVA for PFN %lx\n",
4688                                         start_vpfn);
4689                                break;
4690                        }
4691
4692                        iova = split_and_remove_iova(&si_domain->iovad, iova,
4693                                                     start_vpfn, last_vpfn);
4694                        if (!iova) {
4695                                pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4696                                        start_vpfn, last_vpfn);
4697                                return NOTIFY_BAD;
4698                        }
4699
4700                        freelist = domain_unmap(si_domain, iova->pfn_lo,
4701                                               iova->pfn_hi);
4702
4703                        rcu_read_lock();
4704                        for_each_active_iommu(iommu, drhd)
4705                                iommu_flush_iotlb_psi(iommu, si_domain,
4706                                        iova->pfn_lo, iova_size(iova),
4707                                        !freelist, 0);
4708                        rcu_read_unlock();
4709                        dma_free_pagelist(freelist);
4710
4711                        start_vpfn = iova->pfn_hi + 1;
4712                        free_iova_mem(iova);
4713                }
4714                break;
4715        }
4716
4717        return NOTIFY_OK;
4718}
4719
4720static struct notifier_block intel_iommu_memory_nb = {
4721        .notifier_call = intel_iommu_memory_notifier,
4722        .priority = 0
4723};
4724
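/*
 * Purge the IOVA ranges cached for @cpu in every active domain; used by
 * the CPU hotplug "dead" callback below.
 */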
4725static void free_all_cpu_cached_iovas(unsigned int cpu)
4726{
4727        int i;
4728
4729        for (i = 0; i < g_num_of_iommus; i++) {
4730                struct intel_iommu *iommu = g_iommus[i];
4731                struct dmar_domain *domain;
4732                int did;
4733
4734                if (!iommu)
4735                        continue;
4736
4737                for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4738                        domain = get_iommu_domain(iommu, (u16)did);
4739
4740                        if (!domain)
4741                                continue;
4742                        free_cpu_cached_iovas(cpu, &domain->iovad);
4743                }
4744        }
4745}
4746
4747static int intel_iommu_cpu_dead(unsigned int cpu)
4748{
4749        free_all_cpu_cached_iovas(cpu);
4750        return 0;
4751}
4752
4753static void intel_disable_iommus(void)
4754{
4755        struct intel_iommu *iommu = NULL;
4756        struct dmar_drhd_unit *drhd;
4757
4758        for_each_iommu(iommu, drhd)
4759                iommu_disable_translation(iommu);
4760}
4761
4762static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4763{
4764        struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4765
4766        return container_of(iommu_dev, struct intel_iommu, iommu);
4767}
4768
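/*
 * Read-only sysfs attributes exported for each remapping unit (typically
 * under /sys/class/iommu/dmar<N>/intel-iommu/): hardware version, register
 * base address, capability registers and domain usage.
 */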
4769static ssize_t intel_iommu_show_version(struct device *dev,
4770                                        struct device_attribute *attr,
4771                                        char *buf)
4772{
4773        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4774        u32 ver = readl(iommu->reg + DMAR_VER_REG);
4775        return sprintf(buf, "%d:%d\n",
4776                       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4777}
4778static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4779
4780static ssize_t intel_iommu_show_address(struct device *dev,
4781                                        struct device_attribute *attr,
4782                                        char *buf)
4783{
4784        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4785        return sprintf(buf, "%llx\n", iommu->reg_phys);
4786}
4787static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4788
4789static ssize_t intel_iommu_show_cap(struct device *dev,
4790                                    struct device_attribute *attr,
4791                                    char *buf)
4792{
4793        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4794        return sprintf(buf, "%llx\n", iommu->cap);
4795}
4796static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4797
4798static ssize_t intel_iommu_show_ecap(struct device *dev,
4799                                    struct device_attribute *attr,
4800                                    char *buf)
4801{
4802        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4803        return sprintf(buf, "%llx\n", iommu->ecap);
4804}
4805static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4806
4807static ssize_t intel_iommu_show_ndoms(struct device *dev,
4808                                      struct device_attribute *attr,
4809                                      char *buf)
4810{
4811        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4812        return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4813}
4814static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4815
4816static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4817                                           struct device_attribute *attr,
4818                                           char *buf)
4819{
4820        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4821        return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4822                                                  cap_ndoms(iommu->cap)));
4823}
4824static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4825
4826static struct attribute *intel_iommu_attrs[] = {
4827        &dev_attr_version.attr,
4828        &dev_attr_address.attr,
4829        &dev_attr_cap.attr,
4830        &dev_attr_ecap.attr,
4831        &dev_attr_domains_supported.attr,
4832        &dev_attr_domains_used.attr,
4833        NULL,
4834};
4835
4836static struct attribute_group intel_iommu_group = {
4837        .name = "intel-iommu",
4838        .attrs = intel_iommu_attrs,
4839};
4840
4841const struct attribute_group *intel_iommu_groups[] = {
4842        &intel_iommu_group,
4843        NULL,
4844};
4845
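/* Is any PCI device in the system marked untrusted (e.g. external-facing)? */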
4846static inline bool has_untrusted_dev(void)
4847{
4848        struct pci_dev *pdev = NULL;
4849
4850        for_each_pci_dev(pdev)
4851                if (pdev->untrusted)
4852                        return true;
4853
4854        return false;
4855}
4856
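/*
 * Honour the DMAR platform opt-in: if the firmware asks for DMA protection
 * and an untrusted device is present, force the IOMMU on even if it was
 * disabled on the command line.  Returns 1 when the IOMMU is forced on.
 */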
4857static int __init platform_optin_force_iommu(void)
4858{
4859        if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4860                return 0;
4861
4862        if (no_iommu || dmar_disabled)
4863                pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4864
4865        /*
4866         * If Intel-IOMMU is disabled by default, we will apply identity
4867         * map for all devices except those marked as being untrusted.
4868         */
4869        if (dmar_disabled)
4870                iommu_identity_mapping |= IDENTMAP_ALL;
4871
4872        dmar_disabled = 0;
4873        no_iommu = 0;
4874
4875        return 1;
4876}
4877
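/*
 * Walk the DRHD device scopes for devices enumerated through ACPI rather
 * than PCI and probe them, so that they are also attached to the Intel
 * IOMMU ops.
 */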
4878static int __init probe_acpi_namespace_devices(void)
4879{
4880        struct dmar_drhd_unit *drhd;
4881        /* To avoid a -Wunused-but-set-variable warning. */
4882        struct intel_iommu *iommu __maybe_unused;
4883        struct device *dev;
4884        int i, ret = 0;
4885
4886        for_each_active_iommu(iommu, drhd) {
4887                for_each_active_dev_scope(drhd->devices,
4888                                          drhd->devices_cnt, i, dev) {
4889                        struct acpi_device_physical_node *pn;
4890                        struct iommu_group *group;
4891                        struct acpi_device *adev;
4892
4893                        if (dev->bus != &acpi_bus_type)
4894                                continue;
4895
4896                        adev = to_acpi_device(dev);
4897                        mutex_lock(&adev->physical_node_lock);
4898                        list_for_each_entry(pn,
4899                                            &adev->physical_node_list, node) {
4900                                group = iommu_group_get(pn->dev);
4901                                if (group) {
4902                                        iommu_group_put(group);
4903                                        continue;
4904                                }
4905
4906                                pn->dev->bus->iommu_ops = &intel_iommu_ops;
4907                                ret = iommu_probe_device(pn->dev);
4908                                if (ret)
4909                                        break;
4910                        }
4911                        mutex_unlock(&adev->physical_node_lock);
4912
4913                        if (ret)
4914                                return ret;
4915                }
4916        }
4917
4918        return 0;
4919}
4920
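/*
 * Main initialization entry point for DMA remapping: parse the DMAR table,
 * initialize every remapping unit via init_dmars(), install the DMA and
 * IOMMU ops, register sysfs/notifiers, and finally enable translation.
 */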
4921int __init intel_iommu_init(void)
4922{
4923        int ret = -ENODEV;
4924        struct dmar_drhd_unit *drhd;
4925        struct intel_iommu *iommu;
4926
4927        /*
4928         * Intel IOMMU is required for a TXT/tboot launch or platform
4929         * opt in, so enforce that.
4930         */
4931        force_on = tboot_force_iommu() || platform_optin_force_iommu();
4932
4933        if (iommu_init_mempool()) {
4934                if (force_on)
4935                        panic("tboot: Failed to initialize iommu memory\n");
4936                return -ENOMEM;
4937        }
4938
4939        down_write(&dmar_global_lock);
4940        if (dmar_table_init()) {
4941                if (force_on)
4942                        panic("tboot: Failed to initialize DMAR table\n");
4943                goto out_free_dmar;
4944        }
4945
4946        if (dmar_dev_scope_init() < 0) {
4947                if (force_on)
4948                        panic("tboot: Failed to initialize DMAR device scope\n");
4949                goto out_free_dmar;
4950        }
4951
4952        up_write(&dmar_global_lock);
4953
4954        /*
4955         * The bus notifier takes the dmar_global_lock, so lockdep will
4956         * complain later when we register it under the lock.
4957         */
4958        dmar_register_bus_notifier();
4959
4960        down_write(&dmar_global_lock);
4961
4962        if (no_iommu || dmar_disabled) {
4963                /*
4964                 * We exit the function here to ensure IOMMU's remapping and
4965                 * mempool aren't setup, which means that the IOMMU's PMRs
4966                 * won't be disabled via the call to init_dmars(). So disable
4967                 * it explicitly here. The PMRs were setup by tboot prior to
4968                 * calling SENTER, but the kernel is expected to reset/tear
4969                 * down the PMRs.
4970                 */
4971                if (intel_iommu_tboot_noforce) {
4972                        for_each_iommu(iommu, drhd)
4973                                iommu_disable_protect_mem_regions(iommu);
4974                }
4975
4976                /*
4977                 * Make sure the IOMMUs are switched off, even when we
4978                 * boot into a kexec kernel and the previous kernel left
4979                 * them enabled
4980                 */
4981                intel_disable_iommus();
4982                goto out_free_dmar;
4983        }
4984
4985        if (list_empty(&dmar_rmrr_units))
4986                pr_info("No RMRR found\n");
4987
4988        if (list_empty(&dmar_atsr_units))
4989                pr_info("No ATSR found\n");
4990
4991        if (dmar_init_reserved_ranges()) {
4992                if (force_on)
4993                        panic("tboot: Failed to reserve iommu ranges\n");
4994                goto out_free_reserved_range;
4995        }
4996
4997        if (dmar_map_gfx)
4998                intel_iommu_gfx_mapped = 1;
4999
5000        init_no_remapping_devices();
5001
5002        ret = init_dmars();
5003        if (ret) {
5004                if (force_on)
5005                        panic("tboot: Failed to initialize DMARs\n");
5006                pr_err("Initialization failed\n");
5007                goto out_free_reserved_range;
5008        }
5009        up_write(&dmar_global_lock);
5010
5011#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5012        /*
5013         * If the system has no untrusted device or the user has decided
5014         * to disable the bounce page mechanisms, we don't need swiotlb.
5015         * Clear the swiotlb flag here so that the pre-allocated bounce
5016         * pages are released later.
5017         */
5018        if (!has_untrusted_dev() || intel_no_bounce)
5019                swiotlb = 0;
5020#endif
5021        dma_ops = &intel_dma_ops;
5022
5023        init_iommu_pm_ops();
5024
5025        for_each_active_iommu(iommu, drhd) {
5026                iommu_device_sysfs_add(&iommu->iommu, NULL,
5027                                       intel_iommu_groups,
5028                                       "%s", iommu->name);
5029                iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5030                iommu_device_register(&iommu->iommu);
5031        }
5032
5033        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5034        if (si_domain && !hw_pass_through)
5035                register_memory_notifier(&intel_iommu_memory_nb);
5036        cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5037                          intel_iommu_cpu_dead);
5038
5039        down_read(&dmar_global_lock);
5040        if (probe_acpi_namespace_devices())
5041                pr_warn("ACPI namespace devices didn't probe correctly\n");
5042        up_read(&dmar_global_lock);
5043
5044        /* Finally, we enable the DMA remapping hardware. */
5045        for_each_iommu(iommu, drhd) {
5046                if (!drhd->ignored && !translation_pre_enabled(iommu))
5047                        iommu_enable_translation(iommu);
5048
5049                iommu_disable_protect_mem_regions(iommu);
5050        }
5051        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5052
5053        intel_iommu_enabled = 1;
5054        intel_iommu_debugfs_init();
5055
5056        return 0;
5057
5058out_free_reserved_range:
5059        put_iova_domain(&reserved_iova_list);
5060out_free_dmar:
5061        intel_iommu_free_dmars();
5062        up_write(&dmar_global_lock);
5063        iommu_exit_mempool();
5064        return ret;
5065}
5066
5067static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5068{
5069        struct intel_iommu *iommu = opaque;
5070
5071        domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5072        return 0;
5073}
5074
5075/*
5076 * NB - intel-iommu lacks any sort of reference counting for the users of
5077 * dependent devices.  If multiple endpoints have intersecting dependent
5078 * devices, unbinding the driver from any one of them will possibly leave
5079 * the others unable to operate.
5080 */
5081static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5082{
5083        if (!iommu || !dev || !dev_is_pci(dev))
5084                return;
5085
5086        pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5087}
5088
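/*
 * Tear down everything that binds the device described by @info to its
 * domain: RID2PASID entry, device IOTLB, context entries and PASID table.
 * The domain is detached from the IOMMU and, if it was a private domain
 * with no devices left, freed as well.
 */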
5089static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5090{
5091        struct dmar_domain *domain;
5092        struct intel_iommu *iommu;
5093        unsigned long flags;
5094
5095        assert_spin_locked(&device_domain_lock);
5096
5097        if (WARN_ON(!info))
5098                return;
5099
5100        iommu = info->iommu;
5101        domain = info->domain;
5102
5103        if (info->dev) {
5104                if (dev_is_pci(info->dev) && sm_supported(iommu))
5105                        intel_pasid_tear_down_entry(iommu, info->dev,
5106                                        PASID_RID2PASID);
5107
5108                iommu_disable_dev_iotlb(info);
5109                domain_context_clear(iommu, info->dev);
5110                intel_pasid_free_table(info->dev);
5111        }
5112
5113        unlink_domain_info(info);
5114
5115        spin_lock_irqsave(&iommu->lock, flags);
5116        domain_detach_iommu(domain, iommu);
5117        spin_unlock_irqrestore(&iommu->lock, flags);
5118
5119        /* free the private domain */
5120        if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5121            !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5122            list_empty(&domain->devices))
5123                domain_exit(info->domain);
5124
5125        free_devinfo_mem(info);
5126}
5127
5128static void dmar_remove_one_dev_info(struct device *dev)
5129{
5130        struct device_domain_info *info;
5131        unsigned long flags;
5132
5133        spin_lock_irqsave(&device_domain_lock, flags);
5134        info = dev->archdata.iommu;
5135        if (info)
5136                __dmar_remove_one_dev_info(info);
5137        spin_unlock_irqrestore(&device_domain_lock, flags);
5138}
5139
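/*
 * Initialize a domain allocated through the IOMMU API: set up its IOVA
 * allocator, reserve the special ranges, derive the AGAW from @guest_width
 * and allocate the top level page directory.
 */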
5140static int md_domain_init(struct dmar_domain *domain, int guest_width)
5141{
5142        int adjust_width;
5143
5144        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5145        domain_reserve_special_ranges(domain);
5146
5147        /* calculate AGAW */
5148        domain->gaw = guest_width;
5149        adjust_width = guestwidth_to_adjustwidth(guest_width);
5150        domain->agaw = width_to_agaw(adjust_width);
5151
5152        domain->iommu_coherency = 0;
5153        domain->iommu_snooping = 0;
5154        domain->iommu_superpage = 0;
5155        domain->max_addr = 0;
5156
5157        /* always allocate the top pgd */
5158        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5159        if (!domain->pgd)
5160                return -ENOMEM;
5161        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5162        return 0;
5163}
5164
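/*
 * iommu_ops->domain_alloc callback: DMA and unmanaged domains get a fresh
 * dmar_domain (DMA domains additionally get an IOVA flush queue), while
 * identity requests are served by the single static si_domain.
 */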
5165static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5166{
5167        struct dmar_domain *dmar_domain;
5168        struct iommu_domain *domain;
5169
5170        switch (type) {
5171        case IOMMU_DOMAIN_DMA:
5172        /* fallthrough */
5173        case IOMMU_DOMAIN_UNMANAGED:
5174                dmar_domain = alloc_domain(0);
5175                if (!dmar_domain) {
5176                        pr_err("Can't allocate dmar_domain\n");
5177                        return NULL;
5178                }
5179                if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5180                        pr_err("Domain initialization failed\n");
5181                        domain_exit(dmar_domain);
5182                        return NULL;
5183                }
5184
5185                if (type == IOMMU_DOMAIN_DMA &&
5186                    init_iova_flush_queue(&dmar_domain->iovad,
5187                                          iommu_flush_iova, iova_entry_free)) {
5188                        pr_warn("iova flush queue initialization failed\n");
5189                        intel_iommu_strict = 1;
5190                }
5191
5192                domain_update_iommu_cap(dmar_domain);
5193
5194                domain = &dmar_domain->domain;
5195                domain->geometry.aperture_start = 0;
5196                domain->geometry.aperture_end   =
5197                                __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5198                domain->geometry.force_aperture = true;
5199
5200                return domain;
5201        case IOMMU_DOMAIN_IDENTITY:
5202                return &si_domain->domain;
5203        default:
5204                return NULL;
5205        }
5206
5207        return NULL;
5208}
5209
5210static void intel_iommu_domain_free(struct iommu_domain *domain)
5211{
5212        if (domain != &si_domain->domain)
5213                domain_exit(to_dmar_domain(domain));
5214}
5215
5216/*
5217 * Check whether a @domain could be attached to the @dev through the
5218 * aux-domain attach/detach APIs.
5219 */
5220static inline bool
5221is_aux_domain(struct device *dev, struct iommu_domain *domain)
5222{
5223        struct device_domain_info *info = dev->archdata.iommu;
5224
5225        return info && info->auxd_enabled &&
5226                        domain->type == IOMMU_DOMAIN_UNMANAGED;
5227}
5228
5229static void auxiliary_link_device(struct dmar_domain *domain,
5230                                  struct device *dev)
5231{
5232        struct device_domain_info *info = dev->archdata.iommu;
5233
5234        assert_spin_locked(&device_domain_lock);
5235        if (WARN_ON(!info))
5236                return;
5237
5238        domain->auxd_refcnt++;
5239        list_add(&domain->auxd, &info->auxiliary_domains);
5240}
5241
5242static void auxiliary_unlink_device(struct dmar_domain *domain,
5243                                    struct device *dev)
5244{
5245        struct device_domain_info *info = dev->archdata.iommu;
5246
5247        assert_spin_locked(&device_domain_lock);
5248        if (WARN_ON(!info))
5249                return;
5250
5251        list_del(&domain->auxd);
5252        domain->auxd_refcnt--;
5253
5254        if (!domain->auxd_refcnt && domain->default_pasid > 0)
5255                intel_pasid_free_id(domain->default_pasid);
5256}
5257
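/*
 * Attach @domain to @dev as an auxiliary domain: allocate a default PASID
 * for the domain if it does not have one yet, attach the domain to the
 * device's IOMMU, install a second-level PASID table entry and link the
 * domain into the device's auxiliary domain list.
 */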
5258static int aux_domain_add_dev(struct dmar_domain *domain,
5259                              struct device *dev)
5260{
5261        int ret;
5262        u8 bus, devfn;
5263        unsigned long flags;
5264        struct intel_iommu *iommu;
5265
5266        iommu = device_to_iommu(dev, &bus, &devfn);
5267        if (!iommu)
5268                return -ENODEV;
5269
5270        if (domain->default_pasid <= 0) {
5271                int pasid;
5272
5273                pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5274                                             pci_max_pasids(to_pci_dev(dev)),
5275                                             GFP_KERNEL);
5276                if (pasid <= 0) {
5277                        pr_err("Can't allocate default pasid\n");
5278                        return -ENODEV;
5279                }
5280                domain->default_pasid = pasid;
5281        }
5282
5283        spin_lock_irqsave(&device_domain_lock, flags);
5284        /*
5285         * iommu->lock must be held to attach domain to iommu and setup the
5286         * pasid entry for second level translation.
5287         */
5288        spin_lock(&iommu->lock);
5289        ret = domain_attach_iommu(domain, iommu);
5290        if (ret)
5291                goto attach_failed;
5292
5293        /* Set up the PASID entry for mediated devices. */
5294        ret = intel_pasid_setup_second_level(iommu, domain, dev,
5295                                             domain->default_pasid);
5296        if (ret)
5297                goto table_failed;
5298        spin_unlock(&iommu->lock);
5299
5300        auxiliary_link_device(domain, dev);
5301
5302        spin_unlock_irqrestore(&device_domain_lock, flags);
5303
5304        return 0;
5305
5306table_failed:
5307        domain_detach_iommu(domain, iommu);
5308attach_failed:
5309        spin_unlock(&iommu->lock);
5310        spin_unlock_irqrestore(&device_domain_lock, flags);
5311        if (!domain->auxd_refcnt && domain->default_pasid > 0)
5312                intel_pasid_free_id(domain->default_pasid);
5313
5314        return ret;
5315}
5316
5317static void aux_domain_remove_dev(struct dmar_domain *domain,
5318                                  struct device *dev)
5319{
5320        struct device_domain_info *info;
5321        struct intel_iommu *iommu;
5322        unsigned long flags;
5323
5324        if (!is_aux_domain(dev, &domain->domain))
5325                return;
5326
5327        spin_lock_irqsave(&device_domain_lock, flags);
5328        info = dev->archdata.iommu;
5329        iommu = info->iommu;
5330
5331        auxiliary_unlink_device(domain, dev);
5332
5333        spin_lock(&iommu->lock);
5334        intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5335        domain_detach_iommu(domain, iommu);
5336        spin_unlock(&iommu->lock);
5337
5338        spin_unlock_irqrestore(&device_domain_lock, flags);
5339}
5340
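/*
 * Common checks before attaching @dev to @domain: make sure the IOMMU's
 * address width covers everything already mapped in the domain, then trim
 * unused upper page table levels so the domain AGAW does not exceed what
 * the IOMMU supports.
 */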
5341static int prepare_domain_attach_device(struct iommu_domain *domain,
5342                                        struct device *dev)
5343{
5344        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5345        struct intel_iommu *iommu;
5346        int addr_width;
5347        u8 bus, devfn;
5348
5349        iommu = device_to_iommu(dev, &bus, &devfn);
5350        if (!iommu)
5351                return -ENODEV;
5352
5353        /* check if this iommu agaw is sufficient for max mapped address */
5354        addr_width = agaw_to_width(iommu->agaw);
5355        if (addr_width > cap_mgaw(iommu->cap))
5356                addr_width = cap_mgaw(iommu->cap);
5357
5358        if (dmar_domain->max_addr > (1LL << addr_width)) {
5359                dev_err(dev, "%s: iommu width (%d) is not "
5360                        "sufficient for the mapped address (%llx)\n",
5361                        __func__, addr_width, dmar_domain->max_addr);
5362                return -EFAULT;
5363        }
5364        dmar_domain->gaw = addr_width;
5365
5366        /*
5367         * Knock out extra levels of page tables if necessary
5368         */
5369        while (iommu->agaw < dmar_domain->agaw) {
5370                struct dma_pte *pte;
5371
5372                pte = dmar_domain->pgd;
5373                if (dma_pte_present(pte)) {
5374                        dmar_domain->pgd = (struct dma_pte *)
5375                                phys_to_virt(dma_pte_addr(pte));
5376                        free_pgtable_page(pte);
5377                }
5378                dmar_domain->agaw--;
5379        }
5380
5381        return 0;
5382}
5383
5384static int intel_iommu_attach_device(struct iommu_domain *domain,
5385                                     struct device *dev)
5386{
5387        int ret;
5388
5389        if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5390            device_is_rmrr_locked(dev)) {
5391                dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5392                return -EPERM;
5393        }
5394
5395        if (is_aux_domain(dev, domain))
5396                return -EPERM;
5397
5398        /* normally dev is not mapped */
5399        if (unlikely(domain_context_mapped(dev))) {
5400                struct dmar_domain *old_domain;
5401
5402                old_domain = find_domain(dev);
5403                if (old_domain)
5404                        dmar_remove_one_dev_info(dev);
5405        }
5406
5407        ret = prepare_domain_attach_device(domain, dev);
5408        if (ret)
5409                return ret;
5410
5411        return domain_add_dev_info(to_dmar_domain(domain), dev);
5412}
5413
5414static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5415                                         struct device *dev)
5416{
5417        int ret;
5418
5419        if (!is_aux_domain(dev, domain))
5420                return -EPERM;
5421
5422        ret = prepare_domain_attach_device(domain, dev);
5423        if (ret)
5424                return ret;
5425
5426        return aux_domain_add_dev(to_dmar_domain(domain), dev);
5427}
5428
5429static void intel_iommu_detach_device(struct iommu_domain *domain,
5430                                      struct device *dev)
5431{
5432        dmar_remove_one_dev_info(dev);
5433}
5434
5435static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5436                                          struct device *dev)
5437{
5438        aux_domain_remove_dev(to_dmar_domain(domain), dev);
5439}
5440
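/*
 * iommu_ops->map callback: translate IOMMU_READ/WRITE/CACHE into DMA PTE
 * permission bits and verify that the mapping fits within the domain's
 * address width before installing it.
 */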
5441static int intel_iommu_map(struct iommu_domain *domain,
5442                           unsigned long iova, phys_addr_t hpa,
5443                           size_t size, int iommu_prot)
5444{
5445        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5446        u64 max_addr;
5447        int prot = 0;
5448        int ret;
5449
5450        if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5451                return -EINVAL;
5452
5453        if (iommu_prot & IOMMU_READ)
5454                prot |= DMA_PTE_READ;
5455        if (iommu_prot & IOMMU_WRITE)
5456                prot |= DMA_PTE_WRITE;
5457        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5458                prot |= DMA_PTE_SNP;
5459
5460        max_addr = iova + size;
5461        if (dmar_domain->max_addr < max_addr) {
5462                u64 end;
5463
5464                /* check if minimum agaw is sufficient for mapped address */
5465                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5466                if (end < max_addr) {
5467                        pr_err("%s: iommu width (%d) is not "
5468                               "sufficient for the mapped address (%llx)\n",
5469                               __func__, dmar_d