linux/drivers/iommu/intel-iommu.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright © 2006-2014 Intel Corporation.
   4 *
   5 * Authors: David Woodhouse <dwmw2@infradead.org>,
   6 *          Ashok Raj <ashok.raj@intel.com>,
   7 *          Shaohua Li <shaohua.li@intel.com>,
   8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9 *          Fenghua Yu <fenghua.yu@intel.com>
  10 *          Joerg Roedel <jroedel@suse.de>
  11 */
  12
  13#define pr_fmt(fmt)     "DMAR: " fmt
  14#define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16#include <linux/init.h>
  17#include <linux/bitmap.h>
  18#include <linux/debugfs.h>
  19#include <linux/export.h>
  20#include <linux/slab.h>
  21#include <linux/irq.h>
  22#include <linux/interrupt.h>
  23#include <linux/spinlock.h>
  24#include <linux/pci.h>
  25#include <linux/dmar.h>
  26#include <linux/dma-mapping.h>
  27#include <linux/mempool.h>
  28#include <linux/memory.h>
  29#include <linux/cpu.h>
  30#include <linux/timer.h>
  31#include <linux/io.h>
  32#include <linux/iova.h>
  33#include <linux/iommu.h>
  34#include <linux/intel-iommu.h>
  35#include <linux/syscore_ops.h>
  36#include <linux/tboot.h>
  37#include <linux/dmi.h>
  38#include <linux/pci-ats.h>
  39#include <linux/memblock.h>
  40#include <linux/dma-contiguous.h>
  41#include <linux/dma-direct.h>
  42#include <linux/crash_dump.h>
  43#include <linux/numa.h>
  44#include <asm/irq_remapping.h>
  45#include <asm/cacheflush.h>
  46#include <asm/iommu.h>
  47
  48#include "irq_remapping.h"
  49#include "intel-pasid.h"
  50
  51#define ROOT_SIZE               VTD_PAGE_SIZE
  52#define CONTEXT_SIZE            VTD_PAGE_SIZE
  53
  54#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  55#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  56#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  57#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  58
  59#define IOAPIC_RANGE_START      (0xfee00000)
  60#define IOAPIC_RANGE_END        (0xfeefffff)
  61#define IOVA_START_ADDR         (0x1000)
  62
  63#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  64
  65#define MAX_AGAW_WIDTH 64
  66#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  67
  68#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  69#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  70
  71/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  72   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  73#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  74                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  75#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  76
  77/* IO virtual address start page frame number */
  78#define IOVA_START_PFN          (1)
  79
  80#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  81
  82/* page table handling */
  83#define LEVEL_STRIDE            (9)
  84#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  85
  86/*
   87 * This bitmap is used to advertise the page sizes our hardware supports
  88 * to the IOMMU core, which will then use this information to split
  89 * physically contiguous memory regions it is mapping into page sizes
  90 * that we support.
  91 *
  92 * Traditionally the IOMMU core just handed us the mappings directly,
   93 * after making sure the size is a power-of-two multiple of 4KiB and
   94 * that the mapping has natural alignment.
  95 *
   96 * To retain this behavior, we currently advertise that we support
   97 * all power-of-two page sizes of 4KiB and above.
  98 *
  99 * If at some point we'd like to utilize the IOMMU core's new behavior,
 100 * we could change this to advertise the real page sizes we support.
 101 */
 102#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 103
 104static inline int agaw_to_level(int agaw)
 105{
 106        return agaw + 2;
 107}
 108
 109static inline int agaw_to_width(int agaw)
 110{
 111        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 112}
 113
 114static inline int width_to_agaw(int width)
 115{
 116        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 117}
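
     /*
      * Worked example of the AGAW helpers above (straight from the
      * formulas): agaw 1 is a 39-bit width and a 3-level table, agaw 2 is
      * 48-bit/4-level, agaw 3 is 57-bit/5-level; so
      * width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH) == 3.
      */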
 118
 119static inline unsigned int level_to_offset_bits(int level)
 120{
 121        return (level - 1) * LEVEL_STRIDE;
 122}
 123
 124static inline int pfn_level_offset(unsigned long pfn, int level)
 125{
 126        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 127}
 128
 129static inline unsigned long level_mask(int level)
 130{
 131        return -1UL << level_to_offset_bits(level);
 132}
 133
 134static inline unsigned long level_size(int level)
 135{
 136        return 1UL << level_to_offset_bits(level);
 137}
 138
 139static inline unsigned long align_to_level(unsigned long pfn, int level)
 140{
 141        return (pfn + level_size(level) - 1) & level_mask(level);
 142}
 143
 144static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 145{
 146        return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 147}
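
     /*
      * For reference, with 4KiB VT-d pages: lvl_to_nr_pages(1) == 1 (a
      * 4KiB page), lvl_to_nr_pages(2) == 512 (a 2MiB superpage) and
      * lvl_to_nr_pages(3) == 512 * 512 (a 1GiB superpage).
      */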
 148
  149/* VT-d pages must never be _larger_ than MM pages. Otherwise things
  150   are never going to work. */
 151static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 152{
 153        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 154}
 155
 156static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 157{
 158        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 159}
 160static inline unsigned long page_to_dma_pfn(struct page *pg)
 161{
 162        return mm_to_dma_pfn(page_to_pfn(pg));
 163}
 164static inline unsigned long virt_to_dma_pfn(void *p)
 165{
 166        return page_to_dma_pfn(virt_to_page(p));
 167}
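
     /*
      * On typical x86 configurations PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so
      * the two conversions above are no-ops; the shifts only matter when MM
      * pages are larger than the 4KiB VT-d page.
      */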
 168
 169/* global iommu list, set NULL for ignored DMAR units */
 170static struct intel_iommu **g_iommus;
 171
 172static void __init check_tylersburg_isoch(void);
 173static int rwbf_quirk;
 174
 175/*
  176 * Set to 1 to panic the kernel if VT-d can't successfully be enabled
  177 * (used when the kernel is launched with TXT).
 178 */
 179static int force_on = 0;
 180int intel_iommu_tboot_noforce;
 181static int no_platform_optin;
 182
 183#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 184
 185/*
 186 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 187 * if marked present.
 188 */
 189static phys_addr_t root_entry_lctp(struct root_entry *re)
 190{
 191        if (!(re->lo & 1))
 192                return 0;
 193
 194        return re->lo & VTD_PAGE_MASK;
 195}
 196
 197/*
 198 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 199 * if marked present.
 200 */
 201static phys_addr_t root_entry_uctp(struct root_entry *re)
 202{
 203        if (!(re->hi & 1))
 204                return 0;
 205
 206        return re->hi & VTD_PAGE_MASK;
 207}
 208
 209static inline void context_clear_pasid_enable(struct context_entry *context)
 210{
 211        context->lo &= ~(1ULL << 11);
 212}
 213
 214static inline bool context_pasid_enabled(struct context_entry *context)
 215{
 216        return !!(context->lo & (1ULL << 11));
 217}
 218
 219static inline void context_set_copied(struct context_entry *context)
 220{
 221        context->hi |= (1ull << 3);
 222}
 223
 224static inline bool context_copied(struct context_entry *context)
 225{
 226        return !!(context->hi & (1ULL << 3));
 227}
 228
 229static inline bool __context_present(struct context_entry *context)
 230{
 231        return (context->lo & 1);
 232}
 233
 234bool context_present(struct context_entry *context)
 235{
 236        return context_pasid_enabled(context) ?
 237             __context_present(context) :
 238             __context_present(context) && !context_copied(context);
 239}
 240
 241static inline void context_set_present(struct context_entry *context)
 242{
 243        context->lo |= 1;
 244}
 245
 246static inline void context_set_fault_enable(struct context_entry *context)
 247{
 248        context->lo &= (((u64)-1) << 2) | 1;
 249}
 250
 251static inline void context_set_translation_type(struct context_entry *context,
 252                                                unsigned long value)
 253{
 254        context->lo &= (((u64)-1) << 4) | 3;
 255        context->lo |= (value & 3) << 2;
 256}
 257
 258static inline void context_set_address_root(struct context_entry *context,
 259                                            unsigned long value)
 260{
 261        context->lo &= ~VTD_PAGE_MASK;
 262        context->lo |= value & VTD_PAGE_MASK;
 263}
 264
 265static inline void context_set_address_width(struct context_entry *context,
 266                                             unsigned long value)
 267{
 268        context->hi |= value & 7;
 269}
 270
 271static inline void context_set_domain_id(struct context_entry *context,
 272                                         unsigned long value)
 273{
 274        context->hi |= (value & ((1 << 16) - 1)) << 8;
 275}
 276
 277static inline int context_domain_id(struct context_entry *c)
 278{
 279        return((c->hi >> 8) & 0xffff);
 280}
 281
 282static inline void context_clear_entry(struct context_entry *context)
 283{
 284        context->lo = 0;
 285        context->hi = 0;
 286}
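
     /*
      * Layout of a legacy-mode context entry as implied by the helpers
      * above: lo bit 0 is Present, bit 1 Fault Processing Disable (cleared
      * by context_set_fault_enable()), bits 3:2 the Translation Type and
      * bits 63:12 the page-table root; hi bits 2:0 hold the Address Width
      * (AGAW) and bits 23:8 the Domain ID.
      */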
 287
 288/*
  289 * This domain is a static identity mapping domain.
  290 *      1. This domain creates a static 1:1 mapping to all usable memory.
  291 *      2. It maps to each iommu if successful.
  292 *      3. Each iommu maps to this domain if successful.
 293 */
 294static struct dmar_domain *si_domain;
 295static int hw_pass_through = 1;
 296
  297/* si_domain contains multiple devices */
 298#define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
 299
 300/*
 301 * This is a DMA domain allocated through the iommu domain allocation
 302 * interface. But one or more devices belonging to this domain have
  303 * been chosen to use a private domain. We should avoid using the
 304 * map/unmap/iova_to_phys APIs on it.
 305 */
 306#define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
 307
 308#define for_each_domain_iommu(idx, domain)                      \
 309        for (idx = 0; idx < g_num_of_iommus; idx++)             \
 310                if (domain->iommu_refcnt[idx])
 311
 312struct dmar_rmrr_unit {
 313        struct list_head list;          /* list of rmrr units   */
 314        struct acpi_dmar_header *hdr;   /* ACPI header          */
 315        u64     base_address;           /* reserved base address*/
 316        u64     end_address;            /* reserved end address */
 317        struct dmar_dev_scope *devices; /* target devices */
 318        int     devices_cnt;            /* target device count */
 319};
 320
 321struct dmar_atsr_unit {
 322        struct list_head list;          /* list of ATSR units */
 323        struct acpi_dmar_header *hdr;   /* ACPI header */
 324        struct dmar_dev_scope *devices; /* target devices */
 325        int devices_cnt;                /* target device count */
 326        u8 include_all:1;               /* include all ports */
 327};
 328
 329static LIST_HEAD(dmar_atsr_units);
 330static LIST_HEAD(dmar_rmrr_units);
 331
 332#define for_each_rmrr_units(rmrr) \
 333        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 334
  335/* number of IOMMUs in the system; bounds indexing into g_iommus */
 336static int g_num_of_iommus;
 337
 338static void domain_exit(struct dmar_domain *domain);
 339static void domain_remove_dev_info(struct dmar_domain *domain);
 340static void dmar_remove_one_dev_info(struct device *dev);
 341static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 342static void domain_context_clear(struct intel_iommu *iommu,
 343                                 struct device *dev);
 344static int domain_detach_iommu(struct dmar_domain *domain,
 345                               struct intel_iommu *iommu);
 346static bool device_is_rmrr_locked(struct device *dev);
 347static int intel_iommu_attach_device(struct iommu_domain *domain,
 348                                     struct device *dev);
 349
 350#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 351int dmar_disabled = 0;
 352#else
 353int dmar_disabled = 1;
 354#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 355
 356int intel_iommu_sm;
 357int intel_iommu_enabled = 0;
 358EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 359
 360static int dmar_map_gfx = 1;
 361static int dmar_forcedac;
 362static int intel_iommu_strict;
 363static int intel_iommu_superpage = 1;
 364static int iommu_identity_mapping;
 365
 366#define IDENTMAP_ALL            1
 367#define IDENTMAP_GFX            2
 368#define IDENTMAP_AZALIA         4
 369
 370int intel_iommu_gfx_mapped;
 371EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 372
 373#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 374#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
 375static DEFINE_SPINLOCK(device_domain_lock);
 376static LIST_HEAD(device_domain_list);
 377
 378/*
 379 * Iterate over elements in device_domain_list and call the specified
 380 * callback @fn against each element.
 381 */
 382int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 383                                     void *data), void *data)
 384{
 385        int ret = 0;
 386        unsigned long flags;
 387        struct device_domain_info *info;
 388
 389        spin_lock_irqsave(&device_domain_lock, flags);
 390        list_for_each_entry(info, &device_domain_list, global) {
 391                ret = fn(info, data);
 392                if (ret) {
 393                        spin_unlock_irqrestore(&device_domain_lock, flags);
 394                        return ret;
 395                }
 396        }
 397        spin_unlock_irqrestore(&device_domain_lock, flags);
 398
 399        return 0;
 400}
 401
 402const struct iommu_ops intel_iommu_ops;
 403
 404static bool translation_pre_enabled(struct intel_iommu *iommu)
 405{
 406        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 407}
 408
 409static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 410{
 411        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 412}
 413
 414static void init_translation_status(struct intel_iommu *iommu)
 415{
 416        u32 gsts;
 417
 418        gsts = readl(iommu->reg + DMAR_GSTS_REG);
 419        if (gsts & DMA_GSTS_TES)
 420                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 421}
 422
  423/* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
 424static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 425{
 426        return container_of(dom, struct dmar_domain, domain);
 427}
 428
 429static int __init intel_iommu_setup(char *str)
 430{
 431        if (!str)
 432                return -EINVAL;
 433        while (*str) {
 434                if (!strncmp(str, "on", 2)) {
 435                        dmar_disabled = 0;
 436                        pr_info("IOMMU enabled\n");
 437                } else if (!strncmp(str, "off", 3)) {
 438                        dmar_disabled = 1;
 439                        no_platform_optin = 1;
 440                        pr_info("IOMMU disabled\n");
 441                } else if (!strncmp(str, "igfx_off", 8)) {
 442                        dmar_map_gfx = 0;
 443                        pr_info("Disable GFX device mapping\n");
 444                } else if (!strncmp(str, "forcedac", 8)) {
 445                        pr_info("Forcing DAC for PCI devices\n");
 446                        dmar_forcedac = 1;
 447                } else if (!strncmp(str, "strict", 6)) {
 448                        pr_info("Disable batched IOTLB flush\n");
 449                        intel_iommu_strict = 1;
 450                } else if (!strncmp(str, "sp_off", 6)) {
 451                        pr_info("Disable supported super page\n");
 452                        intel_iommu_superpage = 0;
 453                } else if (!strncmp(str, "sm_on", 5)) {
 454                        pr_info("Intel-IOMMU: scalable mode supported\n");
 455                        intel_iommu_sm = 1;
 456                } else if (!strncmp(str, "tboot_noforce", 13)) {
 457                        printk(KERN_INFO
 458                                "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 459                        intel_iommu_tboot_noforce = 1;
 460                }
 461
 462                str += strcspn(str, ",");
 463                while (*str == ',')
 464                        str++;
 465        }
 466        return 0;
 467}
 468__setup("intel_iommu=", intel_iommu_setup);
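
     /*
      * Example (from the parser above): booting with
      * "intel_iommu=on,sm_on,strict" enables the IOMMU, opts in to scalable
      * mode and disables batched IOTLB flushing, while "intel_iommu=off"
      * disables it and overrides the platform opt-in.
      */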
 469
 470static struct kmem_cache *iommu_domain_cache;
 471static struct kmem_cache *iommu_devinfo_cache;
 472
 473static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 474{
 475        struct dmar_domain **domains;
 476        int idx = did >> 8;
 477
 478        domains = iommu->domains[idx];
 479        if (!domains)
 480                return NULL;
 481
 482        return domains[did & 0xff];
 483}
 484
 485static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 486                             struct dmar_domain *domain)
 487{
 488        struct dmar_domain **domains;
 489        int idx = did >> 8;
 490
 491        if (!iommu->domains[idx]) {
 492                size_t size = 256 * sizeof(struct dmar_domain *);
 493                iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 494        }
 495
 496        domains = iommu->domains[idx];
 497        if (WARN_ON(!domains))
 498                return;
 499        else
 500                domains[did & 0xff] = domain;
 501}
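
     /*
      * The two helpers above keep domain pointers in a two-level table:
      * did >> 8 selects one of the 256-entry second-level arrays and
      * did & 0xff the slot within it, so up to 65536 domain IDs can be
      * tracked without one large contiguous allocation.
      */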
 502
 503void *alloc_pgtable_page(int node)
 504{
 505        struct page *page;
 506        void *vaddr = NULL;
 507
 508        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 509        if (page)
 510                vaddr = page_address(page);
 511        return vaddr;
 512}
 513
 514void free_pgtable_page(void *vaddr)
 515{
 516        free_page((unsigned long)vaddr);
 517}
 518
 519static inline void *alloc_domain_mem(void)
 520{
 521        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 522}
 523
 524static void free_domain_mem(void *vaddr)
 525{
 526        kmem_cache_free(iommu_domain_cache, vaddr);
 527}
 528
 529static inline void * alloc_devinfo_mem(void)
 530{
 531        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 532}
 533
 534static inline void free_devinfo_mem(void *vaddr)
 535{
 536        kmem_cache_free(iommu_devinfo_cache, vaddr);
 537}
 538
 539static inline int domain_type_is_si(struct dmar_domain *domain)
 540{
 541        return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 542}
 543
 544static inline int domain_pfn_supported(struct dmar_domain *domain,
 545                                       unsigned long pfn)
 546{
 547        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 548
 549        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 550}
 551
 552static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 553{
 554        unsigned long sagaw;
 555        int agaw = -1;
 556
 557        sagaw = cap_sagaw(iommu->cap);
 558        for (agaw = width_to_agaw(max_gaw);
 559             agaw >= 0; agaw--) {
 560                if (test_bit(agaw, &sagaw))
 561                        break;
 562        }
 563
 564        return agaw;
 565}
 566
 567/*
 568 * Calculate max SAGAW for each iommu.
 569 */
 570int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 571{
 572        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 573}
 574
 575/*
  576 * Calculate agaw for each iommu.
  577 * "SAGAW" may be different across iommus; use a default agaw, and fall
  578 * back to a smaller supported agaw for iommus that can't do the default.
 579 */
 580int iommu_calculate_agaw(struct intel_iommu *iommu)
 581{
 582        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 583}
 584
  585/* This function only returns a single iommu in a domain */
 586struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 587{
 588        int iommu_id;
 589
 590        /* si_domain and vm domain should not get here. */
 591        if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
 592                return NULL;
 593
 594        for_each_domain_iommu(iommu_id, domain)
 595                break;
 596
 597        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 598                return NULL;
 599
 600        return g_iommus[iommu_id];
 601}
 602
 603static void domain_update_iommu_coherency(struct dmar_domain *domain)
 604{
 605        struct dmar_drhd_unit *drhd;
 606        struct intel_iommu *iommu;
 607        bool found = false;
 608        int i;
 609
 610        domain->iommu_coherency = 1;
 611
 612        for_each_domain_iommu(i, domain) {
 613                found = true;
 614                if (!ecap_coherent(g_iommus[i]->ecap)) {
 615                        domain->iommu_coherency = 0;
 616                        break;
 617                }
 618        }
 619        if (found)
 620                return;
 621
 622        /* No hardware attached; use lowest common denominator */
 623        rcu_read_lock();
 624        for_each_active_iommu(iommu, drhd) {
 625                if (!ecap_coherent(iommu->ecap)) {
 626                        domain->iommu_coherency = 0;
 627                        break;
 628                }
 629        }
 630        rcu_read_unlock();
 631}
 632
 633static int domain_update_iommu_snooping(struct intel_iommu *skip)
 634{
 635        struct dmar_drhd_unit *drhd;
 636        struct intel_iommu *iommu;
 637        int ret = 1;
 638
 639        rcu_read_lock();
 640        for_each_active_iommu(iommu, drhd) {
 641                if (iommu != skip) {
 642                        if (!ecap_sc_support(iommu->ecap)) {
 643                                ret = 0;
 644                                break;
 645                        }
 646                }
 647        }
 648        rcu_read_unlock();
 649
 650        return ret;
 651}
 652
 653static int domain_update_iommu_superpage(struct intel_iommu *skip)
 654{
 655        struct dmar_drhd_unit *drhd;
 656        struct intel_iommu *iommu;
 657        int mask = 0xf;
 658
 659        if (!intel_iommu_superpage) {
 660                return 0;
 661        }
 662
 663        /* set iommu_superpage to the smallest common denominator */
 664        rcu_read_lock();
 665        for_each_active_iommu(iommu, drhd) {
 666                if (iommu != skip) {
 667                        mask &= cap_super_page_val(iommu->cap);
 668                        if (!mask)
 669                                break;
 670                }
 671        }
 672        rcu_read_unlock();
 673
 674        return fls(mask);
 675}
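
     /*
      * The capability bits ANDed together above advertise 2MiB, 1GiB, ...
      * superpage support, so fls(mask) is the deepest superpage level that
      * every IOMMU (except @skip) can handle; 0 means 4KiB pages only.
      */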
 676
 677/* Some capabilities may be different across iommus */
 678static void domain_update_iommu_cap(struct dmar_domain *domain)
 679{
 680        domain_update_iommu_coherency(domain);
 681        domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 682        domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 683}
 684
 685struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 686                                         u8 devfn, int alloc)
 687{
 688        struct root_entry *root = &iommu->root_entry[bus];
 689        struct context_entry *context;
 690        u64 *entry;
 691
 692        entry = &root->lo;
 693        if (sm_supported(iommu)) {
 694                if (devfn >= 0x80) {
 695                        devfn -= 0x80;
 696                        entry = &root->hi;
 697                }
 698                devfn *= 2;
 699        }
 700        if (*entry & 1)
 701                context = phys_to_virt(*entry & VTD_PAGE_MASK);
 702        else {
 703                unsigned long phy_addr;
 704                if (!alloc)
 705                        return NULL;
 706
 707                context = alloc_pgtable_page(iommu->node);
 708                if (!context)
 709                        return NULL;
 710
 711                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 712                phy_addr = virt_to_phys((void *)context);
 713                *entry = phy_addr | 1;
 714                __iommu_flush_cache(iommu, entry, sizeof(*entry));
 715        }
 716        return &context[devfn];
 717}
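
     /*
      * Note on the scalable-mode path above: legacy context entries are 16
      * bytes, so one 4KiB context table covers all 256 devfns of a bus.
      * Scalable-mode entries are 32 bytes, so each half of the root entry
      * points to a table covering 128 devfns, hence the switch to root->hi
      * for devfn >= 0x80 and the devfn *= 2.
      */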
 718
 719static int iommu_dummy(struct device *dev)
 720{
 721        return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 722}
 723
 724/**
 725 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 726 *                               sub-hierarchy of a candidate PCI-PCI bridge
 727 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 728 * @bridge: the candidate PCI-PCI bridge
 729 *
 730 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 731 */
 732static bool
 733is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 734{
 735        struct pci_dev *pdev, *pbridge;
 736
 737        if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 738                return false;
 739
 740        pdev = to_pci_dev(dev);
 741        pbridge = to_pci_dev(bridge);
 742
 743        if (pbridge->subordinate &&
 744            pbridge->subordinate->number <= pdev->bus->number &&
 745            pbridge->subordinate->busn_res.end >= pdev->bus->number)
 746                return true;
 747
 748        return false;
 749}
 750
 751static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 752{
 753        struct dmar_drhd_unit *drhd = NULL;
 754        struct intel_iommu *iommu;
 755        struct device *tmp;
 756        struct pci_dev *pdev = NULL;
 757        u16 segment = 0;
 758        int i;
 759
 760        if (iommu_dummy(dev))
 761                return NULL;
 762
 763        if (dev_is_pci(dev)) {
 764                struct pci_dev *pf_pdev;
 765
 766                pdev = to_pci_dev(dev);
 767
 768#ifdef CONFIG_X86
 769                /* VMD child devices currently cannot be handled individually */
 770                if (is_vmd(pdev->bus))
 771                        return NULL;
 772#endif
 773
 774                /* VFs aren't listed in scope tables; we need to look up
 775                 * the PF instead to find the IOMMU. */
 776                pf_pdev = pci_physfn(pdev);
 777                dev = &pf_pdev->dev;
 778                segment = pci_domain_nr(pdev->bus);
 779        } else if (has_acpi_companion(dev))
 780                dev = &ACPI_COMPANION(dev)->dev;
 781
 782        rcu_read_lock();
 783        for_each_active_iommu(iommu, drhd) {
 784                if (pdev && segment != drhd->segment)
 785                        continue;
 786
 787                for_each_active_dev_scope(drhd->devices,
 788                                          drhd->devices_cnt, i, tmp) {
 789                        if (tmp == dev) {
 790                                /* For a VF use its original BDF# not that of the PF
 791                                 * which we used for the IOMMU lookup. Strictly speaking
 792                                 * we could do this for all PCI devices; we only need to
 793                                 * get the BDF# from the scope table for ACPI matches. */
 794                                if (pdev && pdev->is_virtfn)
 795                                        goto got_pdev;
 796
 797                                *bus = drhd->devices[i].bus;
 798                                *devfn = drhd->devices[i].devfn;
 799                                goto out;
 800                        }
 801
 802                        if (is_downstream_to_pci_bridge(dev, tmp))
 803                                goto got_pdev;
 804                }
 805
 806                if (pdev && drhd->include_all) {
 807                got_pdev:
 808                        *bus = pdev->bus->number;
 809                        *devfn = pdev->devfn;
 810                        goto out;
 811                }
 812        }
 813        iommu = NULL;
 814 out:
 815        rcu_read_unlock();
 816
 817        return iommu;
 818}
 819
 820static void domain_flush_cache(struct dmar_domain *domain,
 821                               void *addr, int size)
 822{
 823        if (!domain->iommu_coherency)
 824                clflush_cache_range(addr, size);
 825}
 826
 827static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 828{
 829        struct context_entry *context;
 830        int ret = 0;
 831        unsigned long flags;
 832
 833        spin_lock_irqsave(&iommu->lock, flags);
 834        context = iommu_context_addr(iommu, bus, devfn, 0);
 835        if (context)
 836                ret = context_present(context);
 837        spin_unlock_irqrestore(&iommu->lock, flags);
 838        return ret;
 839}
 840
 841static void free_context_table(struct intel_iommu *iommu)
 842{
 843        int i;
 844        unsigned long flags;
 845        struct context_entry *context;
 846
 847        spin_lock_irqsave(&iommu->lock, flags);
 848        if (!iommu->root_entry) {
 849                goto out;
 850        }
 851        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 852                context = iommu_context_addr(iommu, i, 0, 0);
 853                if (context)
 854                        free_pgtable_page(context);
 855
 856                if (!sm_supported(iommu))
 857                        continue;
 858
 859                context = iommu_context_addr(iommu, i, 0x80, 0);
 860                if (context)
 861                        free_pgtable_page(context);
 862
 863        }
 864        free_pgtable_page(iommu->root_entry);
 865        iommu->root_entry = NULL;
 866out:
 867        spin_unlock_irqrestore(&iommu->lock, flags);
 868}
 869
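     /*
      * Walk (and, when a non-zero *target_level is requested, build) the
      * page table down to the PTE covering @pfn.  With *target_level == 0
      * the walk stops at the first leaf it meets (a superpage or a
      * non-present entry) and the level found is reported back through
      * @target_level; otherwise missing intermediate tables are allocated
      * down to the requested level.
      */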
 870static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 871                                      unsigned long pfn, int *target_level)
 872{
 873        struct dma_pte *parent, *pte;
 874        int level = agaw_to_level(domain->agaw);
 875        int offset;
 876
 877        BUG_ON(!domain->pgd);
 878
 879        if (!domain_pfn_supported(domain, pfn))
 880                /* Address beyond IOMMU's addressing capabilities. */
 881                return NULL;
 882
 883        parent = domain->pgd;
 884
 885        while (1) {
 886                void *tmp_page;
 887
 888                offset = pfn_level_offset(pfn, level);
 889                pte = &parent[offset];
 890                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 891                        break;
 892                if (level == *target_level)
 893                        break;
 894
 895                if (!dma_pte_present(pte)) {
 896                        uint64_t pteval;
 897
 898                        tmp_page = alloc_pgtable_page(domain->nid);
 899
 900                        if (!tmp_page)
 901                                return NULL;
 902
 903                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 904                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 905                        if (cmpxchg64(&pte->val, 0ULL, pteval))
 906                                /* Someone else set it while we were thinking; use theirs. */
 907                                free_pgtable_page(tmp_page);
 908                        else
 909                                domain_flush_cache(domain, pte, sizeof(*pte));
 910                }
 911                if (level == 1)
 912                        break;
 913
 914                parent = phys_to_virt(dma_pte_addr(pte));
 915                level--;
 916        }
 917
 918        if (!*target_level)
 919                *target_level = level;
 920
 921        return pte;
 922}
 923
  924/* return the address's pte at a specific level */
 925static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 926                                         unsigned long pfn,
 927                                         int level, int *large_page)
 928{
 929        struct dma_pte *parent, *pte;
 930        int total = agaw_to_level(domain->agaw);
 931        int offset;
 932
 933        parent = domain->pgd;
 934        while (level <= total) {
 935                offset = pfn_level_offset(pfn, total);
 936                pte = &parent[offset];
 937                if (level == total)
 938                        return pte;
 939
 940                if (!dma_pte_present(pte)) {
 941                        *large_page = total;
 942                        break;
 943                }
 944
 945                if (dma_pte_superpage(pte)) {
 946                        *large_page = total;
 947                        return pte;
 948                }
 949
 950                parent = phys_to_virt(dma_pte_addr(pte));
 951                total--;
 952        }
 953        return NULL;
 954}
 955
  956/* clear last level pte; a tlb flush should follow */
 957static void dma_pte_clear_range(struct dmar_domain *domain,
 958                                unsigned long start_pfn,
 959                                unsigned long last_pfn)
 960{
 961        unsigned int large_page;
 962        struct dma_pte *first_pte, *pte;
 963
 964        BUG_ON(!domain_pfn_supported(domain, start_pfn));
 965        BUG_ON(!domain_pfn_supported(domain, last_pfn));
 966        BUG_ON(start_pfn > last_pfn);
 967
 968        /* we don't need lock here; nobody else touches the iova range */
 969        do {
 970                large_page = 1;
 971                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 972                if (!pte) {
 973                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 974                        continue;
 975                }
 976                do {
 977                        dma_clear_pte(pte);
 978                        start_pfn += lvl_to_nr_pages(large_page);
 979                        pte++;
 980                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 981
 982                domain_flush_cache(domain, first_pte,
 983                                   (void *)pte - (void *)first_pte);
 984
 985        } while (start_pfn && start_pfn <= last_pfn);
 986}
 987
 988static void dma_pte_free_level(struct dmar_domain *domain, int level,
 989                               int retain_level, struct dma_pte *pte,
 990                               unsigned long pfn, unsigned long start_pfn,
 991                               unsigned long last_pfn)
 992{
 993        pfn = max(start_pfn, pfn);
 994        pte = &pte[pfn_level_offset(pfn, level)];
 995
 996        do {
 997                unsigned long level_pfn;
 998                struct dma_pte *level_pte;
 999
1000                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1001                        goto next;
1002
1003                level_pfn = pfn & level_mask(level);
1004                level_pte = phys_to_virt(dma_pte_addr(pte));
1005
1006                if (level > 2) {
1007                        dma_pte_free_level(domain, level - 1, retain_level,
1008                                           level_pte, level_pfn, start_pfn,
1009                                           last_pfn);
1010                }
1011
1012                /*
1013                 * Free the page table if we're below the level we want to
1014                 * retain and the range covers the entire table.
1015                 */
1016                if (level < retain_level && !(start_pfn > level_pfn ||
1017                      last_pfn < level_pfn + level_size(level) - 1)) {
1018                        dma_clear_pte(pte);
1019                        domain_flush_cache(domain, pte, sizeof(*pte));
1020                        free_pgtable_page(level_pte);
1021                }
1022next:
1023                pfn += level_size(level);
1024        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1025}
1026
1027/*
1028 * clear last level (leaf) ptes and free page table pages below the
1029 * level we wish to keep intact.
1030 */
1031static void dma_pte_free_pagetable(struct dmar_domain *domain,
1032                                   unsigned long start_pfn,
1033                                   unsigned long last_pfn,
1034                                   int retain_level)
1035{
1036        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1037        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1038        BUG_ON(start_pfn > last_pfn);
1039
1040        dma_pte_clear_range(domain, start_pfn, last_pfn);
1041
1042        /* We don't need lock here; nobody else touches the iova range */
1043        dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1044                           domain->pgd, 0, start_pfn, last_pfn);
1045
1046        /* free pgd */
1047        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1048                free_pgtable_page(domain->pgd);
1049                domain->pgd = NULL;
1050        }
1051}
1052
1053/* When a page at a given level is being unlinked from its parent, we don't
1054   need to *modify* it at all. All we need to do is make a list of all the
1055   pages which can be freed just as soon as we've flushed the IOTLB and we
1056   know the hardware page-walk will no longer touch them.
1057   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1058   be freed. */
1059static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1060                                            int level, struct dma_pte *pte,
1061                                            struct page *freelist)
1062{
1063        struct page *pg;
1064
1065        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1066        pg->freelist = freelist;
1067        freelist = pg;
1068
1069        if (level == 1)
1070                return freelist;
1071
1072        pte = page_address(pg);
1073        do {
1074                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1075                        freelist = dma_pte_list_pagetables(domain, level - 1,
1076                                                           pte, freelist);
1077                pte++;
1078        } while (!first_pte_in_page(pte));
1079
1080        return freelist;
1081}
1082
1083static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1084                                        struct dma_pte *pte, unsigned long pfn,
1085                                        unsigned long start_pfn,
1086                                        unsigned long last_pfn,
1087                                        struct page *freelist)
1088{
1089        struct dma_pte *first_pte = NULL, *last_pte = NULL;
1090
1091        pfn = max(start_pfn, pfn);
1092        pte = &pte[pfn_level_offset(pfn, level)];
1093
1094        do {
1095                unsigned long level_pfn;
1096
1097                if (!dma_pte_present(pte))
1098                        goto next;
1099
1100                level_pfn = pfn & level_mask(level);
1101
1102                /* If range covers entire pagetable, free it */
1103                if (start_pfn <= level_pfn &&
1104                    last_pfn >= level_pfn + level_size(level) - 1) {
 1105                        /* These subordinate page tables are going away entirely. Don't
1106                           bother to clear them; we're just going to *free* them. */
1107                        if (level > 1 && !dma_pte_superpage(pte))
1108                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1109
1110                        dma_clear_pte(pte);
1111                        if (!first_pte)
1112                                first_pte = pte;
1113                        last_pte = pte;
1114                } else if (level > 1) {
1115                        /* Recurse down into a level that isn't *entirely* obsolete */
1116                        freelist = dma_pte_clear_level(domain, level - 1,
1117                                                       phys_to_virt(dma_pte_addr(pte)),
1118                                                       level_pfn, start_pfn, last_pfn,
1119                                                       freelist);
1120                }
1121next:
1122                pfn += level_size(level);
1123        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1124
1125        if (first_pte)
1126                domain_flush_cache(domain, first_pte,
1127                                   (void *)++last_pte - (void *)first_pte);
1128
1129        return freelist;
1130}
1131
1132/* We can't just free the pages because the IOMMU may still be walking
1133   the page tables, and may have cached the intermediate levels. The
1134   pages can only be freed after the IOTLB flush has been done. */
1135static struct page *domain_unmap(struct dmar_domain *domain,
1136                                 unsigned long start_pfn,
1137                                 unsigned long last_pfn)
1138{
1139        struct page *freelist;
1140
1141        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1142        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1143        BUG_ON(start_pfn > last_pfn);
1144
1145        /* we don't need lock here; nobody else touches the iova range */
1146        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1147                                       domain->pgd, 0, start_pfn, last_pfn, NULL);
1148
1149        /* free pgd */
1150        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1151                struct page *pgd_page = virt_to_page(domain->pgd);
1152                pgd_page->freelist = freelist;
1153                freelist = pgd_page;
1154
1155                domain->pgd = NULL;
1156        }
1157
1158        return freelist;
1159}
1160
1161static void dma_free_pagelist(struct page *freelist)
1162{
1163        struct page *pg;
1164
1165        while ((pg = freelist)) {
1166                freelist = pg->freelist;
1167                free_pgtable_page(page_address(pg));
1168        }
1169}
1170
1171static void iova_entry_free(unsigned long data)
1172{
1173        struct page *freelist = (struct page *)data;
1174
1175        dma_free_pagelist(freelist);
1176}
1177
1178/* iommu handling */
1179static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1180{
1181        struct root_entry *root;
1182        unsigned long flags;
1183
1184        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1185        if (!root) {
1186                pr_err("Allocating root entry for %s failed\n",
1187                        iommu->name);
1188                return -ENOMEM;
1189        }
1190
1191        __iommu_flush_cache(iommu, root, ROOT_SIZE);
1192
1193        spin_lock_irqsave(&iommu->lock, flags);
1194        iommu->root_entry = root;
1195        spin_unlock_irqrestore(&iommu->lock, flags);
1196
1197        return 0;
1198}
1199
1200static void iommu_set_root_entry(struct intel_iommu *iommu)
1201{
1202        u64 addr;
1203        u32 sts;
1204        unsigned long flag;
1205
1206        addr = virt_to_phys(iommu->root_entry);
1207        if (sm_supported(iommu))
1208                addr |= DMA_RTADDR_SMT;
1209
1210        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1211        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1212
1213        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1214
1215        /* Make sure hardware complete it */
1216        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1217                      readl, (sts & DMA_GSTS_RTPS), sts);
1218
1219        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220}
1221
1222void iommu_flush_write_buffer(struct intel_iommu *iommu)
1223{
1224        u32 val;
1225        unsigned long flag;
1226
1227        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1228                return;
1229
1230        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1232
1233        /* Make sure hardware complete it */
1234        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1235                      readl, (!(val & DMA_GSTS_WBFS)), val);
1236
1237        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238}
1239
 1240/* return value determines if we need a write buffer flush */
1241static void __iommu_flush_context(struct intel_iommu *iommu,
1242                                  u16 did, u16 source_id, u8 function_mask,
1243                                  u64 type)
1244{
1245        u64 val = 0;
1246        unsigned long flag;
1247
1248        switch (type) {
1249        case DMA_CCMD_GLOBAL_INVL:
1250                val = DMA_CCMD_GLOBAL_INVL;
1251                break;
1252        case DMA_CCMD_DOMAIN_INVL:
1253                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1254                break;
1255        case DMA_CCMD_DEVICE_INVL:
1256                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1257                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1258                break;
1259        default:
1260                BUG();
1261        }
1262        val |= DMA_CCMD_ICC;
1263
1264        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1265        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1266
1267        /* Make sure hardware complete it */
1268        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1269                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1270
1271        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1272}
1273
 1274/* return value determines if we need a write buffer flush */
1275static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1276                                u64 addr, unsigned int size_order, u64 type)
1277{
1278        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1279        u64 val = 0, val_iva = 0;
1280        unsigned long flag;
1281
1282        switch (type) {
1283        case DMA_TLB_GLOBAL_FLUSH:
 1284                /* global flush doesn't need to set IVA_REG */
1285                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1286                break;
1287        case DMA_TLB_DSI_FLUSH:
1288                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1289                break;
1290        case DMA_TLB_PSI_FLUSH:
1291                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1292                /* IH bit is passed in as part of address */
1293                val_iva = size_order | addr;
1294                break;
1295        default:
1296                BUG();
1297        }
1298        /* Note: set drain read/write */
1299#if 0
1300        /*
 1301         * This is probably only here to be extra safe; it looks like we
 1302         * can ignore it without any impact.
1303         */
1304        if (cap_read_drain(iommu->cap))
1305                val |= DMA_TLB_READ_DRAIN;
1306#endif
1307        if (cap_write_drain(iommu->cap))
1308                val |= DMA_TLB_WRITE_DRAIN;
1309
1310        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1311        /* Note: Only uses first TLB reg currently */
1312        if (val_iva)
1313                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1314        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1315
1316        /* Make sure hardware complete it */
1317        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1318                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1319
1320        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1321
1322        /* check IOTLB invalidation granularity */
1323        if (DMA_TLB_IAIG(val) == 0)
1324                pr_err("Flush IOTLB failed\n");
1325        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1326                pr_debug("TLB flush request %Lx, actual %Lx\n",
1327                        (unsigned long long)DMA_TLB_IIRG(type),
1328                        (unsigned long long)DMA_TLB_IAIG(val));
1329}
1330
1331static struct device_domain_info *
 1332iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1333                         u8 bus, u8 devfn)
1334{
1335        struct device_domain_info *info;
1336
1337        assert_spin_locked(&device_domain_lock);
1338
1339        if (!iommu->qi)
1340                return NULL;
1341
1342        list_for_each_entry(info, &domain->devices, link)
1343                if (info->iommu == iommu && info->bus == bus &&
1344                    info->devfn == devfn) {
1345                        if (info->ats_supported && info->dev)
1346                                return info;
1347                        break;
1348                }
1349
1350        return NULL;
1351}
1352
1353static void domain_update_iotlb(struct dmar_domain *domain)
1354{
1355        struct device_domain_info *info;
1356        bool has_iotlb_device = false;
1357
1358        assert_spin_locked(&device_domain_lock);
1359
1360        list_for_each_entry(info, &domain->devices, link) {
1361                struct pci_dev *pdev;
1362
1363                if (!info->dev || !dev_is_pci(info->dev))
1364                        continue;
1365
1366                pdev = to_pci_dev(info->dev);
1367                if (pdev->ats_enabled) {
1368                        has_iotlb_device = true;
1369                        break;
1370                }
1371        }
1372
1373        domain->has_iotlb_device = has_iotlb_device;
1374}
1375
1376static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1377{
1378        struct pci_dev *pdev;
1379
1380        assert_spin_locked(&device_domain_lock);
1381
1382        if (!info || !dev_is_pci(info->dev))
1383                return;
1384
1385        pdev = to_pci_dev(info->dev);
 1386        /* For IOMMUs that support device IOTLB throttling (DIT), we assign
 1387         * a PFSID to the invalidation desc of a VF so that IOMMU HW can gauge
 1388         * queue depth at the PF level. If DIT is not set, PFSID is treated as
 1389         * reserved and should be set to 0.
1390         */
1391        if (!ecap_dit(info->iommu->ecap))
1392                info->pfsid = 0;
1393        else {
1394                struct pci_dev *pf_pdev;
1395
1396                /* pdev will be returned if device is not a vf */
1397                pf_pdev = pci_physfn(pdev);
1398                info->pfsid = pci_dev_id(pf_pdev);
1399        }
1400
1401#ifdef CONFIG_INTEL_IOMMU_SVM
1402        /* The PCIe spec, in its wisdom, declares that the behaviour of
1403           the device if you enable PASID support after ATS support is
1404           undefined. So always enable PASID support on devices which
1405           have it, even if we can't yet know if we're ever going to
1406           use it. */
1407        if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1408                info->pasid_enabled = 1;
1409
1410        if (info->pri_supported &&
1411            (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1412            !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1413                info->pri_enabled = 1;
1414#endif
1415        if (!pdev->untrusted && info->ats_supported &&
1416            pci_ats_page_aligned(pdev) &&
1417            !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1418                info->ats_enabled = 1;
1419                domain_update_iotlb(info->domain);
1420                info->ats_qdep = pci_ats_queue_depth(pdev);
1421        }
1422}
1423
1424static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1425{
1426        struct pci_dev *pdev;
1427
1428        assert_spin_locked(&device_domain_lock);
1429
1430        if (!dev_is_pci(info->dev))
1431                return;
1432
1433        pdev = to_pci_dev(info->dev);
1434
1435        if (info->ats_enabled) {
1436                pci_disable_ats(pdev);
1437                info->ats_enabled = 0;
1438                domain_update_iotlb(info->domain);
1439        }
1440#ifdef CONFIG_INTEL_IOMMU_SVM
1441        if (info->pri_enabled) {
1442                pci_disable_pri(pdev);
1443                info->pri_enabled = 0;
1444        }
1445        if (info->pasid_enabled) {
1446                pci_disable_pasid(pdev);
1447                info->pasid_enabled = 0;
1448        }
1449#endif
1450}
1451
1452static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1453                                  u64 addr, unsigned mask)
1454{
1455        u16 sid, qdep;
1456        unsigned long flags;
1457        struct device_domain_info *info;
1458
1459        if (!domain->has_iotlb_device)
1460                return;
1461
1462        spin_lock_irqsave(&device_domain_lock, flags);
1463        list_for_each_entry(info, &domain->devices, link) {
1464                if (!info->ats_enabled)
1465                        continue;
1466
1467                sid = info->bus << 8 | info->devfn;
1468                qdep = info->ats_qdep;
1469                qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1470                                qdep, addr, mask);
1471        }
1472        spin_unlock_irqrestore(&device_domain_lock, flags);
1473}
1474
1475static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1476                                  struct dmar_domain *domain,
1477                                  unsigned long pfn, unsigned int pages,
1478                                  int ih, int map)
1479{
1480        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1481        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1482        u16 did = domain->iommu_did[iommu->seq_id];
1483
1484        BUG_ON(pages == 0);
1485
1486        if (ih)
1487                ih = 1 << 6;
1488        /*
 1489         * Fall back to domain-selective flush if there is no PSI support or
 1490         * the size is too big.
 1491         * PSI requires the page size to be 2 ^ x, and the base address to be
 1492         * naturally aligned to the size.
1493         */
1494        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1495                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1496                                                DMA_TLB_DSI_FLUSH);
1497        else
1498                iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1499                                                DMA_TLB_PSI_FLUSH);
1500
1501        /*
 1502         * In caching mode, changes of pages from non-present to present require
 1503         * a flush. However, the device IOTLB needn't be flushed in this case.
1504         */
1505        if (!cap_caching_mode(iommu->cap) || !map)
1506                iommu_flush_dev_iotlb(domain, addr, mask);
1507}
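
     /*
      * Example of the math above: invalidating 3 pages rounds up to 4, so
      * mask == 2 and the naturally aligned 4-page (16KiB) region is
      * invalidated; the IH hint travels in bit 6 of the address, which the
      * page-aligned addr leaves free.
      */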
1508
1509/* Notification for newly created mappings */
1510static inline void __mapping_notify_one(struct intel_iommu *iommu,
1511                                        struct dmar_domain *domain,
1512                                        unsigned long pfn, unsigned int pages)
1513{
1514        /* It's a non-present to present mapping. Only flush if caching mode */
1515        if (cap_caching_mode(iommu->cap))
1516                iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1517        else
1518                iommu_flush_write_buffer(iommu);
1519}
1520
1521static void iommu_flush_iova(struct iova_domain *iovad)
1522{
1523        struct dmar_domain *domain;
1524        int idx;
1525
1526        domain = container_of(iovad, struct dmar_domain, iovad);
1527
1528        for_each_domain_iommu(idx, domain) {
1529                struct intel_iommu *iommu = g_iommus[idx];
1530                u16 did = domain->iommu_did[iommu->seq_id];
1531
1532                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1533
1534                if (!cap_caching_mode(iommu->cap))
1535                        iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1536                                              0, MAX_AGAW_PFN_WIDTH);
1537        }
1538}
1539
1540static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1541{
1542        u32 pmen;
1543        unsigned long flags;
1544
1545        if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1546                return;
1547
1548        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1549        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1550        pmen &= ~DMA_PMEN_EPM;
1551        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1552
1553        /* wait for the protected region status bit to clear */
1554        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1555                readl, !(pmen & DMA_PMEN_PRS), pmen);
1556
1557        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1558}
1559
1560static void iommu_enable_translation(struct intel_iommu *iommu)
1561{
1562        u32 sts;
1563        unsigned long flags;
1564
1565        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566        iommu->gcmd |= DMA_GCMD_TE;
1567        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1568
1569        /* Make sure hardware completes it */
1570        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1571                      readl, (sts & DMA_GSTS_TES), sts);
1572
1573        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1574}
1575
1576static void iommu_disable_translation(struct intel_iommu *iommu)
1577{
1578        u32 sts;
1579        unsigned long flag;
1580
1581        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1582        iommu->gcmd &= ~DMA_GCMD_TE;
1583        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1584
1585        /* Make sure hardware completes it */
1586        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1587                      readl, (!(sts & DMA_GSTS_TES)), sts);
1588
1589        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1590}
1591
1592static int iommu_init_domains(struct intel_iommu *iommu)
1593{
1594        u32 ndomains, nlongs;
1595        size_t size;
1596
1597        ndomains = cap_ndoms(iommu->cap);
1598        pr_debug("%s: Number of Domains supported <%d>\n",
1599                 iommu->name, ndomains);
1600        nlongs = BITS_TO_LONGS(ndomains);
1601
1602        spin_lock_init(&iommu->lock);
1603
1604        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1605        if (!iommu->domain_ids) {
1606                pr_err("%s: Allocating domain id array failed\n",
1607                       iommu->name);
1608                return -ENOMEM;
1609        }
1610
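            /*
             * iommu->domains is a two-level table: ALIGN(ndomains, 256) >> 8
             * chunk pointers, each pointing at a 256-entry array of struct
             * dmar_domain pointers.  Only chunk 0 is allocated up front here;
             * free_dmar_iommu() below frees whichever chunks ended up being
             * allocated.
             */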
1611        size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1612        iommu->domains = kzalloc(size, GFP_KERNEL);
1613
1614        if (iommu->domains) {
1615                size = 256 * sizeof(struct dmar_domain *);
1616                iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1617        }
1618
1619        if (!iommu->domains || !iommu->domains[0]) {
1620                pr_err("%s: Allocating domain array failed\n",
1621                       iommu->name);
1622                kfree(iommu->domain_ids);
1623                kfree(iommu->domains);
1624                iommu->domain_ids = NULL;
1625                iommu->domains    = NULL;
1626                return -ENOMEM;
1627        }
1628
1629        /*
1630         * If Caching mode is set, then invalid translations are tagged
1631         * with domain-id 0, hence we need to pre-allocate it. We also
1632         * use domain-id 0 as a marker for non-allocated domain-id, so
1633         * make sure it is not used for a real domain.
1634         */
1635        set_bit(0, iommu->domain_ids);
1636
1637        /*
1638         * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1639         * entry for first-level or pass-through translation modes should
1640         * be programmed with a domain id different from those used for
1641         * second-level or nested translation. We reserve a domain id for
1642         * this purpose.
1643         */
1644        if (sm_supported(iommu))
1645                set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1646
1647        return 0;
1648}
1649
1650static void disable_dmar_iommu(struct intel_iommu *iommu)
1651{
1652        struct device_domain_info *info, *tmp;
1653        unsigned long flags;
1654
1655        if (!iommu->domains || !iommu->domain_ids)
1656                return;
1657
1658        spin_lock_irqsave(&device_domain_lock, flags);
1659        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1660                if (info->iommu != iommu)
1661                        continue;
1662
1663                if (!info->dev || !info->domain)
1664                        continue;
1665
1666                __dmar_remove_one_dev_info(info);
1667        }
1668        spin_unlock_irqrestore(&device_domain_lock, flags);
1669
1670        if (iommu->gcmd & DMA_GCMD_TE)
1671                iommu_disable_translation(iommu);
1672}
1673
1674static void free_dmar_iommu(struct intel_iommu *iommu)
1675{
1676        if ((iommu->domains) && (iommu->domain_ids)) {
1677                int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1678                int i;
1679
1680                for (i = 0; i < elems; i++)
1681                        kfree(iommu->domains[i]);
1682                kfree(iommu->domains);
1683                kfree(iommu->domain_ids);
1684                iommu->domains = NULL;
1685                iommu->domain_ids = NULL;
1686        }
1687
1688        g_iommus[iommu->seq_id] = NULL;
1689
1690        /* free context mapping */
1691        free_context_table(iommu);
1692
1693#ifdef CONFIG_INTEL_IOMMU_SVM
1694        if (pasid_supported(iommu)) {
1695                if (ecap_prs(iommu->ecap))
1696                        intel_svm_finish_prq(iommu);
1697        }
1698#endif
1699}
1700
1701static struct dmar_domain *alloc_domain(int flags)
1702{
1703        struct dmar_domain *domain;
1704
1705        domain = alloc_domain_mem();
1706        if (!domain)
1707                return NULL;
1708
1709        memset(domain, 0, sizeof(*domain));
1710        domain->nid = NUMA_NO_NODE;
1711        domain->flags = flags;
1712        domain->has_iotlb_device = false;
1713        INIT_LIST_HEAD(&domain->devices);
1714
1715        return domain;
1716}
1717
1718/* Must be called with device_domain_lock and iommu->lock held */
1719static int domain_attach_iommu(struct dmar_domain *domain,
1720                               struct intel_iommu *iommu)
1721{
1722        unsigned long ndomains;
1723        int num;
1724
1725        assert_spin_locked(&device_domain_lock);
1726        assert_spin_locked(&iommu->lock);
1727
1728        domain->iommu_refcnt[iommu->seq_id] += 1;
1729        domain->iommu_count += 1;
1730        if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1731                ndomains = cap_ndoms(iommu->cap);
1732                num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1733
1734                if (num >= ndomains) {
1735                        pr_err("%s: No free domain ids\n", iommu->name);
1736                        domain->iommu_refcnt[iommu->seq_id] -= 1;
1737                        domain->iommu_count -= 1;
1738                        return -ENOSPC;
1739                }
1740
1741                set_bit(num, iommu->domain_ids);
1742                set_iommu_domain(iommu, num, domain);
1743
1744                domain->iommu_did[iommu->seq_id] = num;
1745                domain->nid                      = iommu->node;
1746
1747                domain_update_iommu_cap(domain);
1748        }
1749
1750        return 0;
1751}
1752
1753static int domain_detach_iommu(struct dmar_domain *domain,
1754                               struct intel_iommu *iommu)
1755{
1756        int num, count;
1757
1758        assert_spin_locked(&device_domain_lock);
1759        assert_spin_locked(&iommu->lock);
1760
1761        domain->iommu_refcnt[iommu->seq_id] -= 1;
1762        count = --domain->iommu_count;
1763        if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1764                num = domain->iommu_did[iommu->seq_id];
1765                clear_bit(num, iommu->domain_ids);
1766                set_iommu_domain(iommu, num, NULL);
1767
1768                domain_update_iommu_cap(domain);
1769                domain->iommu_did[iommu->seq_id] = 0;
1770        }
1771
1772        return count;
1773}
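
    /*
     * domain_attach_iommu()/domain_detach_iommu() above keep a per-IOMMU
     * reference count for the domain: the first attach to a given IOMMU
     * allocates a domain id from that IOMMU's domain_ids bitmap and records
     * it in domain->iommu_did[], and the last detach releases the id again.
     * domain_detach_iommu() returns the remaining overall attachment count.
     */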
1774
1775static struct iova_domain reserved_iova_list;
1776static struct lock_class_key reserved_rbtree_key;
1777
1778static int dmar_init_reserved_ranges(void)
1779{
1780        struct pci_dev *pdev = NULL;
1781        struct iova *iova;
1782        int i;
1783
1784        init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1785
1786        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1787                &reserved_rbtree_key);
1788
1789        /* IOAPIC ranges shouldn't be accessed by DMA */
1790        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1791                IOVA_PFN(IOAPIC_RANGE_END));
1792        if (!iova) {
1793                pr_err("Reserve IOAPIC range failed\n");
1794                return -ENODEV;
1795        }
1796
1797        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1798        for_each_pci_dev(pdev) {
1799                struct resource *r;
1800
1801                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1802                        r = &pdev->resource[i];
1803                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1804                                continue;
1805                        iova = reserve_iova(&reserved_iova_list,
1806                                            IOVA_PFN(r->start),
1807                                            IOVA_PFN(r->end));
1808                        if (!iova) {
1809                                pci_err(pdev, "Reserve iova for %pR failed\n", r);
1810                                return -ENODEV;
1811                        }
1812                }
1813        }
1814        return 0;
1815}
1816
1817static void domain_reserve_special_ranges(struct dmar_domain *domain)
1818{
1819        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1820}
1821
1822static inline int guestwidth_to_adjustwidth(int gaw)
1823{
1824        int agaw;
1825        int r = (gaw - 12) % 9;
1826
1827        if (r == 0)
1828                agaw = gaw;
1829        else
1830                agaw = gaw + 9 - r;
1831        if (agaw > 64)
1832                agaw = 64;
1833        return agaw;
1834}
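
    /*
     * For example, guestwidth_to_adjustwidth() rounds the guest address
     * width up to the next width a whole number of 9-bit page-table levels
     * above the 12-bit page offset can cover: gaw == 48 gives r == 0 and is
     * returned unchanged, while gaw == 40 gives r == 1 and is rounded up to
     * agaw == 48.  The result is capped at 64.
     */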
1835
1836static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1837                       int guest_width)
1838{
1839        int adjust_width, agaw;
1840        unsigned long sagaw;
1841        int err;
1842
1843        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1844
1845        err = init_iova_flush_queue(&domain->iovad,
1846                                    iommu_flush_iova, iova_entry_free);
1847        if (err)
1848                return err;
1849
1850        domain_reserve_special_ranges(domain);
1851
1852        /* calculate AGAW */
1853        if (guest_width > cap_mgaw(iommu->cap))
1854                guest_width = cap_mgaw(iommu->cap);
1855        domain->gaw = guest_width;
1856        adjust_width = guestwidth_to_adjustwidth(guest_width);
1857        agaw = width_to_agaw(adjust_width);
1858        sagaw = cap_sagaw(iommu->cap);
1859        if (!test_bit(agaw, &sagaw)) {
1860                /* hardware doesn't support it, choose a bigger one */
1861                pr_debug("Hardware doesn't support agaw %d\n", agaw);
1862                agaw = find_next_bit(&sagaw, 5, agaw);
1863                if (agaw >= 5)
1864                        return -ENODEV;
1865        }
1866        domain->agaw = agaw;
1867
1868        if (ecap_coherent(iommu->ecap))
1869                domain->iommu_coherency = 1;
1870        else
1871                domain->iommu_coherency = 0;
1872
1873        if (ecap_sc_support(iommu->ecap))
1874                domain->iommu_snooping = 1;
1875        else
1876                domain->iommu_snooping = 0;
1877
1878        if (intel_iommu_superpage)
1879                domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1880        else
1881                domain->iommu_superpage = 0;
1882
1883        domain->nid = iommu->node;
1884
1885        /* always allocate the top pgd */
1886        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1887        if (!domain->pgd)
1888                return -ENOMEM;
1889        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1890        return 0;
1891}
1892
1893static void domain_exit(struct dmar_domain *domain)
1894{
1895
1896        /* Remove associated devices and clear attached or cached domains */
1897        domain_remove_dev_info(domain);
1898
1899        /* destroy iovas */
1900        put_iova_domain(&domain->iovad);
1901
1902        if (domain->pgd) {
1903                struct page *freelist;
1904
1905                freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1906                dma_free_pagelist(freelist);
1907        }
1908
1909        free_domain_mem(domain);
1910}
1911
1912/*
1913 * Get the PASID directory size for a scalable mode context entry.
1914 * A value of X in the PDTS field of a scalable mode context entry
1915 * indicates a PASID directory with 2^(X + 7) entries.
1916 */
1917static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1918{
1919        int pds, max_pde;
1920
1921        max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1922        pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1923        if (pds < 7)
1924                return 0;
1925
1926        return pds - 7;
1927}
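
    /*
     * For example, if max_pde (table->max_pasid >> PASID_PDE_SHIFT) is 1024
     * (only bit 10 set), find_first_bit() returns 10 and pds == 10 - 7 == 3,
     * encoding a 2^(3 + 7) == 1024-entry PASID directory.  Anything smaller
     * than 128 entries is clamped to pds == 0, the minimum 128-entry
     * encoding.
     */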
1928
1929/*
1930 * Set the RID_PASID field of a scalable mode context entry. The
1931 * IOMMU hardware will use the PASID value set in this field for
1932 * translating DMA requests that have no PASID.
1933 */
1934static inline void
1935context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1936{
1937        context->hi |= pasid & ((1 << 20) - 1);
1938        context->hi |= (1 << 20);
1939}
1940
1941/*
1942 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1943 * entry.
1944 */
1945static inline void context_set_sm_dte(struct context_entry *context)
1946{
1947        context->lo |= (1 << 2);
1948}
1949
1950/*
1951 * Set the PRE(Page Request Enable) field of a scalable mode context
1952 * entry.
1953 */
1954static inline void context_set_sm_pre(struct context_entry *context)
1955{
1956        context->lo |= (1 << 4);
1957}
1958
1959/* Convert value to context PASID directory size field coding. */
1960#define context_pdts(pds)       (((pds) & 0x7) << 9)
1961
1962static int domain_context_mapping_one(struct dmar_domain *domain,
1963                                      struct intel_iommu *iommu,
1964                                      struct pasid_table *table,
1965                                      u8 bus, u8 devfn)
1966{
1967        u16 did = domain->iommu_did[iommu->seq_id];
1968        int translation = CONTEXT_TT_MULTI_LEVEL;
1969        struct device_domain_info *info = NULL;
1970        struct context_entry *context;
1971        unsigned long flags;
1972        int ret;
1973
1974        WARN_ON(did == 0);
1975
1976        if (hw_pass_through && domain_type_is_si(domain))
1977                translation = CONTEXT_TT_PASS_THROUGH;
1978
1979        pr_debug("Set context mapping for %02x:%02x.%d\n",
1980                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1981
1982        BUG_ON(!domain->pgd);
1983
1984        spin_lock_irqsave(&device_domain_lock, flags);
1985        spin_lock(&iommu->lock);
1986
1987        ret = -ENOMEM;
1988        context = iommu_context_addr(iommu, bus, devfn, 1);
1989        if (!context)
1990                goto out_unlock;
1991
1992        ret = 0;
1993        if (context_present(context))
1994                goto out_unlock;
1995
1996        /*
1997         * For kdump cases, old valid entries may be cached due to the
1998         * in-flight DMA and copied pgtable, but there is no unmapping
1999         * behaviour for them, thus we need an explicit cache flush for
2000         * the newly-mapped device. For kdump, at this point, the device
2001         * is supposed to finish reset at its driver probe stage, so no
2002         * in-flight DMA will exist, and we don't need to worry anymore
2003         * hereafter.
2004         */
2005        if (context_copied(context)) {
2006                u16 did_old = context_domain_id(context);
2007
2008                if (did_old < cap_ndoms(iommu->cap)) {
2009                        iommu->flush.flush_context(iommu, did_old,
2010                                                   (((u16)bus) << 8) | devfn,
2011                                                   DMA_CCMD_MASK_NOBIT,
2012                                                   DMA_CCMD_DEVICE_INVL);
2013                        iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2014                                                 DMA_TLB_DSI_FLUSH);
2015                }
2016        }
2017
2018        context_clear_entry(context);
2019
2020        if (sm_supported(iommu)) {
2021                unsigned long pds;
2022
2023                WARN_ON(!table);
2024
2025                /* Setup the PASID DIR pointer: */
2026                pds = context_get_sm_pds(table);
2027                context->lo = (u64)virt_to_phys(table->table) |
2028                                context_pdts(pds);
2029
2030                /* Setup the RID_PASID field: */
2031                context_set_sm_rid2pasid(context, PASID_RID2PASID);
2032
2033                /*
2034                 * Setup the Device-TLB enable bit and Page request
2035                 * Enable bit:
2036                 */
2037                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2038                if (info && info->ats_supported)
2039                        context_set_sm_dte(context);
2040                if (info && info->pri_supported)
2041                        context_set_sm_pre(context);
2042        } else {
2043                struct dma_pte *pgd = domain->pgd;
2044                int agaw;
2045
2046                context_set_domain_id(context, did);
2047
2048                if (translation != CONTEXT_TT_PASS_THROUGH) {
2049                        /*
2050                         * Skip top levels of the page tables for IOMMUs whose
2051                         * agaw is less than the default. Unnecessary for PT mode.
2052                         */
2053                        for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2054                                ret = -ENOMEM;
2055                                pgd = phys_to_virt(dma_pte_addr(pgd));
2056                                if (!dma_pte_present(pgd))
2057                                        goto out_unlock;
2058                        }
2059
2060                        info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2061                        if (info && info->ats_supported)
2062                                translation = CONTEXT_TT_DEV_IOTLB;
2063                        else
2064                                translation = CONTEXT_TT_MULTI_LEVEL;
2065
2066                        context_set_address_root(context, virt_to_phys(pgd));
2067                        context_set_address_width(context, agaw);
2068                } else {
2069                        /*
2070                         * In pass-through mode, AW must be programmed to
2071                         * indicate the largest AGAW value supported by the
2072                         * hardware, and ASR is ignored by the hardware.
2073                         */
2074                        context_set_address_width(context, iommu->msagaw);
2075                }
2076
2077                context_set_translation_type(context, translation);
2078        }
2079
2080        context_set_fault_enable(context);
2081        context_set_present(context);
2082        domain_flush_cache(domain, context, sizeof(*context));
2083
2084        /*
2085         * It's a non-present to present mapping. If hardware doesn't cache
2086         * non-present entries, we only need to flush the write-buffer. If it
2087         * _does_ cache non-present entries, then it does so in the special
2088         * domain #0, which we have to flush:
2089         */
2090        if (cap_caching_mode(iommu->cap)) {
2091                iommu->flush.flush_context(iommu, 0,
2092                                           (((u16)bus) << 8) | devfn,
2093                                           DMA_CCMD_MASK_NOBIT,
2094                                           DMA_CCMD_DEVICE_INVL);
2095                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2096        } else {
2097                iommu_flush_write_buffer(iommu);
2098        }
2099        iommu_enable_dev_iotlb(info);
2100
2101        ret = 0;
2102
2103out_unlock:
2104        spin_unlock(&iommu->lock);
2105        spin_unlock_irqrestore(&device_domain_lock, flags);
2106
2107        return ret;
2108}
2109
2110struct domain_context_mapping_data {
2111        struct dmar_domain *domain;
2112        struct intel_iommu *iommu;
2113        struct pasid_table *table;
2114};
2115
2116static int domain_context_mapping_cb(struct pci_dev *pdev,
2117                                     u16 alias, void *opaque)
2118{
2119        struct domain_context_mapping_data *data = opaque;
2120
2121        return domain_context_mapping_one(data->domain, data->iommu,
2122                                          data->table, PCI_BUS_NUM(alias),
2123                                          alias & 0xff);
2124}
2125
2126static int
2127domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2128{
2129        struct domain_context_mapping_data data;
2130        struct pasid_table *table;
2131        struct intel_iommu *iommu;
2132        u8 bus, devfn;
2133
2134        iommu = device_to_iommu(dev, &bus, &devfn);
2135        if (!iommu)
2136                return -ENODEV;
2137
2138        table = intel_pasid_get_table(dev);
2139
2140        if (!dev_is_pci(dev))
2141                return domain_context_mapping_one(domain, iommu, table,
2142                                                  bus, devfn);
2143
2144        data.domain = domain;
2145        data.iommu = iommu;
2146        data.table = table;
2147
2148        return pci_for_each_dma_alias(to_pci_dev(dev),
2149                                      &domain_context_mapping_cb, &data);
2150}
2151
2152static int domain_context_mapped_cb(struct pci_dev *pdev,
2153                                    u16 alias, void *opaque)
2154{
2155        struct intel_iommu *iommu = opaque;
2156
2157        return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2158}
2159
2160static int domain_context_mapped(struct device *dev)
2161{
2162        struct intel_iommu *iommu;
2163        u8 bus, devfn;
2164
2165        iommu = device_to_iommu(dev, &bus, &devfn);
2166        if (!iommu)
2167                return -ENODEV;
2168
2169        if (!dev_is_pci(dev))
2170                return device_context_mapped(iommu, bus, devfn);
2171
2172        return !pci_for_each_dma_alias(to_pci_dev(dev),
2173                                       domain_context_mapped_cb, iommu);
2174}
2175
2176/* Returns the number of VT-d pages needed, aligned up to the MM page size */
2177static inline unsigned long aligned_nrpages(unsigned long host_addr,
2178                                            size_t size)
2179{
2180        host_addr &= ~PAGE_MASK;
2181        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2182}
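
    /*
     * For example, with 4KiB MM and VT-d pages, host_addr == 0x1234 and
     * size == 0x2000 give an in-page offset of 0x234, PAGE_ALIGN(0x2234)
     * == 0x3000 and therefore 3 VT-d pages: an 8KiB buffer that is not
     * page aligned straddles three pages.
     */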
2183
2184/* Return largest possible superpage level for a given mapping */
2185static inline int hardware_largepage_caps(struct dmar_domain *domain,
2186                                          unsigned long iov_pfn,
2187                                          unsigned long phy_pfn,
2188                                          unsigned long pages)
2189{
2190        int support, level = 1;
2191        unsigned long pfnmerge;
2192
2193        support = domain->iommu_superpage;
2194
2195        /* To use a large page, the virtual *and* physical addresses
2196           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2197           of them will mean we have to use smaller pages. So just
2198           merge them and check both at once. */
2199        pfnmerge = iov_pfn | phy_pfn;
2200
2201        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2202                pages >>= VTD_STRIDE_SHIFT;
2203                if (!pages)
2204                        break;
2205                pfnmerge >>= VTD_STRIDE_SHIFT;
2206                level++;
2207                support--;
2208        }
2209        return level;
2210}
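
    /*
     * For example, with the usual 9-bit stride (512 entries per level): if
     * iov_pfn and phy_pfn are both multiples of 512, pages >= 512 and
     * domain->iommu_superpage >= 1, the loop above returns level 2, i.e. a
     * 2MiB superpage can be used.  Level 3 (1GiB) additionally requires
     * 512 * 512 alignment and length, and superpage support of at least 2.
     */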
2211
2212static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2213                            struct scatterlist *sg, unsigned long phys_pfn,
2214                            unsigned long nr_pages, int prot)
2215{
2216        struct dma_pte *first_pte = NULL, *pte = NULL;
2217        phys_addr_t uninitialized_var(pteval);
2218        unsigned long sg_res = 0;
2219        unsigned int largepage_lvl = 0;
2220        unsigned long lvl_pages = 0;
2221
2222        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2223
2224        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2225                return -EINVAL;
2226
2227        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2228
2229        if (!sg) {
2230                sg_res = nr_pages;
2231                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2232        }
2233
2234        while (nr_pages > 0) {
2235                uint64_t tmp;
2236
2237                if (!sg_res) {
2238                        unsigned int pgoff = sg->offset & ~PAGE_MASK;
2239
2240                        sg_res = aligned_nrpages(sg->offset, sg->length);
2241                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2242                        sg->dma_length = sg->length;
2243                        pteval = (sg_phys(sg) - pgoff) | prot;
2244                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
2245                }
2246
2247                if (!pte) {
2248                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2249
2250                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2251                        if (!pte)
2252                                return -ENOMEM;
2253                        /* It is a large page */
2254                        if (largepage_lvl > 1) {
2255                                unsigned long nr_superpages, end_pfn;
2256
2257                                pteval |= DMA_PTE_LARGE_PAGE;
2258                                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2259
2260                                nr_superpages = sg_res / lvl_pages;
2261                                end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2262
2263                                /*
2264                                 * Ensure that old small page tables are
2265                                 * removed to make room for superpage(s).
2266                                 * We're adding new large pages, so make sure
2267                                 * we don't remove their parent tables.
2268                                 */
2269                                dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2270                                                       largepage_lvl + 1);
2271                        } else {
2272                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2273                        }
2274
2275                }
2276                /* We don't need a lock here; nobody else
2277                 * touches this IOVA range.
2278                 */
2279                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2280                if (tmp) {
2281                        static int dumps = 5;
2282                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2283                                iov_pfn, tmp, (unsigned long long)pteval);
2284                        if (dumps) {
2285                                dumps--;
2286                                debug_dma_dump_mappings(NULL);
2287                        }
2288                        WARN_ON(1);
2289                }
2290
2291                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2292
2293                BUG_ON(nr_pages < lvl_pages);
2294                BUG_ON(sg_res < lvl_pages);
2295
2296                nr_pages -= lvl_pages;
2297                iov_pfn += lvl_pages;
2298                phys_pfn += lvl_pages;
2299                pteval += lvl_pages * VTD_PAGE_SIZE;
2300                sg_res -= lvl_pages;
2301
2302                /* If the next PTE would be the first in a new page, then we
2303                   need to flush the cache on the entries we've just written.
2304                   And then we'll need to recalculate 'pte', so clear it and
2305                   let it get set again in the if (!pte) block above.
2306
2307                   If we're done (!nr_pages) we need to flush the cache too.
2308
2309                   Also if we've been setting superpages, we may need to
2310                   recalculate 'pte' and switch back to smaller pages for the
2311                   end of the mapping, if the trailing size is not enough to
2312                   use another superpage (i.e. sg_res < lvl_pages). */
2313                pte++;
2314                if (!nr_pages || first_pte_in_page(pte) ||
2315                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2316                        domain_flush_cache(domain, first_pte,
2317                                           (void *)pte - (void *)first_pte);
2318                        pte = NULL;
2319                }
2320
2321                if (!sg_res && nr_pages)
2322                        sg = sg_next(sg);
2323        }
2324        return 0;
2325}
2326
2327static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2328                          struct scatterlist *sg, unsigned long phys_pfn,
2329                          unsigned long nr_pages, int prot)
2330{
2331        int iommu_id, ret;
2332        struct intel_iommu *iommu;
2333
2334        /* Do the real mapping first */
2335        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2336        if (ret)
2337                return ret;
2338
2339        for_each_domain_iommu(iommu_id, domain) {
2340                iommu = g_iommus[iommu_id];
2341                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2342        }
2343
2344        return 0;
2345}
2346
2347static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2348                                    struct scatterlist *sg, unsigned long nr_pages,
2349                                    int prot)
2350{
2351        return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2352}
2353
2354static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2355                                     unsigned long phys_pfn, unsigned long nr_pages,
2356                                     int prot)
2357{
2358        return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2359}
2360
2361static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2362{
2363        unsigned long flags;
2364        struct context_entry *context;
2365        u16 did_old;
2366
2367        if (!iommu)
2368                return;
2369
2370        spin_lock_irqsave(&iommu->lock, flags);
2371        context = iommu_context_addr(iommu, bus, devfn, 0);
2372        if (!context) {
2373                spin_unlock_irqrestore(&iommu->lock, flags);
2374                return;
2375        }
2376        did_old = context_domain_id(context);
2377        context_clear_entry(context);
2378        __iommu_flush_cache(iommu, context, sizeof(*context));
2379        spin_unlock_irqrestore(&iommu->lock, flags);
2380        iommu->flush.flush_context(iommu,
2381                                   did_old,
2382                                   (((u16)bus) << 8) | devfn,
2383                                   DMA_CCMD_MASK_NOBIT,
2384                                   DMA_CCMD_DEVICE_INVL);
2385        iommu->flush.flush_iotlb(iommu,
2386                                 did_old,
2387                                 0,
2388                                 0,
2389                                 DMA_TLB_DSI_FLUSH);
2390}
2391
2392static inline void unlink_domain_info(struct device_domain_info *info)
2393{
2394        assert_spin_locked(&device_domain_lock);
2395        list_del(&info->link);
2396        list_del(&info->global);
2397        if (info->dev)
2398                info->dev->archdata.iommu = NULL;
2399}
2400
2401static void domain_remove_dev_info(struct dmar_domain *domain)
2402{
2403        struct device_domain_info *info, *tmp;
2404        unsigned long flags;
2405
2406        spin_lock_irqsave(&device_domain_lock, flags);
2407        list_for_each_entry_safe(info, tmp, &domain->devices, link)
2408                __dmar_remove_one_dev_info(info);
2409        spin_unlock_irqrestore(&device_domain_lock, flags);
2410}
2411
2412/*
2413 * find_domain
2414 * Note: we use struct device->archdata.iommu to store the info
2415 */
2416static struct dmar_domain *find_domain(struct device *dev)
2417{
2418        struct device_domain_info *info;
2419
2420        if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2421                struct iommu_domain *domain;
2422
2423                dev->archdata.iommu = NULL;
2424                domain = iommu_get_domain_for_dev(dev);
2425                if (domain)
2426                        intel_iommu_attach_device(domain, dev);
2427        }
2428
2429        /* No lock here, assumes no domain exit in normal case */
2430        info = dev->archdata.iommu;
2431
2432        if (likely(info))
2433                return info->domain;
2434        return NULL;
2435}
2436
2437static inline struct device_domain_info *
2438dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2439{
2440        struct device_domain_info *info;
2441
2442        list_for_each_entry(info, &device_domain_list, global)
2443                if (info->iommu->segment == segment && info->bus == bus &&
2444                    info->devfn == devfn)
2445                        return info;
2446
2447        return NULL;
2448}
2449
2450static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2451                                                    int bus, int devfn,
2452                                                    struct device *dev,
2453                                                    struct dmar_domain *domain)
2454{
2455        struct dmar_domain *found = NULL;
2456        struct device_domain_info *info;
2457        unsigned long flags;
2458        int ret;
2459
2460        info = alloc_devinfo_mem();
2461        if (!info)
2462                return NULL;
2463
2464        info->bus = bus;
2465        info->devfn = devfn;
2466        info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2467        info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2468        info->ats_qdep = 0;
2469        info->dev = dev;
2470        info->domain = domain;
2471        info->iommu = iommu;
2472        info->pasid_table = NULL;
2473        info->auxd_enabled = 0;
2474        INIT_LIST_HEAD(&info->auxiliary_domains);
2475
2476        if (dev && dev_is_pci(dev)) {
2477                struct pci_dev *pdev = to_pci_dev(info->dev);
2478
2479                if (!pdev->untrusted &&
2480                    !pci_ats_disabled() &&
2481                    ecap_dev_iotlb_support(iommu->ecap) &&
2482                    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2483                    dmar_find_matched_atsr_unit(pdev))
2484                        info->ats_supported = 1;
2485
2486                if (sm_supported(iommu)) {
2487                        if (pasid_supported(iommu)) {
2488                                int features = pci_pasid_features(pdev);
2489                                if (features >= 0)
2490                                        info->pasid_supported = features | 1;
2491                        }
2492
2493                        if (info->ats_supported && ecap_prs(iommu->ecap) &&
2494                            pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2495                                info->pri_supported = 1;
2496                }
2497        }
2498
2499        spin_lock_irqsave(&device_domain_lock, flags);
2500        if (dev)
2501                found = find_domain(dev);
2502
2503        if (!found) {
2504                struct device_domain_info *info2;
2505                info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2506                if (info2) {
2507                        found      = info2->domain;
2508                        info2->dev = dev;
2509                }
2510        }
2511
2512        if (found) {
2513                spin_unlock_irqrestore(&device_domain_lock, flags);
2514                free_devinfo_mem(info);
2515                /* Caller must free the original domain */
2516                return found;
2517        }
2518
2519        spin_lock(&iommu->lock);
2520        ret = domain_attach_iommu(domain, iommu);
2521        spin_unlock(&iommu->lock);
2522
2523        if (ret) {
2524                spin_unlock_irqrestore(&device_domain_lock, flags);
2525                free_devinfo_mem(info);
2526                return NULL;
2527        }
2528
2529        list_add(&info->link, &domain->devices);
2530        list_add(&info->global, &device_domain_list);
2531        if (dev)
2532                dev->archdata.iommu = info;
2533        spin_unlock_irqrestore(&device_domain_lock, flags);
2534
2535        /* PASID table is mandatory for a PCI device in scalable mode. */
2536        if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2537                ret = intel_pasid_alloc_table(dev);
2538                if (ret) {
2539                        dev_err(dev, "PASID table allocation failed\n");
2540                        dmar_remove_one_dev_info(dev);
2541                        return NULL;
2542                }
2543
2544                /* Setup the PASID entry for requests without PASID: */
2545                spin_lock(&iommu->lock);
2546                if (hw_pass_through && domain_type_is_si(domain))
2547                        ret = intel_pasid_setup_pass_through(iommu, domain,
2548                                        dev, PASID_RID2PASID);
2549                else
2550                        ret = intel_pasid_setup_second_level(iommu, domain,
2551                                        dev, PASID_RID2PASID);
2552                spin_unlock(&iommu->lock);
2553                if (ret) {
2554                        dev_err(dev, "Setup RID2PASID failed\n");
2555                        dmar_remove_one_dev_info(dev);
2556                        return NULL;
2557                }
2558        }
2559
2560        if (dev && domain_context_mapping(domain, dev)) {
2561                dev_err(dev, "Domain context map failed\n");
2562                dmar_remove_one_dev_info(dev);
2563                return NULL;
2564        }
2565
2566        return domain;
2567}
2568
2569static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2570{
2571        *(u16 *)opaque = alias;
2572        return 0;
2573}
2574
2575static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2576{
2577        struct device_domain_info *info;
2578        struct dmar_domain *domain = NULL;
2579        struct intel_iommu *iommu;
2580        u16 dma_alias;
2581        unsigned long flags;
2582        u8 bus, devfn;
2583
2584        iommu = device_to_iommu(dev, &bus, &devfn);
2585        if (!iommu)
2586                return NULL;
2587
2588        if (dev_is_pci(dev)) {
2589                struct pci_dev *pdev = to_pci_dev(dev);
2590
2591                pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2592
2593                spin_lock_irqsave(&device_domain_lock, flags);
2594                info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2595                                                      PCI_BUS_NUM(dma_alias),
2596                                                      dma_alias & 0xff);
2597                if (info) {
2598                        iommu = info->iommu;
2599                        domain = info->domain;
2600                }
2601                spin_unlock_irqrestore(&device_domain_lock, flags);
2602
2603                /* DMA alias already has a domain, use it */
2604                if (info)
2605                        goto out;
2606        }
2607
2608        /* Allocate and initialize new domain for the device */
2609        domain = alloc_domain(0);
2610        if (!domain)
2611                return NULL;
2612        if (domain_init(domain, iommu, gaw)) {
2613                domain_exit(domain);
2614                return NULL;
2615        }
2616
2617out:
2618        return domain;
2619}
2620
2621static struct dmar_domain *set_domain_for_dev(struct device *dev,
2622                                              struct dmar_domain *domain)
2623{
2624        struct intel_iommu *iommu;
2625        struct dmar_domain *tmp;
2626        u16 req_id, dma_alias;
2627        u8 bus, devfn;
2628
2629        iommu = device_to_iommu(dev, &bus, &devfn);
2630        if (!iommu)
2631                return NULL;
2632
2633        req_id = ((u16)bus << 8) | devfn;
2634
2635        if (dev_is_pci(dev)) {
2636                struct pci_dev *pdev = to_pci_dev(dev);
2637
2638                pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2639
2640                /* register PCI DMA alias device */
2641                if (req_id != dma_alias) {
2642                        tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2643                                        dma_alias & 0xff, NULL, domain);
2644
2645                        if (!tmp || tmp != domain)
2646                                return tmp;
2647                }
2648        }
2649
2650        tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2651        if (!tmp || tmp != domain)
2652                return tmp;
2653
2654        return domain;
2655}
2656
2657static int iommu_domain_identity_map(struct dmar_domain *domain,
2658                                     unsigned long long start,
2659                                     unsigned long long end)
2660{
2661        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2662        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2663
2664        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2665                          dma_to_mm_pfn(last_vpfn))) {
2666                pr_err("Reserving iova failed\n");
2667                return -ENOMEM;
2668        }
2669
2670        pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2671        /*
2672         * The RMRR range might overlap with a physical memory range,
2673         * so clear it first.
2674         */
2675        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2676
2677        return __domain_mapping(domain, first_vpfn, NULL,
2678                                first_vpfn, last_vpfn - first_vpfn + 1,
2679                                DMA_PTE_READ|DMA_PTE_WRITE);
2680}
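
    /*
     * For example, identity-mapping the range 0xdd000000 - 0xddffffff maps
     * vPFNs 0xdd000 through 0xddfff (4096 pages with 4KiB VT-d pages) to
     * physical PFNs of the same value, readable and writable.
     */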
2681
2682static int domain_prepare_identity_map(struct device *dev,
2683                                       struct dmar_domain *domain,
2684                                       unsigned long long start,
2685                                       unsigned long long end)
2686{
2687        /* For _hardware_ passthrough, don't bother. But for software
2688           passthrough, we do it anyway -- it may indicate a memory
2689           range which is reserved in E820 and thus didn't get set
2690           up in si_domain to start with */
2691        if (domain == si_domain && hw_pass_through) {
2692                dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2693                         start, end);
2694                return 0;
2695        }
2696
2697        dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2698
2699        if (end < start) {
2700                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2701                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2702                     dmi_get_system_info(DMI_BIOS_VENDOR),
2703                     dmi_get_system_info(DMI_BIOS_VERSION),
2704                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2705                return -EIO;
2706        }
2707
2708        if (end >> agaw_to_width(domain->agaw)) {
2709                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2710                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2711                     agaw_to_width(domain->agaw),
2712                     dmi_get_system_info(DMI_BIOS_VENDOR),
2713                     dmi_get_system_info(DMI_BIOS_VERSION),
2714                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2715                return -EIO;
2716        }
2717
2718        return iommu_domain_identity_map(domain, start, end);
2719}
2720
2721static int md_domain_init(struct dmar_domain *domain, int guest_width);
2722
2723static int __init si_domain_init(int hw)
2724{
2725        struct dmar_rmrr_unit *rmrr;
2726        struct device *dev;
2727        int i, nid, ret;
2728
2729        si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2730        if (!si_domain)
2731                return -EFAULT;
2732
2733        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2734                domain_exit(si_domain);
2735                return -EFAULT;
2736        }
2737
2738        if (hw)
2739                return 0;
2740
2741        for_each_online_node(nid) {
2742                unsigned long start_pfn, end_pfn;
2743                int i;
2744
2745                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2746                        ret = iommu_domain_identity_map(si_domain,
2747                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2748                        if (ret)
2749                                return ret;
2750                }
2751        }
2752
2753        /*
2754         * Normally we use DMA domains for devices which have RMRRs. But we
2755         * relax this requirement for graphics and USB devices. Identity-map
2756         * the RMRRs for graphics and USB devices so that they can use the
2757         * si_domain.
2758         */
2759        for_each_rmrr_units(rmrr) {
2760                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2761                                          i, dev) {
2762                        unsigned long long start = rmrr->base_address;
2763                        unsigned long long end = rmrr->end_address;
2764
2765                        if (device_is_rmrr_locked(dev))
2766                                continue;
2767
2768                        if (WARN_ON(end < start ||
2769                                    end >> agaw_to_width(si_domain->agaw)))
2770                                continue;
2771
2772                        ret = iommu_domain_identity_map(si_domain, start, end);
2773                        if (ret)
2774                                return ret;
2775                }
2776        }
2777
2778        return 0;
2779}
2780
2781static int identity_mapping(struct device *dev)
2782{
2783        struct device_domain_info *info;
2784
2785        info = dev->archdata.iommu;
2786        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2787                return (info->domain == si_domain);
2788
2789        return 0;
2790}
2791
2792static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2793{
2794        struct dmar_domain *ndomain;
2795        struct intel_iommu *iommu;
2796        u8 bus, devfn;
2797
2798        iommu = device_to_iommu(dev, &bus, &devfn);
2799        if (!iommu)
2800                return -ENODEV;
2801
2802        ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2803        if (ndomain != domain)
2804                return -EBUSY;
2805
2806        return 0;
2807}
2808
2809static bool device_has_rmrr(struct device *dev)
2810{
2811        struct dmar_rmrr_unit *rmrr;
2812        struct device *tmp;
2813        int i;
2814
2815        rcu_read_lock();
2816        for_each_rmrr_units(rmrr) {
2817                /*
2818                 * Return TRUE if this RMRR contains the device that
2819                 * is passed in.
2820                 */
2821                for_each_active_dev_scope(rmrr->devices,
2822                                          rmrr->devices_cnt, i, tmp)
2823                        if (tmp == dev ||
2824                            is_downstream_to_pci_bridge(dev, tmp)) {
2825                                rcu_read_unlock();
2826                                return true;
2827                        }
2828        }
2829        rcu_read_unlock();
2830        return false;
2831}
2832
2833/**
2834 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2835 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2836 * @dev: device handle
2837 *
2838 * We assume that PCI USB devices with RMRRs have them largely
2839 * for historical reasons and that the RMRR space is not actively used post
2840 * boot.  This exclusion may change if vendors begin to abuse it.
2841 *
2842 * The same exception is made for graphics devices, with the requirement that
2843 * any use of the RMRR regions will be torn down before assigning the device
2844 * to a guest.
2845 *
2846 * Return: true if the RMRR is relaxable, false otherwise
2847 */
2848static bool device_rmrr_is_relaxable(struct device *dev)
2849{
2850        struct pci_dev *pdev;
2851
2852        if (!dev_is_pci(dev))
2853                return false;
2854
2855        pdev = to_pci_dev(dev);
2856        if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2857                return true;
2858        else
2859                return false;
2860}
2861
2862/*
2863 * There are a couple of cases where we need to restrict the functionality of
2864 * devices associated with RMRRs.  The first is when evaluating a device for
2865 * identity mapping because problems exist when devices are moved in and out
2866 * of domains and their respective RMRR information is lost.  This means that
2867 * a device with associated RMRRs will never be in a "passthrough" domain.
2868 * The second is use of the device through the IOMMU API.  This interface
2869 * expects to have full control of the IOVA space for the device.  We cannot
2870 * satisfy both the requirement that RMRR access is maintained and have an
2871 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2872 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2873 * We therefore prevent devices associated with an RMRR from participating in
2874 * the IOMMU API, which eliminates them from device assignment.
2875 *
2876 * In both cases, devices which have relaxable RMRRs are not concerned by this
2877 * restriction. See device_rmrr_is_relaxable comment.
2878 */
2879static bool device_is_rmrr_locked(struct device *dev)
2880{
2881        if (!device_has_rmrr(dev))
2882                return false;
2883
2884        if (device_rmrr_is_relaxable(dev))
2885                return false;
2886
2887        return true;
2888}
2889
2890/*
2891 * Return the required default domain type for a specific device.
2892 *
2893 * @dev: the device in query
2895 *
2896 * Returns:
2897 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2898 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2899 *  - 0: both identity and dynamic domains work for this device
2900 */
2901static int device_def_domain_type(struct device *dev)
2902{
2903        if (dev_is_pci(dev)) {
2904                struct pci_dev *pdev = to_pci_dev(dev);
2905
2906                if (device_is_rmrr_locked(dev))
2907                        return IOMMU_DOMAIN_DMA;
2908
2909                /*
2910                 * Prevent any device marked as untrusted from getting
2911                 * placed into the static identity mapping domain.
2912                 */
2913                if (pdev->untrusted)
2914                        return IOMMU_DOMAIN_DMA;
2915
2916                if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2917                        return IOMMU_DOMAIN_IDENTITY;
2918
2919                if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2920                        return IOMMU_DOMAIN_IDENTITY;
2921
2922                /*
2923                 * We want to start off with all devices in the 1:1 domain, and
2924                 * take them out later if we find they can't access all of memory.
2925                 *
2926                 * However, we can't do this for PCI devices behind bridges,
2927                 * because all PCI devices behind the same bridge will end up
2928                 * with the same source-id on their transactions.
2929                 *
2930                 * Practically speaking, we can't change things around for these
2931                 * devices at run-time, because we can't be sure there'll be no
2932                 * DMA transactions in flight for any of their siblings.
2933                 *
2934                 * So PCI devices (unless they're on the root bus) as well as
2935                 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2936                 * the 1:1 domain, just in _case_ one of their siblings turns out
2937                 * not to be able to map all of memory.
2938                 */
2939                if (!pci_is_pcie(pdev)) {
2940                        if (!pci_is_root_bus(pdev->bus))
2941                                return IOMMU_DOMAIN_DMA;
2942                        if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2943                                return IOMMU_DOMAIN_DMA;
2944                } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2945                        return IOMMU_DOMAIN_DMA;
2946        } else {
2947                if (device_has_rmrr(dev))
2948                        return IOMMU_DOMAIN_DMA;
2949        }
2950
2951        return (iommu_identity_mapping & IDENTMAP_ALL) ?
2952                        IOMMU_DOMAIN_IDENTITY : 0;
2953}
2954
2955static void intel_iommu_init_qi(struct intel_iommu *iommu)
2956{
2957        /*
2958         * Start from a sane iommu hardware state.
2959         * If queued invalidation was already initialized by us
2960         * (for example, while enabling interrupt remapping), then
2961         * things are already rolling from a sane state.
2962         */
2963        if (!iommu->qi) {
2964                /*
2965                 * Clear any previous faults.
2966                 */
2967                dmar_fault(-1, iommu);
2968                /*
2969                 * Disable queued invalidation if supported and already enabled
2970                 * before OS handover.
2971                 */
2972                dmar_disable_qi(iommu);
2973        }
2974
2975        if (dmar_enable_qi(iommu)) {
2976                /*
2977                 * Queued Invalidate not enabled, use Register Based Invalidate
2978                 */
2979                iommu->flush.flush_context = __iommu_flush_context;
2980                iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2981                pr_info("%s: Using Register based invalidation\n",
2982                        iommu->name);
2983        } else {
2984                iommu->flush.flush_context = qi_flush_context;
2985                iommu->flush.flush_iotlb = qi_flush_iotlb;
2986                pr_info("%s: Using Queued invalidation\n", iommu->name);
2987        }
2988}
2989
2990static int copy_context_table(struct intel_iommu *iommu,
2991                              struct root_entry *old_re,
2992                              struct context_entry **tbl,
2993                              int bus, bool ext)
2994{
2995        int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2996        struct context_entry *new_ce = NULL, ce;
2997        struct context_entry *old_ce = NULL;
2998        struct root_entry re;
2999        phys_addr_t old_ce_phys;
3000
3001        tbl_idx = ext ? bus * 2 : bus;
3002        memcpy(&re, old_re, sizeof(re));
3003
3004        for (devfn = 0; devfn < 256; devfn++) {
3005                /* First calculate the correct index */
3006                idx = (ext ? devfn * 2 : devfn) % 256;
3007
3008                if (idx == 0) {
3009                        /* First save what we may have and clean up */
3010                        if (new_ce) {
3011                                tbl[tbl_idx] = new_ce;
3012                                __iommu_flush_cache(iommu, new_ce,
3013                                                    VTD_PAGE_SIZE);
3014                                pos = 1;
3015                        }
3016
3017                        if (old_ce)
3018                                memunmap(old_ce);
3019
3020                        ret = 0;
3021                        if (devfn < 0x80)
3022                                old_ce_phys = root_entry_lctp(&re);
3023                        else
3024                                old_ce_phys = root_entry_uctp(&re);
3025
3026                        if (!old_ce_phys) {
3027                                if (ext && devfn == 0) {
3028                                        /* No LCTP, try UCTP */
3029                                        devfn = 0x7f;
3030                                        continue;
3031                                } else {
3032                                        goto out;
3033                                }
3034                        }
3035
3036                        ret = -ENOMEM;
3037                        old_ce = memremap(old_ce_phys, PAGE_SIZE,
3038                                        MEMREMAP_WB);
3039                        if (!old_ce)
3040                                goto out;
3041
3042                        new_ce = alloc_pgtable_page(iommu->node);
3043                        if (!new_ce)
3044                                goto out_unmap;
3045
3046                        ret = 0;
3047                }
3048
3049                /* Now copy the context entry */
3050                memcpy(&ce, old_ce + idx, sizeof(ce));
3051
3052                if (!__context_present(&ce))
3053                        continue;
3054
3055                did = context_domain_id(&ce);
3056                if (did >= 0 && did < cap_ndoms(iommu->cap))
3057                        set_bit(did, iommu->domain_ids);
3058
3059                /*
3060                 * We need a marker for copied context entries. This
3061                 * marker needs to work for the old format as well as
3062                 * for extended context entries.
3063                 *
3064                 * Bit 67 of the context entry is used. In the old
3065                 * format this bit is available to software, in the
3066                 * extended format it is the PGE bit, but PGE is ignored
3067                 * by HW if PASIDs are disabled (and thus still
3068                 * available).
3069                 *
3070                 * So disable PASIDs first and then mark the entry
3071                 * copied. This means that we don't copy PASID
3072                 * translations from the old kernel, but this is fine as
3073                 * faults there are not fatal.
3074                 */
3075                context_clear_pasid_enable(&ce);
3076                context_set_copied(&ce);
3077
3078                new_ce[idx] = ce;
3079        }
3080
3081        tbl[tbl_idx + pos] = new_ce;
3082
3083        __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3084
3085out_unmap:
3086        memunmap(old_ce);
3087
3088out:
3089        return ret;
3090}
3091
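/*
 * Copy the root and context tables left behind by the previous kernel
 * (the translation-pre-enabled case, typically kdump).  Bail out if the
 * old and new root-table formats disagree (RTT bit vs. ECS capability),
 * since changing RTT would require disabling translation first.
 */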
3092static int copy_translation_tables(struct intel_iommu *iommu)
3093{
3094        struct context_entry **ctxt_tbls;
3095        struct root_entry *old_rt;
3096        phys_addr_t old_rt_phys;
3097        int ctxt_table_entries;
3098        unsigned long flags;
3099        u64 rtaddr_reg;
3100        int bus, ret;
3101        bool new_ext, ext;
3102
3103        rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3104        ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3105        new_ext    = !!ecap_ecs(iommu->ecap);
3106
3107        /*
3108         * The RTT bit can only be changed when translation is disabled,
3109         * but disabling translation would open a window for data
3110         * corruption. So bail out and don't copy anything if we would
3111         * have to change the bit.
3112         */
3113        if (new_ext != ext)
3114                return -EINVAL;
3115
3116        old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3117        if (!old_rt_phys)
3118                return -EINVAL;
3119
3120        old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3121        if (!old_rt)
3122                return -ENOMEM;
3123
3124        /* This is too big for the stack - allocate it from slab */
3125        ctxt_table_entries = ext ? 512 : 256;
3126        ret = -ENOMEM;
3127        ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3128        if (!ctxt_tbls)
3129                goto out_unmap;
3130
3131        for (bus = 0; bus < 256; bus++) {
3132                ret = copy_context_table(iommu, &old_rt[bus],
3133                                         ctxt_tbls, bus, ext);
3134                if (ret) {
3135                        pr_err("%s: Failed to copy context table for bus %d\n",
3136                                iommu->name, bus);
3137                        continue;
3138                }
3139        }
3140
3141        spin_lock_irqsave(&iommu->lock, flags);
3142
3143        /* Context tables are copied, now write them to the root_entry table */
3144        for (bus = 0; bus < 256; bus++) {
3145                int idx = ext ? bus * 2 : bus;
3146                u64 val;
3147
3148                if (ctxt_tbls[idx]) {
3149                        val = virt_to_phys(ctxt_tbls[idx]) | 1;
3150                        iommu->root_entry[bus].lo = val;
3151                }
3152
3153                if (!ext || !ctxt_tbls[idx + 1])
3154                        continue;
3155
3156                val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3157                iommu->root_entry[bus].hi = val;
3158        }
3159
3160        spin_unlock_irqrestore(&iommu->lock, flags);
3161
3162        kfree(ctxt_tbls);
3163
3164        __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3165
3166        ret = 0;
3167
3168out_unmap:
3169        memunmap(old_rt);
3170
3171        return ret;
3172}
3173
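/*
 * One-time DMAR bring-up: allocate the global IOMMU array, set up
 * queued invalidation, domain IDs and root entries for every DRHD unit,
 * copy translation tables from a previous kernel where applicable,
 * program and flush the hardware, and install the fault interrupt.
 */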
3174static int __init init_dmars(void)
3175{
3176        struct dmar_drhd_unit *drhd;
3177        struct intel_iommu *iommu;
3178        int ret;
3179
3180        /*
3181         * for each drhd
3182         *    allocate root
3183         *    initialize and program root entry to not present
3184         * endfor
3185         */
3186        for_each_drhd_unit(drhd) {
3187                /*
3188                 * No lock is needed: this is only incremented in the
3189                 * single-threaded kernel __init code path; all other
3190                 * accesses are read-only.
3191                 */
3192                if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3193                        g_num_of_iommus++;
3194                        continue;
3195                }
3196                pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3197        }
3198
3199        /* Preallocate enough resources for IOMMU hot-addition */
3200        if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3201                g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3202
3203        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3204                        GFP_KERNEL);
3205        if (!g_iommus) {
3206                pr_err("Allocating global iommu array failed\n");
3207                ret = -ENOMEM;
3208                goto error;
3209        }
3210
3211        for_each_iommu(iommu, drhd) {
3212                if (drhd->ignored) {
3213                        iommu_disable_translation(iommu);
3214                        continue;
3215                }
3216
3217                /*
3218                 * Find the smallest PASID table size supported by any
3219                 * IOMMU in the system; the system pasid table must be no
3220                 * bigger than the smallest one supported.
3221                 */
3222                if (pasid_supported(iommu)) {
3223                        u32 temp = 2 << ecap_pss(iommu->ecap);
3224
3225                        intel_pasid_max_id = min_t(u32, temp,
3226                                                   intel_pasid_max_id);
3227                }
3228
3229                g_iommus[iommu->seq_id] = iommu;
3230
3231                intel_iommu_init_qi(iommu);
3232
3233                ret = iommu_init_domains(iommu);
3234                if (ret)
3235                        goto free_iommu;
3236
3237                init_translation_status(iommu);
3238
3239                if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3240                        iommu_disable_translation(iommu);
3241                        clear_translation_pre_enabled(iommu);
3242                        pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3243                                iommu->name);
3244                }
3245
3246                /*
3247                 * TBD:
3248                 * We could share the same root & context tables
3249                 * among all IOMMUs; this needs to be split out later.
3250                 */
3251                ret = iommu_alloc_root_entry(iommu);
3252                if (ret)
3253                        goto free_iommu;
3254
3255                if (translation_pre_enabled(iommu)) {
3256                        pr_info("Translation already enabled - trying to copy translation structures\n");
3257
3258                        ret = copy_translation_tables(iommu);
3259                        if (ret) {
3260                                /*
3261                                 * We found the IOMMU with translation
3262                                 * enabled - but failed to copy over the
3263                                 * old root-entry table. Try to proceed
3264                                 * by disabling translation now and
3265                                 * allocating a clean root-entry table.
3266                                 * This might cause DMAR faults, but
3267                                 * probably the dump will still succeed.
3268                                 */
3269                                pr_err("Failed to copy translation tables from previous kernel for %s\n",
3270                                       iommu->name);
3271                                iommu_disable_translation(iommu);
3272                                clear_translation_pre_enabled(iommu);
3273                        } else {
3274                                pr_info("Copied translation tables from previous kernel for %s\n",
3275                                        iommu->name);
3276                        }
3277                }
3278
3279                if (!ecap_pass_through(iommu->ecap))
3280                        hw_pass_through = 0;
3281#ifdef CONFIG_INTEL_IOMMU_SVM
3282                if (pasid_supported(iommu))
3283                        intel_svm_init(iommu);
3284#endif
3285        }
3286
3287        /*
3288         * Now that qi is enabled on all iommus, set the root entry and flush
3289         * caches. This is required on some Intel X58 chipsets, otherwise the
3290         * flush_context function will loop forever and the boot hangs.
3291         */
3292        for_each_active_iommu(iommu, drhd) {
3293                iommu_flush_write_buffer(iommu);
3294                iommu_set_root_entry(iommu);
3295                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3296                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3297        }
3298
3299        if (iommu_pass_through)
3300                iommu_identity_mapping |= IDENTMAP_ALL;
3301
3302#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3303        dmar_map_gfx = 0;
3304#endif
3305
3306        if (!dmar_map_gfx)
3307                iommu_identity_mapping |= IDENTMAP_GFX;
3308
3309        check_tylersburg_isoch();
3310
3311        ret = si_domain_init(hw_pass_through);
3312        if (ret)
3313                goto free_iommu;
3314
3315        /*
3316         * for each drhd
3317         *   enable fault log
3318         *   global invalidate context cache
3319         *   global invalidate iotlb
3320         *   enable translation
3321         */
3322        for_each_iommu(iommu, drhd) {
3323                if (drhd->ignored) {
3324                        /*
3325                         * we always have to disable PMRs or DMA may fail on
3326                         * this device
3327                         */
3328                        if (force_on)
3329                                iommu_disable_protect_mem_regions(iommu);
3330                        continue;
3331                }
3332
3333                iommu_flush_write_buffer(iommu);
3334
3335#ifdef CONFIG_INTEL_IOMMU_SVM
3336                if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3337                        /*
3338                         * Call dmar_alloc_hwirq() with dmar_global_lock held,
3339                         * could cause possible lock race condition.
3340                         */
3341                        up_write(&dmar_global_lock);
3342                        ret = intel_svm_enable_prq(iommu);
3343                        down_write(&dmar_global_lock);
3344                        if (ret)
3345                                goto free_iommu;
3346                }
3347#endif
3348                ret = dmar_set_interrupt(iommu);
3349                if (ret)
3350                        goto free_iommu;
3351        }
3352
3353        return 0;
3354
3355free_iommu:
3356        for_each_active_iommu(iommu, drhd) {
3357                disable_dmar_iommu(iommu);
3358                free_dmar_iommu(iommu);
3359        }
3360
3361        kfree(g_iommus);
3362
3363error:
3364        return ret;
3365}
3366
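/*
 * IOVA allocation helper: the request is rounded up to a power of two
 * so the whole size-aligned region is reserved, and (unless forcedac is
 * set) allocation is first attempted below 4GiB before falling back to
 * the device's full DMA mask.
 */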
3367/* This takes a number of _MM_ pages, not VTD pages */
3368static unsigned long intel_alloc_iova(struct device *dev,
3369                                     struct dmar_domain *domain,
3370                                     unsigned long nrpages, uint64_t dma_mask)
3371{
3372        unsigned long iova_pfn;
3373
3374        /* Restrict dma_mask to the width that the iommu can handle */
3375        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3376        /* Ensure we reserve the whole size-aligned region */
3377        nrpages = __roundup_pow_of_two(nrpages);
3378
3379        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3380                /*
3381                 * First try to allocate an I/O virtual address below
3382                 * DMA_BIT_MASK(32); if that fails, fall back to
3383                 * allocating from the higher range.
3384                 */
3385                iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3386                                           IOVA_PFN(DMA_BIT_MASK(32)), false);
3387                if (iova_pfn)
3388                        return iova_pfn;
3389        }
3390        iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3391                                   IOVA_PFN(dma_mask), true);
3392        if (unlikely(!iova_pfn)) {
3393                dev_err(dev, "Allocating %ld-page iova failed\n", nrpages);
3394                return 0;
3395        }
3396
3397        return iova_pfn;
3398}
3399
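/*
 * Allocate a private DMA domain for a device that is not attached to
 * any domain yet, and map any RMRR regions that reference it.  Returns
 * NULL if the device already has a domain or allocation fails.
 */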
3400static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3401{
3402        struct dmar_domain *domain, *tmp;
3403        struct dmar_rmrr_unit *rmrr;
3404        struct device *i_dev;
3405        int i, ret;
3406
3407        /* Device shouldn't be attached by any domains. */
3408        domain = find_domain(dev);
3409        if (domain)
3410                return NULL;
3411
3412        domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3413        if (!domain)
3414                goto out;
3415
3416        /* We have a new domain - setup possible RMRRs for the device */
3417        rcu_read_lock();
3418        for_each_rmrr_units(rmrr) {
3419                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3420                                          i, i_dev) {
3421                        if (i_dev != dev)
3422                                continue;
3423
3424                        ret = domain_prepare_identity_map(dev, domain,
3425                                                          rmrr->base_address,
3426                                                          rmrr->end_address);
3427                        if (ret)
3428                                dev_err(dev, "Mapping reserved region failed\n");
3429                }
3430        }
3431        rcu_read_unlock();
3432
3433        tmp = set_domain_for_dev(dev, domain);
3434        if (!tmp || domain != tmp) {
3435                domain_exit(domain);
3436                domain = tmp;
3437        }
3438
3439out:
3440        if (!domain)
3441                dev_err(dev, "Allocating domain failed\n");
3442        else
3443                domain->domain.type = IOMMU_DOMAIN_DMA;
3444
3445        return domain;
3446}
3447
3448/* Check if the device needs to go through the non-identity map and unmap process. */
3449static bool iommu_need_mapping(struct device *dev)
3450{
3451        int ret;
3452
3453        if (iommu_dummy(dev))
3454                return false;
3455
3456        ret = identity_mapping(dev);
3457        if (ret) {
3458                u64 dma_mask = *dev->dma_mask;
3459
3460                if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3461                        dma_mask = dev->coherent_dma_mask;
3462
3463                if (dma_mask >= dma_get_required_mask(dev))
3464                        return false;
3465
3466                /*
3467                 * The device is limited to 32-bit DMA, so remove it from
3468                 * si_domain and fall back to a non-identity mapping.
3469                 */
3470                dmar_remove_one_dev_info(dev);
3471                ret = iommu_request_dma_domain_for_dev(dev);
3472                if (ret) {
3473                        struct iommu_domain *domain;
3474                        struct dmar_domain *dmar_domain;
3475
3476                        domain = iommu_get_domain_for_dev(dev);
3477                        if (domain) {
3478                                dmar_domain = to_dmar_domain(domain);
3479                                dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3480                        }
3481                        dmar_remove_one_dev_info(dev);
3482                        get_private_domain_for_dev(dev);
3483                }
3484
3485                dev_info(dev, "32bit DMA uses non-identity mapping\n");
3486        }
3487
3488        return true;
3489}
3490
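/*
 * Core single-range mapping path: allocate an IOVA range, build the
 * page-table entries with read/write permissions derived from the DMA
 * direction, and return the bus address (IOVA plus the offset of paddr
 * within its first page).
 */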
3491static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3492                                     size_t size, int dir, u64 dma_mask)
3493{
3494        struct dmar_domain *domain;
3495        phys_addr_t start_paddr;
3496        unsigned long iova_pfn;
3497        int prot = 0;
3498        int ret;
3499        struct intel_iommu *iommu;
3500        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3501
3502        BUG_ON(dir == DMA_NONE);
3503
3504        domain = find_domain(dev);
3505        if (!domain)
3506                return DMA_MAPPING_ERROR;
3507
3508        iommu = domain_get_iommu(domain);
3509        size = aligned_nrpages(paddr, size);
3510
3511        iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3512        if (!iova_pfn)
3513                goto error;
3514
3515        /*
3516         * Check if DMAR supports zero-length reads on write-only
3517         * mappings.
3518         */
3519        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3520                        !cap_zlr(iommu->cap))
3521                prot |= DMA_PTE_READ;
3522        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3523                prot |= DMA_PTE_WRITE;
3524        /*
3525         * The range paddr .. (paddr + size) might cover only part of a
3526         * page, but we should map the whole page.  Note: if two parts of
3527         * one page are mapped separately, we might have two DMA addresses
3528         * mapping to the same host paddr, but this is not a big problem.
3529         */
3530        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3531                                 mm_to_dma_pfn(paddr_pfn), size, prot);
3532        if (ret)
3533                goto error;
3534
3535        start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3536        start_paddr += paddr & ~PAGE_MASK;
3537        return start_paddr;
3538
3539error:
3540        if (iova_pfn)
3541                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3542        dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3543                size, (unsigned long long)paddr, dir);
3544        return DMA_MAPPING_ERROR;
3545}
3546
3547static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3548                                 unsigned long offset, size_t size,
3549                                 enum dma_data_direction dir,
3550                                 unsigned long attrs)
3551{
3552        if (iommu_need_mapping(dev))
3553                return __intel_map_single(dev, page_to_phys(page) + offset,
3554                                size, dir, *dev->dma_mask);
3555        return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3556}
3557
3558static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3559                                     size_t size, enum dma_data_direction dir,
3560                                     unsigned long attrs)
3561{
3562        if (iommu_need_mapping(dev))
3563                return __intel_map_single(dev, phys_addr, size, dir,
3564                                *dev->dma_mask);
3565        return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3566}
3567
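/*
 * Tear down a mapping created by __intel_map_single() or intel_map_sg().
 * The IOTLB is flushed synchronously in strict mode, for untrusted
 * devices, or when no flush queue is available; otherwise the range is
 * queued for deferred (lazy) release.
 */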
3568static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3569{
3570        struct dmar_domain *domain;
3571        unsigned long start_pfn, last_pfn;
3572        unsigned long nrpages;
3573        unsigned long iova_pfn;
3574        struct intel_iommu *iommu;
3575        struct page *freelist;
3576        struct pci_dev *pdev = NULL;
3577
3578        domain = find_domain(dev);
3579        BUG_ON(!domain);
3580
3581        iommu = domain_get_iommu(domain);
3582
3583        iova_pfn = IOVA_PFN(dev_addr);
3584
3585        nrpages = aligned_nrpages(dev_addr, size);
3586        start_pfn = mm_to_dma_pfn(iova_pfn);
3587        last_pfn = start_pfn + nrpages - 1;
3588
3589        if (dev_is_pci(dev))
3590                pdev = to_pci_dev(dev);
3591
3592        dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3593
3594        freelist = domain_unmap(domain, start_pfn, last_pfn);
3595
3596        if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3597                        !has_iova_flush_queue(&domain->iovad)) {
3598                iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3599                                      nrpages, !freelist, 0);
3600                /* free iova */
3601                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3602                dma_free_pagelist(freelist);
3603        } else {
3604                queue_iova(&domain->iovad, iova_pfn, nrpages,
3605                           (unsigned long)freelist);
3606                /*
3607                 * Queue up the release of the unmapped range, saving the
3608                 * roughly 1/6th of CPU time a synchronous iotlb flush costs.
3609                 */
3610        }
3611}
3612
3613static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3614                             size_t size, enum dma_data_direction dir,
3615                             unsigned long attrs)
3616{
3617        if (iommu_need_mapping(dev))
3618                intel_unmap(dev, dev_addr, size);
3619        else
3620                dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3621}
3622
3623static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3624                size_t size, enum dma_data_direction dir, unsigned long attrs)
3625{
3626        if (iommu_need_mapping(dev))
3627                intel_unmap(dev, dev_addr, size);
3628}
3629
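/*
 * Coherent allocations: identity-mapped devices go straight to the
 * direct-mapping allocator.  Otherwise pages come from CMA (when
 * blocking is allowed) or the page allocator, are zeroed, and are then
 * mapped bidirectionally through the IOMMU.
 */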
3630static void *intel_alloc_coherent(struct device *dev, size_t size,
3631                                  dma_addr_t *dma_handle, gfp_t flags,
3632                                  unsigned long attrs)
3633{
3634        struct page *page = NULL;
3635        int order;
3636
3637        if (!iommu_need_mapping(dev))
3638                return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3639
3640        size = PAGE_ALIGN(size);
3641        order = get_order(size);
3642
3643        if (gfpflags_allow_blocking(flags)) {
3644                unsigned int count = size >> PAGE_SHIFT;
3645
3646                page = dma_alloc_from_contiguous(dev, count, order,
3647                                                 flags & __GFP_NOWARN);
3648        }
3649
3650        if (!page)
3651                page = alloc_pages(flags, order);
3652        if (!page)
3653                return NULL;
3654        memset(page_address(page), 0, size);
3655
3656        *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3657                                         DMA_BIDIRECTIONAL,
3658                                         dev->coherent_dma_mask);
3659        if (*dma_handle != DMA_MAPPING_ERROR)
3660                return page_address(page);
3661        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3662                __free_pages(page, order);
3663
3664        return NULL;
3665}
3666
3667static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3668                                dma_addr_t dma_handle, unsigned long attrs)
3669{
3670        int order;
3671        struct page *page = virt_to_page(vaddr);
3672
3673        if (!iommu_need_mapping(dev))
3674                return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3675
3676        size = PAGE_ALIGN(size);
3677        order = get_order(size);
3678
3679        intel_unmap(dev, dma_handle, size);
3680        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3681                __free_pages(page, order);
3682}
3683
3684static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3685                           int nelems, enum dma_data_direction dir,
3686                           unsigned long attrs)
3687{
3688        dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3689        unsigned long nrpages = 0;
3690        struct scatterlist *sg;
3691        int i;
3692
3693        if (!iommu_need_mapping(dev))
3694                return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3695
3696        for_each_sg(sglist, sg, nelems, i) {
3697                nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3698        }
3699
3700        intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3701}
3702
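/*
 * Scatter-gather mapping: the total page count is computed up front, a
 * single contiguous IOVA range is allocated for the whole list, and
 * domain_sg_mapping() fills in the page tables for every segment.  On
 * failure the partial page tables and the IOVA are freed and 0 is
 * returned.
 */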
3703static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3704                        enum dma_data_direction dir, unsigned long attrs)
3705{
3706        int i;
3707        struct dmar_domain *domain;
3708        size_t size = 0;
3709        int prot = 0;
3710        unsigned long iova_pfn;
3711        int ret;
3712        struct scatterlist *sg;
3713        unsigned long start_vpfn;
3714        struct intel_iommu *iommu;
3715
3716        BUG_ON(dir == DMA_NONE);
3717        if (!iommu_need_mapping(dev))
3718                return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3719
3720        domain = find_domain(dev);
3721        if (!domain)
3722                return 0;
3723
3724        iommu = domain_get_iommu(domain);
3725
3726        for_each_sg(sglist, sg, nelems, i)
3727                size += aligned_nrpages(sg->offset, sg->length);
3728
3729        iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3730                                *dev->dma_mask);
3731        if (!iova_pfn) {
3732                sglist->dma_length = 0;
3733                return 0;
3734        }
3735
3736        /*
3737         * Check if DMAR supports zero-length reads on write-only
3738         * mappings.
3739         */
3740        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3741                        !cap_zlr(iommu->cap))
3742                prot |= DMA_PTE_READ;
3743        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3744                prot |= DMA_PTE_WRITE;
3745
3746        start_vpfn = mm_to_dma_pfn(iova_pfn);
3747
3748        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3749        if (unlikely(ret)) {
3750                dma_pte_free_pagetable(domain, start_vpfn,
3751                                       start_vpfn + size - 1,
3752                                       agaw_to_level(domain->agaw) + 1);
3753                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3754                return 0;
3755        }
3756
3757        return nelems;
3758}
3759
3760static const struct dma_map_ops intel_dma_ops = {
3761        .alloc = intel_alloc_coherent,
3762        .free = intel_free_coherent,
3763        .map_sg = intel_map_sg,
3764        .unmap_sg = intel_unmap_sg,
3765        .map_page = intel_map_page,
3766        .unmap_page = intel_unmap_page,
3767        .map_resource = intel_map_resource,
3768        .unmap_resource = intel_unmap_resource,
3769        .dma_supported = dma_direct_supported,
3770};
3771
3772static inline int iommu_domain_cache_init(void)
3773{
3774        int ret = 0;
3775
3776        iommu_domain_cache = kmem_cache_create("iommu_domain",
3777                                         sizeof(struct dmar_domain),
3778                                         0,
3779                                         SLAB_HWCACHE_ALIGN,
3781                                         NULL);
3782        if (!iommu_domain_cache) {
3783                pr_err("Couldn't create iommu_domain cache\n");
3784                ret = -ENOMEM;
3785        }
3786
3787        return ret;
3788}
3789
3790static inline int iommu_devinfo_cache_init(void)
3791{
3792        int ret = 0;
3793
3794        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3795                                         sizeof(struct device_domain_info),
3796                                         0,
3797                                         SLAB_HWCACHE_ALIGN,
3798                                         NULL);
3799        if (!iommu_devinfo_cache) {
3800                pr_err("Couldn't create devinfo cache\n");
3801                ret = -ENOMEM;
3802        }
3803
3804        return ret;
3805}
3806
3807static int __init iommu_init_mempool(void)
3808{
3809        int ret;
3810        ret = iova_cache_get();
3811        if (ret)
3812                return ret;
3813
3814        ret = iommu_domain_cache_init();
3815        if (ret)
3816                goto domain_error;
3817
3818        ret = iommu_devinfo_cache_init();
3819        if (!ret)
3820                return ret;
3821
3822        kmem_cache_destroy(iommu_domain_cache);
3823domain_error:
3824        iova_cache_put();
3825
3826        return -ENOMEM;
3827}
3828
3829static void __init iommu_exit_mempool(void)
3830{
3831        kmem_cache_destroy(iommu_devinfo_cache);
3832        kmem_cache_destroy(iommu_domain_cache);
3833        iova_cache_put();
3834}
3835
3836static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3837{
3838        struct dmar_drhd_unit *drhd;
3839        u32 vtbar;
3840        int rc;
3841
3842        /* We know that this device on this chipset has its own IOMMU.
3843         * If we find it under a different IOMMU, then the BIOS is lying
3844         * to us. Hope that the IOMMU for this device is actually
3845         * disabled, and it needs no translation...
3846         */
3847        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3848        if (rc) {
3849                /* "can't" happen */
3850                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3851                return;
3852        }
3853        vtbar &= 0xffff0000;
3854
3855        /* we know that this iommu should be at offset 0xa000 from vtbar */
3856        drhd = dmar_find_matched_drhd_unit(pdev);
3857        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3858                            TAINT_FIRMWARE_WORKAROUND,
3859                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3860                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3861}
3862DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3863
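/*
 * Mark DRHD units that can be ignored: units whose device scope is
 * empty, and, when gfx mapping is disabled, units that cover nothing
 * but graphics devices - those devices then get the dummy domain so the
 * DMA API bypasses the IOMMU for them.
 */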
3864static void __init init_no_remapping_devices(void)
3865{
3866        struct dmar_drhd_unit *drhd;
3867        struct device *dev;
3868        int i;
3869
3870        for_each_drhd_unit(drhd) {
3871                if (!drhd->include_all) {
3872                        for_each_active_dev_scope(drhd->devices,
3873                                                  drhd->devices_cnt, i, dev)
3874                                break;
3875                        /* ignore DMAR unit if no devices exist */
3876                        if (i == drhd->devices_cnt)
3877                                drhd->ignored = 1;
3878                }
3879        }
3880
3881        for_each_active_drhd_unit(drhd) {
3882                if (drhd->include_all)
3883                        continue;
3884
3885                for_each_active_dev_scope(drhd->devices,
3886                                          drhd->devices_cnt, i, dev)
3887                        if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3888                                break;
3889                if (i < drhd->devices_cnt)
3890                        continue;
3891
3892                /* This IOMMU has *only* gfx devices. If gfx mapping is
3893                   disabled, bypass it and mark its devices as dummy. */
3894                if (!dmar_map_gfx) {
3895                        drhd->ignored = 1;
3896                        for_each_active_dev_scope(drhd->devices,
3897                                                  drhd->devices_cnt, i, dev)
3898                                dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3899                }
3900        }
3901}
3902
3903#ifdef CONFIG_SUSPEND
3904static int init_iommu_hw(void)
3905{
3906        struct dmar_drhd_unit *drhd;
3907        struct intel_iommu *iommu = NULL;
3908
3909        for_each_active_iommu(iommu, drhd)
3910                if (iommu->qi)
3911                        dmar_reenable_qi(iommu);
3912
3913        for_each_iommu(iommu, drhd) {
3914                if (drhd->ignored) {
3915                        /*
3916                         * we always have to disable PMRs or DMA may fail on
3917                         * this device
3918                         */
3919                        if (force_on)
3920                                iommu_disable_protect_mem_regions(iommu);
3921                        continue;
3922                }
3923
3924                iommu_flush_write_buffer(iommu);
3925
3926                iommu_set_root_entry(iommu);
3927
3928                iommu->flush.flush_context(iommu, 0, 0, 0,
3929                                           DMA_CCMD_GLOBAL_INVL);
3930                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3931                iommu_enable_translation(iommu);
3932                iommu_disable_protect_mem_regions(iommu);
3933        }
3934
3935        return 0;
3936}
3937
3938static void iommu_flush_all(void)
3939{
3940        struct dmar_drhd_unit *drhd;
3941        struct intel_iommu *iommu;
3942
3943        for_each_active_iommu(iommu, drhd) {
3944                iommu->flush.flush_context(iommu, 0, 0, 0,
3945                                           DMA_CCMD_GLOBAL_INVL);
3946                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3947                                         DMA_TLB_GLOBAL_FLUSH);
3948        }
3949}
3950
3951static int iommu_suspend(void)
3952{
3953        struct dmar_drhd_unit *drhd;
3954        struct intel_iommu *iommu = NULL;
3955        unsigned long flag;
3956
3957        for_each_active_iommu(iommu, drhd) {
3958                iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3959                                                 GFP_ATOMIC);
3960                if (!iommu->iommu_state)
3961                        goto nomem;
3962        }
3963
3964        iommu_flush_all();
3965
3966        for_each_active_iommu(iommu, drhd) {
3967                iommu_disable_translation(iommu);
3968
3969                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3970
3971                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3972                        readl(iommu->reg + DMAR_FECTL_REG);
3973                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3974                        readl(iommu->reg + DMAR_FEDATA_REG);
3975                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3976                        readl(iommu->reg + DMAR_FEADDR_REG);
3977                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3978                        readl(iommu->reg + DMAR_FEUADDR_REG);
3979
3980                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3981        }
3982        return 0;
3983
3984nomem:
3985        for_each_active_iommu(iommu, drhd)
3986                kfree(iommu->iommu_state);
3987
3988        return -ENOMEM;
3989}
3990
3991static void iommu_resume(void)
3992{
3993        struct dmar_drhd_unit *drhd;
3994        struct intel_iommu *iommu = NULL;
3995        unsigned long flag;
3996
3997        if (init_iommu_hw()) {
3998                if (force_on)
3999                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4000                else
4001                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4002                return;
4003        }
4004
4005        for_each_active_iommu(iommu, drhd) {
4006
4007                raw_spin_lock_irqsave(&iommu->register_lock, flag);
4008
4009                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4010                        iommu->reg + DMAR_FECTL_REG);
4011                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4012                        iommu->reg + DMAR_FEDATA_REG);
4013                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4014                        iommu->reg + DMAR_FEADDR_REG);
4015                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4016                        iommu->reg + DMAR_FEUADDR_REG);
4017
4018                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4019        }
4020
4021        for_each_active_iommu(iommu, drhd)
4022                kfree(iommu->iommu_state);
4023}
4024
4025static struct syscore_ops iommu_syscore_ops = {
4026        .resume         = iommu_resume,
4027        .suspend        = iommu_suspend,
4028};
4029
4030static void __init init_iommu_pm_ops(void)
4031{
4032        register_syscore_ops(&iommu_syscore_ops);
4033}
4034
4035#else
4036static inline void init_iommu_pm_ops(void) {}
4037#endif  /* CONFIG_SUSPEND */
4038
4039int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4040{
4041        struct acpi_dmar_reserved_memory *rmrr;
4042        struct dmar_rmrr_unit *rmrru;
4043
4044        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4045        if (!rmrru)
4046                goto out;
4047
4048        rmrru->hdr = header;
4049        rmrr = (struct acpi_dmar_reserved_memory *)header;
4050        rmrru->base_address = rmrr->base_address;
4051        rmrru->end_address = rmrr->end_address;
4052
4053        rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4054                                ((void *)rmrr) + rmrr->header.length,
4055                                &rmrru->devices_cnt);
4056        if (rmrru->devices_cnt && rmrru->devices == NULL)
4057                goto free_rmrru;
4058
4059        list_add(&rmrru->list, &dmar_rmrr_units);
4060
4061        return 0;
4062free_rmrru:
4063        kfree(rmrru);
4064out:
4065        return -ENOMEM;
4066}
4067
4068static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4069{
4070        struct dmar_atsr_unit *atsru;
4071        struct acpi_dmar_atsr *tmp;
4072
4073        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4074                tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4075                if (atsr->segment != tmp->segment)
4076                        continue;
4077                if (atsr->header.length != tmp->header.length)
4078                        continue;
4079                if (memcmp(atsr, tmp, atsr->header.length) == 0)
4080                        return atsru;
4081        }
4082
4083        return NULL;
4084}
4085
4086int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4087{
4088        struct acpi_dmar_atsr *atsr;
4089        struct dmar_atsr_unit *atsru;
4090
4091        if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4092                return 0;
4093
4094        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4095        atsru = dmar_find_atsr(atsr);
4096        if (atsru)
4097                return 0;
4098
4099        atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4100        if (!atsru)
4101                return -ENOMEM;
4102
4103        /*
4104         * If memory is allocated from slab by ACPI _DSM method, we need to
4105         * copy the memory content because the memory buffer will be freed
4106         * on return.
4107         */
4108        atsru->hdr = (void *)(atsru + 1);
4109        memcpy(atsru->hdr, hdr, hdr->length);
4110        atsru->include_all = atsr->flags & 0x1;
4111        if (!atsru->include_all) {
4112                atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4113                                (void *)atsr + atsr->header.length,
4114                                &atsru->devices_cnt);
4115                if (atsru->devices_cnt && atsru->devices == NULL) {
4116                        kfree(atsru);
4117                        return -ENOMEM;
4118                }
4119        }
4120
4121        list_add_rcu(&atsru->list, &dmar_atsr_units);
4122
4123        return 0;
4124}
4125
4126static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4127{
4128        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4129        kfree(atsru);
4130}
4131
4132int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4133{
4134        struct acpi_dmar_atsr *atsr;
4135        struct dmar_atsr_unit *atsru;
4136
4137        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4138        atsru = dmar_find_atsr(atsr);
4139        if (atsru) {
4140                list_del_rcu(&atsru->list);
4141                synchronize_rcu();
4142                intel_iommu_free_atsr(atsru);
4143        }
4144
4145        return 0;
4146}
4147
4148int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4149{
4150        int i;
4151        struct device *dev;
4152        struct acpi_dmar_atsr *atsr;
4153        struct dmar_atsr_unit *atsru;
4154
4155        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4156        atsru = dmar_find_atsr(atsr);
4157        if (!atsru)
4158                return 0;
4159
4160        if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4161                for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4162                                          i, dev)
4163                        return -EBUSY;
4164        }
4165
4166        return 0;
4167}
4168
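/*
 * Bring up a hot-added DMAR unit: check that it matches the
 * capabilities the rest of the system already relies on (pass-through,
 * snooping, large pages), then initialize domains, the root entry,
 * invalidation and interrupts before enabling translation.
 */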
4169static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4170{
4171        int sp, ret;
4172        struct intel_iommu *iommu = dmaru->iommu;
4173
4174        if (g_iommus[iommu->seq_id])
4175                return 0;
4176
4177        if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4178                pr_warn("%s: Doesn't support hardware pass through.\n",
4179                        iommu->name);
4180                return -ENXIO;
4181        }
4182        if (!ecap_sc_support(iommu->ecap) &&
4183            domain_update_iommu_snooping(iommu)) {
4184                pr_warn("%s: Doesn't support snooping.\n",
4185                        iommu->name);
4186                return -ENXIO;
4187        }
4188        sp = domain_update_iommu_superpage(iommu) - 1;
4189        if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4190                pr_warn("%s: Doesn't support large page.\n",
4191                        iommu->name);
4192                return -ENXIO;
4193        }
4194
4195        /*
4196         * Disable translation if already enabled prior to OS handover.
4197         */
4198        if (iommu->gcmd & DMA_GCMD_TE)
4199                iommu_disable_translation(iommu);
4200
4201        g_iommus[iommu->seq_id] = iommu;
4202        ret = iommu_init_domains(iommu);
4203        if (ret == 0)
4204                ret = iommu_alloc_root_entry(iommu);
4205        if (ret)
4206                goto out;
4207
4208#ifdef CONFIG_INTEL_IOMMU_SVM
4209        if (pasid_supported(iommu))
4210                intel_svm_init(iommu);
4211#endif
4212
4213        if (dmaru->ignored) {
4214                /*
4215                 * we always have to disable PMRs or DMA may fail on this device
4216                 */
4217                if (force_on)
4218                        iommu_disable_protect_mem_regions(iommu);
4219                return 0;
4220        }
4221
4222        intel_iommu_init_qi(iommu);
4223        iommu_flush_write_buffer(iommu);
4224
4225#ifdef CONFIG_INTEL_IOMMU_SVM
4226        if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4227                ret = intel_svm_enable_prq(iommu);
4228                if (ret)
4229                        goto disable_iommu;
4230        }
4231#endif
4232        ret = dmar_set_interrupt(iommu);
4233        if (ret)
4234                goto disable_iommu;
4235
4236        iommu_set_root_entry(iommu);
4237        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4238        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4239        iommu_enable_translation(iommu);
4240
4241        iommu_disable_protect_mem_regions(iommu);
4242        return 0;
4243
4244disable_iommu:
4245        disable_dmar_iommu(iommu);
4246out:
4247        free_dmar_iommu(iommu);
4248        return ret;
4249}
4250
4251int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4252{
4253        int ret = 0;
4254        struct intel_iommu *iommu = dmaru->iommu;
4255
4256        if (!intel_iommu_enabled)
4257                return 0;
4258        if (iommu == NULL)
4259                return -EINVAL;
4260
4261        if (insert) {
4262                ret = intel_iommu_add(dmaru);
4263        } else {
4264                disable_dmar_iommu(iommu);
4265                free_dmar_iommu(iommu);
4266        }
4267
4268        return ret;
4269}
4270
4271static void intel_iommu_free_dmars(void)
4272{
4273        struct dmar_rmrr_unit *rmrru, *rmrr_n;
4274        struct dmar_atsr_unit *atsru, *atsr_n;
4275
4276        list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4277                list_del(&rmrru->list);
4278                dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4279                kfree(rmrru);
4280        }
4281
4282        list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4283                list_del(&atsru->list);
4284                intel_iommu_free_atsr(atsru);
4285        }
4286}
4287
4288int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4289{
4290        int i, ret = 1;
4291        struct pci_bus *bus;
4292        struct pci_dev *bridge = NULL;
4293        struct device *tmp;
4294        struct acpi_dmar_atsr *atsr;
4295        struct dmar_atsr_unit *atsru;
4296
4297        dev = pci_physfn(dev);
4298        for (bus = dev->bus; bus; bus = bus->parent) {
4299                bridge = bus->self;
4300                /* If it's an integrated device, allow ATS */
4301                if (!bridge)
4302                        return 1;
4303                /* Connected via non-PCIe: no ATS */
4304                if (!pci_is_pcie(bridge) ||
4305                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4306                        return 0;
4307                /* If we found the root port, look it up in the ATSR */
4308                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4309                        break;
4310        }
4311
4312        rcu_read_lock();
4313        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4314                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4315                if (atsr->segment != pci_domain_nr(dev->bus))
4316                        continue;
4317
4318                for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4319                        if (tmp == &bridge->dev)
4320                                goto out;
4321
4322                if (atsru->include_all)
4323                        goto out;
4324        }
4325        ret = 0;
4326out:
4327        rcu_read_unlock();
4328
4329        return ret;
4330}
4331
4332int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4333{
4334        int ret;
4335        struct dmar_rmrr_unit *rmrru;
4336        struct dmar_atsr_unit *atsru;
4337        struct acpi_dmar_atsr *atsr;
4338        struct acpi_dmar_reserved_memory *rmrr;
4339
4340        if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4341                return 0;
4342
4343        list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4344                rmrr = container_of(rmrru->hdr,
4345                                    struct acpi_dmar_reserved_memory, header);
4346                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4347                        ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4348                                ((void *)rmrr) + rmrr->header.length,
4349                                rmrr->segment, rmrru->devices,
4350                                rmrru->devices_cnt);
4351                        if (ret < 0)
4352                                return ret;
4353                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4354                        dmar_remove_dev_scope(info, rmrr->segment,
4355                                rmrru->devices, rmrru->devices_cnt);
4356                }
4357        }
4358
4359        list_for_each_entry(atsru, &dmar_atsr_units, list) {
4360                if (atsru->include_all)
4361                        continue;
4362
4363                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4364                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4365                        ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4366                                        (void *)atsr + atsr->header.length,
4367                                        atsr->segment, atsru->devices,
4368                                        atsru->devices_cnt);
4369                        if (ret > 0)
4370                                break;
4371                        else if (ret < 0)
4372                                return ret;
4373                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4374                        if (dmar_remove_dev_scope(info, atsr->segment,
4375                                        atsru->devices, atsru->devices_cnt))
4376                                break;
4377                }
4378        }
4379
4380        return 0;
4381}
4382
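/*
 * Memory hotplug notifier: extend the static identity map when memory
 * goes online, and tear down the corresponding IOVAs and page tables
 * (flushing the IOTLB on every active IOMMU) when memory goes offline.
 */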
4383static int intel_iommu_memory_notifier(struct notifier_block *nb,
4384                                       unsigned long val, void *v)
4385{
4386        struct memory_notify *mhp = v;
4387        unsigned long long start, end;
4388        unsigned long start_vpfn, last_vpfn;
4389
4390        switch (val) {
4391        case MEM_GOING_ONLINE:
4392                start = mhp->start_pfn << PAGE_SHIFT;
4393                end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4394                if (iommu_domain_identity_map(si_domain, start, end)) {
4395                        pr_warn("Failed to build identity map for [%llx-%llx]\n",
4396                                start, end);
4397                        return NOTIFY_BAD;
4398                }
4399                break;
4400
4401        case MEM_OFFLINE:
4402        case MEM_CANCEL_ONLINE:
4403                start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4404                last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4405                while (start_vpfn <= last_vpfn) {
4406                        struct iova *iova;
4407                        struct dmar_drhd_unit *drhd;
4408                        struct intel_iommu *iommu;
4409                        struct page *freelist;
4410
4411                        iova = find_iova(&si_domain->iovad, start_vpfn);
4412                        if (iova == NULL) {
4413                                pr_debug("Failed get IOVA for PFN %lx\n",
4414                                         start_vpfn);
4415                                break;
4416                        }
4417
4418                        iova = split_and_remove_iova(&si_domain->iovad, iova,
4419                                                     start_vpfn, last_vpfn);
4420                        if (iova == NULL) {
4421                                pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4422                                        start_vpfn, last_vpfn);
4423                                return NOTIFY_BAD;
4424                        }
4425
4426                        freelist = domain_unmap(si_domain, iova->pfn_lo,
4427                                               iova->pfn_hi);
4428
4429                        rcu_read_lock();
4430                        for_each_active_iommu(iommu, drhd)
4431                                iommu_flush_iotlb_psi(iommu, si_domain,
4432                                        iova->pfn_lo, iova_size(iova),
4433                                        !freelist, 0);
4434                        rcu_read_unlock();
4435                        dma_free_pagelist(freelist);
4436
4437                        start_vpfn = iova->pfn_hi + 1;
4438                        free_iova_mem(iova);
4439                }
4440                break;
4441        }
4442
4443        return NOTIFY_OK;
4444}
4445
4446static struct notifier_block intel_iommu_memory_nb = {
4447        .notifier_call = intel_iommu_memory_notifier,
4448        .priority = 0
4449};
4450
4451static void free_all_cpu_cached_iovas(unsigned int cpu)
4452{
4453        int i;
4454
4455        for (i = 0; i < g_num_of_iommus; i++) {
4456                struct intel_iommu *iommu = g_iommus[i];
4457                struct dmar_domain *domain;
4458                int did;
4459
4460                if (!iommu)
4461                        continue;
4462
4463                for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4464                        domain = get_iommu_domain(iommu, (u16)did);
4465
4466                        if (!domain)
4467                                continue;
4468                        free_cpu_cached_iovas(cpu, &domain->iovad);
4469                }
4470        }
4471}
4472
4473static int intel_iommu_cpu_dead(unsigned int cpu)
4474{
4475        free_all_cpu_cached_iovas(cpu);
4476        return 0;
4477}
4478
4479static void intel_disable_iommus(void)
4480{
4481        struct intel_iommu *iommu = NULL;
4482        struct dmar_drhd_unit *drhd;
4483
4484        for_each_iommu(iommu, drhd)
4485                iommu_disable_translation(iommu);
4486}
4487
4488static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4489{
4490        struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4491
4492        return container_of(iommu_dev, struct intel_iommu, iommu);
4493}
4494
4495static ssize_t intel_iommu_show_version(struct device *dev,
4496                                        struct device_attribute *attr,
4497                                        char *buf)
4498{
4499        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4500        u32 ver = readl(iommu->reg + DMAR_VER_REG);
4501        return sprintf(buf, "%d:%d\n",
4502                       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4503}
4504static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4505
4506static ssize_t intel_iommu_show_address(struct device *dev,
4507                                        struct device_attribute *attr,
4508                                        char *buf)
4509{
4510        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4511        return sprintf(buf, "%llx\n", iommu->reg_phys);
4512}
4513static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4514
4515static ssize_t intel_iommu_show_cap(struct device *dev,
4516                                    struct device_attribute *attr,
4517                                    char *buf)
4518{
4519        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4520        return sprintf(buf, "%llx\n", iommu->cap);
4521}
4522static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4523
4524static ssize_t intel_iommu_show_ecap(struct device *dev,
4525                                    struct device_attribute *attr,
4526                                    char *buf)
4527{
4528        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4529        return sprintf(buf, "%llx\n", iommu->ecap);
4530}
4531static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4532
4533static ssize_t intel_iommu_show_ndoms(struct device *dev,
4534                                      struct device_attribute *attr,
4535                                      char *buf)
4536{
4537        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4538        return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4539}
4540static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4541
4542static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4543                                           struct device_attribute *attr,
4544                                           char *buf)
4545{
4546        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4547        return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4548                                                  cap_ndoms(iommu->cap)));
4549}
4550static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4551
4552static struct attribute *intel_iommu_attrs[] = {
4553        &dev_attr_version.attr,
4554        &dev_attr_address.attr,
4555        &dev_attr_cap.attr,
4556        &dev_attr_ecap.attr,
4557        &dev_attr_domains_supported.attr,
4558        &dev_attr_domains_used.attr,
4559        NULL,
4560};
4561
4562static struct attribute_group intel_iommu_group = {
4563        .name = "intel-iommu",
4564        .attrs = intel_iommu_attrs,
4565};
4566
4567const struct attribute_group *intel_iommu_groups[] = {
4568        &intel_iommu_group,
4569        NULL,
4570};
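/*
 * Illustrative userspace sketch (not part of this file): the attribute
 * group above is named "intel-iommu" and hangs off each IOMMU's sysfs
 * device, so for a unit registered as "dmar0" the version attribute is
 * expected at a path like /sys/class/iommu/dmar0/intel-iommu/version.
 * The exact path and unit name are assumptions for illustration only.
 */
#include <stdio.h>

int main(void)
{
        char buf[64];
        FILE *f = fopen("/sys/class/iommu/dmar0/intel-iommu/version", "r");

        if (!f)
                return 1;
        if (fgets(buf, sizeof(buf), f))
                printf("VT-d architecture version: %s", buf);   /* e.g. "1:0" */
        fclose(f);
        return 0;
}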
4571
4572static int __init platform_optin_force_iommu(void)
4573{
4574        struct pci_dev *pdev = NULL;
4575        bool has_untrusted_dev = false;
4576
4577        if (!dmar_platform_optin() || no_platform_optin)
4578                return 0;
4579
4580        for_each_pci_dev(pdev) {
4581                if (pdev->untrusted) {
4582                        has_untrusted_dev = true;
4583                        break;
4584                }
4585        }
4586
4587        if (!has_untrusted_dev)
4588                return 0;
4589
4590        if (no_iommu || dmar_disabled)
4591                pr_info("Intel-IOMMU force enabled due to platform opt-in\n");
4592
4593        /*
4594         * If Intel-IOMMU is disabled by default, we will apply the identity
4595         * map for all devices except those marked as untrusted.
4596         */
4597        if (dmar_disabled)
4598                iommu_identity_mapping |= IDENTMAP_ALL;
4599
4600        dmar_disabled = 0;
4601#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4602        swiotlb = 0;
4603#endif
4604        no_iommu = 0;
4605
4606        return 1;
4607}
4608
4609static int __init probe_acpi_namespace_devices(void)
4610{
4611        struct dmar_drhd_unit *drhd;
4612        /* To avoid a -Wunused-but-set-variable warning. */
4613        struct intel_iommu *iommu __maybe_unused;
4614        struct device *dev;
4615        int i, ret = 0;
4616
4617        for_each_active_iommu(iommu, drhd) {
4618                for_each_active_dev_scope(drhd->devices,
4619                                          drhd->devices_cnt, i, dev) {
4620                        struct acpi_device_physical_node *pn;
4621                        struct iommu_group *group;
4622                        struct acpi_device *adev;
4623
4624                        if (dev->bus != &acpi_bus_type)
4625                                continue;
4626
4627                        adev = to_acpi_device(dev);
4628                        mutex_lock(&adev->physical_node_lock);
4629                        list_for_each_entry(pn,
4630                                            &adev->physical_node_list, node) {
4631                                group = iommu_group_get(pn->dev);
4632                                if (group) {
4633                                        iommu_group_put(group);
4634                                        continue;
4635                                }
4636
4637                                pn->dev->bus->iommu_ops = &intel_iommu_ops;
4638                                ret = iommu_probe_device(pn->dev);
4639                                if (ret)
4640                                        break;
4641                        }
4642                        mutex_unlock(&adev->physical_node_lock);
4643
4644                        if (ret)
4645                                return ret;
4646                }
4647        }
4648
4649        return 0;
4650}
4651
4652int __init intel_iommu_init(void)
4653{
4654        int ret = -ENODEV;
4655        struct dmar_drhd_unit *drhd;
4656        struct intel_iommu *iommu;
4657
4658        /*
4659         * Intel IOMMU is required for a TXT/tboot launch or platform
4660         * opt in, so enforce that.
4661         */
4662        force_on = tboot_force_iommu() || platform_optin_force_iommu();
4663
4664        if (iommu_init_mempool()) {
4665                if (force_on)
4666                        panic("tboot: Failed to initialize iommu memory\n");
4667                return -ENOMEM;
4668        }
4669
4670        down_write(&dmar_global_lock);
4671        if (dmar_table_init()) {
4672                if (force_on)
4673                        panic("tboot: Failed to initialize DMAR table\n");
4674                goto out_free_dmar;
4675        }
4676
4677        if (dmar_dev_scope_init() < 0) {
4678                if (force_on)
4679                        panic("tboot: Failed to initialize DMAR device scope\n");
4680                goto out_free_dmar;
4681        }
4682
4683        up_write(&dmar_global_lock);
4684
4685        /*
4686         * The bus notifier takes the dmar_global_lock, so lockdep will
4687         * complain later when we register it under the lock.
4688         */
4689        dmar_register_bus_notifier();
4690
4691        down_write(&dmar_global_lock);
4692
4693        if (no_iommu || dmar_disabled) {
4694                /*
4695                 * We exit the function here to ensure the IOMMU's remapping
4696                 * and mempool aren't set up, which means that the IOMMU's
4697                 * PMRs won't be disabled via the call to init_dmars(). So
4698                 * disable them explicitly here. The PMRs were set up by
4699                 * tboot prior to calling SENTER, but the kernel is expected
4700                 * to reset/tear down the PMRs.
4701                 */
4702                if (intel_iommu_tboot_noforce) {
4703                        for_each_iommu(iommu, drhd)
4704                                iommu_disable_protect_mem_regions(iommu);
4705                }
4706
4707                /*
4708                 * Make sure the IOMMUs are switched off, even when we
4709                 * boot into a kexec kernel and the previous kernel left
4710                 * them enabled.
4711                 */
4712                intel_disable_iommus();
4713                goto out_free_dmar;
4714        }
4715
4716        if (list_empty(&dmar_rmrr_units))
4717                pr_info("No RMRR found\n");
4718
4719        if (list_empty(&dmar_atsr_units))
4720                pr_info("No ATSR found\n");
4721
4722        if (dmar_init_reserved_ranges()) {
4723                if (force_on)
4724                        panic("tboot: Failed to reserve iommu ranges\n");
4725                goto out_free_reserved_range;
4726        }
4727
4728        if (dmar_map_gfx)
4729                intel_iommu_gfx_mapped = 1;
4730
4731        init_no_remapping_devices();
4732
4733        ret = init_dmars();
4734        if (ret) {
4735                if (force_on)
4736                        panic("tboot: Failed to initialize DMARs\n");
4737                pr_err("Initialization failed\n");
4738                goto out_free_reserved_range;
4739        }
4740        up_write(&dmar_global_lock);
4741
4742#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4743        swiotlb = 0;
4744#endif
4745        dma_ops = &intel_dma_ops;
4746
4747        init_iommu_pm_ops();
4748
4749        for_each_active_iommu(iommu, drhd) {
4750                iommu_device_sysfs_add(&iommu->iommu, NULL,
4751                                       intel_iommu_groups,
4752                                       "%s", iommu->name);
4753                iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4754                iommu_device_register(&iommu->iommu);
4755        }
4756
4757        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4758        if (si_domain && !hw_pass_through)
4759                register_memory_notifier(&intel_iommu_memory_nb);
4760        cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4761                          intel_iommu_cpu_dead);
4762
4763        down_read(&dmar_global_lock);
4764        if (probe_acpi_namespace_devices())
4765                pr_warn("ACPI namespace devices didn't probe correctly\n");
4766        up_read(&dmar_global_lock);
4767
4768        /* Finally, we enable the DMA remapping hardware. */
4769        for_each_iommu(iommu, drhd) {
4770                if (!drhd->ignored && !translation_pre_enabled(iommu))
4771                        iommu_enable_translation(iommu);
4772
4773                iommu_disable_protect_mem_regions(iommu);
4774        }
4775        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4776
4777        intel_iommu_enabled = 1;
4778        intel_iommu_debugfs_init();
4779
4780        return 0;
4781
4782out_free_reserved_range:
4783        put_iova_domain(&reserved_iova_list);
4784out_free_dmar:
4785        intel_iommu_free_dmars();
4786        up_write(&dmar_global_lock);
4787        iommu_exit_mempool();
4788        return ret;
4789}
4790
4791static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4792{
4793        struct intel_iommu *iommu = opaque;
4794
4795        domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4796        return 0;
4797}
4798
4799/*
4800 * NB - intel-iommu lacks any sort of reference counting for the users of
4801 * dependent devices.  If multiple endpoints have intersecting dependent
4802 * devices, unbinding the driver from any one of them will possibly leave
4803 * the others unable to operate.
4804 */
4805static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4806{
4807        if (!iommu || !dev || !dev_is_pci(dev))
4808                return;
4809
4810        pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4811}
4812
4813static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4814{
4815        struct dmar_domain *domain;
4816        struct intel_iommu *iommu;
4817        unsigned long flags;
4818
4819        assert_spin_locked(&device_domain_lock);
4820
4821        if (WARN_ON(!info))
4822                return;
4823
4824        iommu = info->iommu;
4825        domain = info->domain;
4826
4827        if (info->dev) {
4828                if (dev_is_pci(info->dev) && sm_supported(iommu))
4829                        intel_pasid_tear_down_entry(iommu, info->dev,
4830                                        PASID_RID2PASID);
4831
4832                iommu_disable_dev_iotlb(info);
4833                domain_context_clear(iommu, info->dev);
4834                intel_pasid_free_table(info->dev);
4835        }
4836
4837        unlink_domain_info(info);
4838
4839        spin_lock_irqsave(&iommu->lock, flags);
4840        domain_detach_iommu(domain, iommu);
4841        spin_unlock_irqrestore(&iommu->lock, flags);
4842
4843        /* free the private domain */
4844        if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4845            !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
4846            list_empty(&domain->devices))
4847                domain_exit(info->domain);
4848
4849        free_devinfo_mem(info);
4850}
4851
4852static void dmar_remove_one_dev_info(struct device *dev)
4853{
4854        struct device_domain_info *info;
4855        unsigned long flags;
4856
4857        spin_lock_irqsave(&device_domain_lock, flags);
4858        info = dev->archdata.iommu;
4859        if (info)
4860                __dmar_remove_one_dev_info(info);
4861        spin_unlock_irqrestore(&device_domain_lock, flags);
4862}
4863
4864static int md_domain_init(struct dmar_domain *domain, int guest_width)
4865{
4866        int adjust_width;
4867
4868        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4869        domain_reserve_special_ranges(domain);
4870
4871        /* calculate AGAW */
4872        domain->gaw = guest_width;
4873        adjust_width = guestwidth_to_adjustwidth(guest_width);
4874        domain->agaw = width_to_agaw(adjust_width);
4875
4876        domain->iommu_coherency = 0;
4877        domain->iommu_snooping = 0;
4878        domain->iommu_superpage = 0;
4879        domain->max_addr = 0;
4880
4881        /* always allocate the top pgd */
4882        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4883        if (!domain->pgd)
4884                return -ENOMEM;
4885        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4886        return 0;
4887}
4888
4889static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4890{
4891        struct dmar_domain *dmar_domain;
4892        struct iommu_domain *domain;
4893
4894        switch (type) {
4895        case IOMMU_DOMAIN_DMA:
4896        /* fallthrough */
4897        case IOMMU_DOMAIN_UNMANAGED:
4898                dmar_domain = alloc_domain(0);
4899                if (!dmar_domain) {
4900                        pr_err("Can't allocate dmar_domain\n");
4901                        return NULL;
4902                }
4903                if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4904                        pr_err("Domain initialization failed\n");
4905                        domain_exit(dmar_domain);
4906                        return NULL;
4907                }
4908
4909                if (type == IOMMU_DOMAIN_DMA &&
4910                    init_iova_flush_queue(&dmar_domain->iovad,
4911                                          iommu_flush_iova, iova_entry_free)) {
4912                        pr_warn("iova flush queue initialization failed\n");
4913                        intel_iommu_strict = 1;
4914                }
4915
4916                domain_update_iommu_cap(dmar_domain);
4917
4918                domain = &dmar_domain->domain;
4919                domain->geometry.aperture_start = 0;
4920                domain->geometry.aperture_end   =
4921                                __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4922                domain->geometry.force_aperture = true;
4923
4924                return domain;
4925        case IOMMU_DOMAIN_IDENTITY:
4926                return &si_domain->domain;
4927        default:
4928                return NULL;
4929        }
4930
4931        return NULL;
4932}
4933
4934static void intel_iommu_domain_free(struct iommu_domain *domain)
4935{
4936        if (domain != &si_domain->domain)
4937                domain_exit(to_dmar_domain(domain));
4938}
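/*
 * Minimal usage sketch (assumption: an external caller such as VFIO, not
 * code in this driver): the two callbacks above are reached through the
 * generic IOMMU API. iommu_domain_alloc() ends up in
 * intel_iommu_domain_alloc() with IOMMU_DOMAIN_UNMANAGED, and
 * iommu_domain_free() in intel_iommu_domain_free().
 */
static int __maybe_unused example_use_unmanaged_domain(struct device *dev)
{
        struct iommu_domain *domain;
        int ret;

        domain = iommu_domain_alloc(&pci_bus_type);
        if (!domain)
                return -ENOMEM;

        ret = iommu_attach_device(domain, dev); /* -> intel_iommu_attach_device() */
        if (ret)
                goto out_free;

        /* ... map and unmap IOVA ranges with iommu_map()/iommu_unmap() ... */

        iommu_detach_device(domain, dev);       /* -> intel_iommu_detach_device() */
out_free:
        iommu_domain_free(domain);
        return ret;
}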
4939
4940/*
4941 * Check whether @domain can be attached to @dev through the
4942 * aux-domain attach/detach APIs.
4943 */
4944static inline bool
4945is_aux_domain(struct device *dev, struct iommu_domain *domain)
4946{
4947        struct device_domain_info *info = dev->archdata.iommu;
4948
4949        return info && info->auxd_enabled &&
4950                        domain->type == IOMMU_DOMAIN_UNMANAGED;
4951}
4952
4953static void auxiliary_link_device(struct dmar_domain *domain,
4954                                  struct device *dev)
4955{
4956        struct device_domain_info *info = dev->archdata.iommu;
4957
4958        assert_spin_locked(&device_domain_lock);
4959        if (WARN_ON(!info))
4960                return;
4961
4962        domain->auxd_refcnt++;
4963        list_add(&domain->auxd, &info->auxiliary_domains);
4964}
4965
4966static void auxiliary_unlink_device(struct dmar_domain *domain,
4967                                    struct device *dev)
4968{
4969        struct device_domain_info *info = dev->archdata.iommu;
4970
4971        assert_spin_locked(&device_domain_lock);
4972        if (WARN_ON(!info))
4973                return;
4974
4975        list_del(&domain->auxd);
4976        domain->auxd_refcnt--;
4977
4978        if (!domain->auxd_refcnt && domain->default_pasid > 0)
4979                intel_pasid_free_id(domain->default_pasid);
4980}
4981
4982static int aux_domain_add_dev(struct dmar_domain *domain,
4983                              struct device *dev)
4984{
4985        int ret;
4986        u8 bus, devfn;
4987        unsigned long flags;
4988        struct intel_iommu *iommu;
4989
4990        iommu = device_to_iommu(dev, &bus, &devfn);
4991        if (!iommu)
4992                return -ENODEV;
4993
4994        if (domain->default_pasid <= 0) {
4995                int pasid;
4996
4997                pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4998                                             pci_max_pasids(to_pci_dev(dev)),
4999                                             GFP_KERNEL);
5000                if (pasid <= 0) {
5001                        pr_err("Can't allocate default pasid\n");
5002                        return -ENODEV;
5003                }
5004                domain->default_pasid = pasid;
5005        }
5006
5007        spin_lock_irqsave(&device_domain_lock, flags);
5008        /*
5009         * iommu->lock must be held to attach the domain to the iommu and to
5010         * set up the PASID entry for second-level translation.
5011         */
5012        spin_lock(&iommu->lock);
5013        ret = domain_attach_iommu(domain, iommu);
5014        if (ret)
5015                goto attach_failed;
5016
5017        /* Set up the PASID entry for mediated devices: */
5018        ret = intel_pasid_setup_second_level(iommu, domain, dev,
5019                                             domain->default_pasid);
5020        if (ret)
5021                goto table_failed;
5022        spin_unlock(&iommu->lock);
5023
5024        auxiliary_link_device(domain, dev);
5025
5026        spin_unlock_irqrestore(&device_domain_lock, flags);
5027
5028        return 0;
5029
5030table_failed:
5031        domain_detach_iommu(domain, iommu);
5032attach_failed:
5033        spin_unlock(&iommu->lock);
5034        spin_unlock_irqrestore(&device_domain_lock, flags);
5035        if (!domain->auxd_refcnt && domain->default_pasid > 0)
5036                intel_pasid_free_id(domain->default_pasid);
5037
5038        return ret;
5039}
5040
5041static void aux_domain_remove_dev(struct dmar_domain *domain,
5042                                  struct device *dev)
5043{
5044        struct device_domain_info *info;
5045        struct intel_iommu *iommu;
5046        unsigned long flags;
5047
5048        if (!is_aux_domain(dev, &domain->domain))
5049                return;
5050
5051        spin_lock_irqsave(&device_domain_lock, flags);
5052        info = dev->archdata.iommu;
5053        iommu = info->iommu;
5054
5055        auxiliary_unlink_device(domain, dev);
5056
5057        spin_lock(&iommu->lock);
5058        intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5059        domain_detach_iommu(domain, iommu);
5060        spin_unlock(&iommu->lock);
5061
5062        spin_unlock_irqrestore(&device_domain_lock, flags);
5063}
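/*
 * Minimal usage sketch (assumption: an mdev-style caller outside this
 * file): the aux-domain paths above are reached through the generic
 * IOMMU API. The AUX feature must be enabled on the physical device
 * first, after which an UNMANAGED domain can be attached as an
 * auxiliary domain and its default PASID queried so the device can tag
 * DMA with it.
 */
static int __maybe_unused example_aux_attach(struct iommu_domain *domain,
                                             struct device *dev)
{
        int ret, pasid;

        if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX))
                return -ENODEV;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
        if (ret)
                return ret;

        ret = iommu_aux_attach_device(domain, dev); /* -> intel_iommu_aux_attach_device() */
        if (ret)
                goto out_disable;

        pasid = iommu_aux_get_pasid(domain, dev);   /* the domain's default PASID */
        if (pasid < 0) {
                iommu_aux_detach_device(domain, dev);
                ret = pasid;
                goto out_disable;
        }

        /* ... program @pasid into the device's PASID-tagged work queues ... */
        return 0;

out_disable:
        iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
        return ret;
}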
5064
5065static int prepare_domain_attach_device(struct iommu_domain *domain,
5066                                        struct device *dev)
5067{
5068        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5069        struct intel_iommu *iommu;
5070        int addr_width;
5071        u8 bus, devfn;
5072
5073        iommu = device_to_iommu(dev, &bus, &devfn);
5074        if (!iommu)
5075                return -ENODEV;
5076
5077        /* check if this iommu agaw is sufficient for max mapped address */
5078        addr_width = agaw_to_width(iommu->agaw);
5079        if (addr_width > cap_mgaw(iommu->cap))
5080                addr_width = cap_mgaw(iommu->cap);
5081
5082        if (dmar_domain->max_addr > (1LL << addr_width)) {
5083                dev_err(dev,
5084                        "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5085                        __func__, addr_width, dmar_domain->max_addr);
5086                return -EFAULT;
5087        }
5088        dmar_domain->gaw = addr_width;
5089
5090        /*
5091         * Knock out extra levels of page tables if necessary
5092         */
5093        while (iommu->agaw < dmar_domain->agaw) {
5094                struct dma_pte *pte;
5095
5096                pte = dmar_domain->pgd;
5097                if (dma_pte_present(pte)) {
5098                        dmar_domain->pgd = (struct dma_pte *)
5099                                phys_to_virt(dma_pte_addr(pte));
5100                        free_pgtable_page(pte);
5101                }
5102                dmar_domain->agaw--;
5103        }
5104
5105        return 0;
5106}
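/*
 * Worked example for the level knock-down above (hedged; the numbers
 * follow from agaw_to_width() == min(30 + 9 * agaw, 64)):
 *
 *   - a domain created with agaw 3 covers a 57-bit GAW and uses a
 *     5-level page table;
 *   - an IOMMU with agaw 2 can only walk a 4-level (48-bit) table.
 *
 * Each pass of the loop replaces dmar_domain->pgd with the lower-level
 * table referenced by the first top-level entry (when present), frees
 * the old top page and decrements agaw, until the table is shallow
 * enough for the target IOMMU to walk.
 */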
5107
5108static int intel_iommu_attach_device(struct iommu_domain *domain,
5109                                     struct device *dev)
5110{
5111        int ret;
5112
5113        if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5114            device_is_rmrr_locked(dev)) {
5115                dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5116                return -EPERM;
5117        }
5118
5119        if (is_aux_domain(dev, domain))
5120                return -EPERM;
5121
5122        /* normally dev is not mapped */
5123        if (unlikely(domain_context_mapped(dev))) {
5124                struct dmar_domain *old_domain;
5125
5126                old_domain = find_domain(dev);
5127                if (old_domain)
5128                        dmar_remove_one_dev_info(dev);
5129        }
5130
5131        ret = prepare_domain_attach_device(domain, dev);
5132        if (ret)
5133                return ret;
5134
5135        return domain_add_dev_info(to_dmar_domain(domain), dev);
5136}
5137
5138static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5139                                         struct device *dev)
5140{
5141        int ret;
5142
5143        if (!is_aux_domain(dev, domain))
5144                return -EPERM;
5145
5146        ret = prepare_domain_attach_device(domain, dev);
5147        if (ret)
5148                return ret;
5149
5150        return aux_domain_add_dev(to_dmar_domain(domain), dev);
5151}
5152
5153static void intel_iommu_detach_device(struct iommu_domain *domain,
5154                                      struct device *dev)
5155{
5156        dmar_remove_one_dev_info(dev);
5157}
5158
5159static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5160                                          struct device *dev)
5161{
5162        aux_domain_remove_dev(to_dmar_domain(domain), dev);
5163}
5164
5165static int intel_iommu_map(struct iommu_domain *domain,
5166                           unsigned long iova, phys_addr_t hpa,
5167                           size_t size, int iommu_prot)
5168{
5169        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5170        u64 max_addr;
5171        int prot = 0;
5172        int ret;
5173
5174        if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5175                return -EINVAL;
5176
5177        if (iommu_prot & IOMMU_READ)
5178                prot |= DMA_PTE_READ;
5179        if (iommu_prot & IOMMU_WRITE)
5180                prot |= DMA_PTE_WRITE;
5181        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5182                prot |= DMA_PTE_SNP;
5183
5184        max_addr = iova + size;
5185        if (dmar_domain->max_addr < max_addr) {
5186                u64 end;
5187
5188                /* check if minimum agaw is sufficient for mapped address */
5189                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5190                if (end < max_addr) {
5191                        pr_err("%s: iommu width (%d) is not "
5192                               "sufficient for the mapped address (%llx)\n",
5193                               __func__, dmar_domain->gaw, max_addr);
5194                        return -EFAULT;
5195                }
5196                dmar_domain->max_addr = max_addr;
5197        }
5198        /* Convert size to a VTD page count, rounding up if it plus the
5199           low bits of hpa would take us onto the next page */
5200        size = aligned_nrpages(hpa, size);
5201        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5202                                 hpa >> VTD_PAGE_SHIFT, size, prot);
5203        return ret;
5204}
5205
5206static size_t intel_iommu_unmap(struct iommu_domain *domain,
5207                                unsigned long iova, size_t size)
5208{
5209        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5210        struct page *freelist = NULL;
5211        unsigned long start_pfn, last_pfn;
5212        unsigned int npages;
5213        int iommu_id, level = 0;
5214
5215        /* Cope with horrid API which requires us to unmap more than the
5216           size argument if it happens to be a large-page mapping. */
5217        BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5218        if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5219                return 0;
5220
5221        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5222                size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5223
5224        start_pfn = iova >> VTD_PAGE_SHIFT;
5225        last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5226
5227        freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5228
5229        npages = last_pfn - start_pfn + 1;
5230
5231        for_each_domain_iommu(iommu_id, dmar_domain)
5232                iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5233                                      start_pfn, npages, !freelist, 0);
5234
5235        dma_free_pagelist(freelist);
5236
5237        if (dmar_domain->max_addr == iova + size)
5238                dmar_domain->max_addr = iova;
5239
5240        return size;
5241}
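/*
 * Minimal usage sketch (assumption: an external caller using the generic
 * API, not code in this driver): iommu_map() and iommu_unmap() funnel
 * into intel_iommu_map()/intel_iommu_unmap() above, and IOMMU_READ /
 * IOMMU_WRITE are translated to DMA_PTE_READ / DMA_PTE_WRITE as shown.
 */
static int __maybe_unused example_map_one_page(struct iommu_domain *domain,
                                               unsigned long iova,
                                               phys_addr_t paddr)
{
        int ret;

        /* @iova and @paddr are assumed to be VTD_PAGE_SIZE aligned */
        ret = iommu_map(domain, iova, paddr, VTD_PAGE_SIZE,
                        IOMMU_READ | IOMMU_WRITE);
        if (ret)
                return ret;

        /* intel_iommu_iova_to_phys() should now report @paddr for @iova */
        WARN_ON(iommu_iova_to_phys(domain, iova) != paddr);

        if (iommu_unmap(domain, iova, VTD_PAGE_SIZE) != VTD_PAGE_SIZE)
                return -EIO;

        return 0;
}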
5242
5243static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5244                                            dma_addr_t iova)
5245{
5246        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5247        struct dma_pte *pte;
5248        int level = 0;
5249        u64 phys = 0;
5250
5251        if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5252                return 0;
5253
5254        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5255        if (pte)
5256                phys = dma_pte_addr(pte);
5257
5258        return phys;
5259}
5260
5261static inline bool scalable_mode_support(void)
5262{
5263        struct dmar_drhd_unit *drhd;
5264        struct intel_iommu *iommu;
5265        bool ret = true;
5266
5267        rcu_read_lock();
5268        for_each_active_iommu(iommu, drhd) {
5269                if (!sm_supported(iommu)) {
5270                        ret = false;
5271                        break;
5272                }
5273        }
5274        rcu_read_unlock();
5275
5276        return ret;
5277}
5278
5279static inline bool iommu_pasid_support(void)
5280{
5281        struct dmar_drhd_unit *drhd;
5282        struct intel_iommu *iommu;
5283        bool ret = true;
5284
5285        rcu_read_lock();
5286        for_each_active_iommu(iommu, drhd) {
5287                if (!pasid_supported(iommu)) {
5288                        ret = false;
5289                        break;
5290                }
5291        }
5292        rcu_read_unlock();
5293
5294        return ret;
5295}
5296
5297static bool intel_iommu_capable(enum iommu_cap cap)
5298{
5299        if (cap == IOMMU_CAP_CACHE_COHERENCY)
5300                return domain_update_iommu_snooping(NULL) == 1;
5301        if (cap == IOMMU_CAP_INTR_REMAP)
5302                return irq_remapping_enabled == 1;
5303
5304        return false;
5305}
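/*
 * Minimal usage sketch (illustrative only): callers query these
 * capabilities through the generic iommu_capable() interface; VFIO, for
 * instance, checks cache coherency before allowing user-controlled DMA.
 */
static bool __maybe_unused example_check_iommu_coherency(void)
{
        return iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
}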
5306
5307static int intel_iommu_add_device(struct device *dev)
5308{
5309        struct dmar_domain *dmar_domain;
5310        struct iommu_domain *domain;
5311        struct intel_iommu *iommu;
5312        struct iommu_group *group;
5313        u8 bus, devfn;
5314        int ret;
5315
5316        iommu = device_to_iommu(dev, &bus, &devfn);
5317        if (!iommu)
5318                return -ENODEV;
5319
5320        iommu_device_link(&iommu->iommu, dev);
5321
5322        if (translation_pre_enabled(iommu))
5323                dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5324
5325        group = iommu_group_get_for_dev(dev);
5326
5327        if (IS_ERR(group))
5328                return PTR_ERR(group);
5329
5330        iommu_group_put(group);
5331
5332        domain = iommu_get_domain_for_dev(dev);
5333        dmar_domain = to_dmar_domain(domain);
5334        if (domain->type == IOMMU_DOMAIN_DMA) {
5335                if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5336                        ret = iommu_request_dm_for_dev(dev);
5337                        if (ret) {
5338                                dmar_remove_one_dev_info(dev);
5339                                dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5340                                domain_add_dev_info(si_domain, dev);
5341                                dev_info(dev,
5342                                         "Device uses a private identity domain.\n");
5343                        }
5344                }
5345        } else {
5346                if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5347                        ret = iommu_request_dma_domain_for_dev(dev);
5348                        if (ret) {
5349                                dmar_remove_one_dev_info(dev);
5350                                dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5351                                if (!get_private_domain_for_dev(dev)) {
5352                                        dev_warn(dev,
5353                                                 "Failed to get a private domain.\n");
5354                                        return -ENOMEM;
5355                                }
5356
5357                                dev_info(dev,
5358                                         "Device uses a private dma domain.\n");
5359                        }
5360                }
5361        }
5362
5363        return 0;
5364}
5365
5366static void intel_iommu_remove_device(struct device *dev)
5367{
5368        struct intel_iommu *iommu;
5369        u8 bus, devfn;
5370
5371        iommu = device_to_iommu(dev, &bus, &devfn);
5372        if (!iommu)
5373                return;
5374
5375        dmar_remove_one_dev_info(dev);
5376
5377        iommu_group_remove_device(dev);
5378
5379        iommu_device_unlink(&iommu->iommu, dev);
5380}
5381
5382static void intel_iommu_get_resv_regions(struct device *device,
5383                                         struct list_head *head)
5384{
5385        int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5386        struct iommu_resv_region *reg;
5387        struct dmar_rmrr_unit *rmrr;
5388        struct device *i_dev;
5389        int i;
5390
5391        down_read(&dmar_global_lock);
5392        for_each_rmrr_units(rmrr) {
5393                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5394                                          i, i_dev) {
5395                        struct iommu_resv_region *resv;
5396                        enum iommu_resv_type type;
5397                        size_t length;
5398
5399                        if (i_dev != device &&
5400                            !is_downstream_to_pci_bridge(device, i_dev))
5401                                continue;
5402
5403                        length = rmrr->end_address - rmrr->base_address + 1;
5404
5405                        type = device_rmrr_is_relaxable(device) ?
5406                                IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5407
5408                        resv = iommu_alloc_resv_region(rmrr->base_address,
5409                                                       length, prot, type);
5410                        if (!resv)
5411                                break;
5412
5413                        list_add_tail(&resv->list, head);
5414                }
5415        }
5416        up_read(&dmar_global_lock);
5417
5418#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5419        if (dev_is_pci(device)) {
5420                struct pci_dev *pdev = to_pci_dev(device);
5421
5422                if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5423                        reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5424                                                      IOMMU_RESV_DIRECT);
5425                        if (reg)
5426                                list_add_tail(&reg->list, head);
5427                }
5428        }
5429#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5430
5431        reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5432                                      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5433                                      0, IOMMU_RESV_MSI);
5434        if (!reg)
5435                return;
5436        list_add_tail(&reg->list, head);
5437}
5438
5439static void intel_iommu_put_resv_regions(struct device *dev,
5440                                         struct list_head *head)
5441{
5442        struct iommu_resv_region *entry, *next;
5443
5444        list_for_each_entry_safe(entry, next, head, list)
5445                kfree(entry);
5446}
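/*
 * Minimal usage sketch (assumption: an external caller such as the IOMMU
 * core or VFIO): the reserved regions published above are consumed
 * through the generic helpers, which invoke the two callbacks above.
 */
static void __maybe_unused example_dump_resv_regions(struct device *dev)
{
        struct iommu_resv_region *region;
        LIST_HEAD(resv_regions);

        iommu_get_resv_regions(dev, &resv_regions);     /* -> intel_iommu_get_resv_regions() */
        list_for_each_entry(region, &resv_regions, list)
                dev_info(dev, "reserved [0x%llx-0x%llx] type %d\n",
                         (u64)region->start,
                         (u64)(region->start + region->length - 1),
                         region->type);
        iommu_put_resv_regions(dev, &resv_regions);     /* -> intel_iommu_put_resv_regions() */
}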
5447
5448int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5449{
5450        struct device_domain_info *info;
5451        struct context_entry *context;
5452        struct dmar_domain *domain;
5453        unsigned long flags;
5454        u64 ctx_lo;
5455        int ret;
5456
5457        domain = find_domain(dev);
5458        if (!domain)
5459                return -EINVAL;
5460
5461        spin_lock_irqsave(&device_domain_lock, flags);
5462        spin_lock(&iommu->lock);
5463
5464        ret = -EINVAL;
5465        info = dev->archdata.iommu;
5466        if (!info || !info->pasid_supported)
5467                goto out;
5468
5469        context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5470        if (WARN_ON(!context))
5471                goto out;
5472
5473        ctx_lo = context[0].lo;
5474
5475        if (!(ctx_lo & CONTEXT_PASIDE)) {
5476                ctx_lo |= CONTEXT_PASIDE;
5477                context[0].lo = ctx_lo;
5478                wmb();
5479                iommu->flush.flush_context(iommu,
5480                                           domain->iommu_did[iommu->seq_id],
5481                                           PCI_DEVID(info->bus, info->devfn),
5482                                           DMA_CCMD_MASK_NOBIT,
5483                                           DMA_CCMD_DEVICE_INVL);
5484        }
5485
5486        /* Enable PASID support in the device, if it wasn't already */
5487        if (!info->pasid_enabled)
5488                iommu_enable_dev_iotlb(info);
5489
5490        ret = 0;
5491
5492 out:
5493        spin_unlock(&iommu->lock);
5494        spin_unlock_irqrestore(&device_domain_lock, flags);
5495
5496        return ret;
5497}
5498
5499static void intel_iommu_apply_resv_region(struct device *dev,
5500                                          struct iommu_domain *domain,
5501                                          struct iommu_resv_region *region)
5502{
5503        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5504        unsigned long start, end;
5505
5506        start = IOVA_PFN(region->start);
5507        end   = IOVA_PFN(region->start + region->length - 1);
5508
5509        WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5510}
5511
5512#ifdef CONFIG_INTEL_IOMMU_SVM
5513struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5514{
5515        struct intel_iommu *iommu;
5516        u8 bus, devfn;
5517
5518        if (iommu_dummy(dev)) {
5519                dev_warn(dev,
5520                         "No IOMMU translation for device; cannot enable SVM\n");
5521                return NULL;
5522        }
5523
5524        iommu = device_to_iommu(dev, &bus, &devfn);
5525        if (!iommu) {
5526                dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5527                return NULL;
5528        }
5529
5530        return iommu;
5531}
5532#endif /* CONFIG_INTEL_IOMMU_SVM */
5533
5534static int intel_iommu_enable_auxd(struct device *dev)
5535{
5536        struct device_domain_info *info;
5537        struct intel_iommu *iommu;
5538        unsigned long flags;
5539        u8 bus, devfn;
5540        int ret;
5541
5542        iommu = device_to_iommu(dev, &bus, &devfn);
5543        if (!iommu || dmar_disabled)
5544                return -EINVAL;
5545
5546        if (!sm_supported(iommu) || !pasid_supported(iommu))
5547                return -EINVAL;
5548
5549        ret = intel_iommu_enable_pasid(iommu, dev);
5550        if (ret)
5551                return -ENODEV;
5552
5553        spin_lock_irqsave(&device_domain_lock, flags);
5554        info = dev->archdata.iommu;
5555        info->auxd_enabled = 1;
5556        spin_unlock_irqrestore(&device_domain_lock, flags);
5557
5558        return 0;
5559}
5560
5561static int intel_iommu_disable_auxd(struct device *dev)
5562{
5563        struct device_domain_info *info;
5564        unsigned long flags;
5565
5566        spin_lock_irqsave(&device_domain_lock, flags);
5567        info = dev->archdata.iommu;
5568        if (!WARN_ON(!info))
5569                info->auxd_enabled = 0;
5570        spin_unlock_irqrestore(&device_domain_lock, flags);
5571
5572        return 0;
5573}
5574
5575/*
5576 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC)
5577 * is defined in section 3.7 of the Intel Scalable I/O Virtualization
5578 * technical spec so that system software and tools can detect endpoint
5579 * devices supporting Intel Scalable I/O Virtualization without a host
5580 * driver dependency.
5581 *
5582 * Returns the configuration space offset of the matching extended
5583 * capability structure, or 0 if the device does not support it.
5584 */
5585static int siov_find_pci_dvsec(struct pci_dev *pdev)
5586{
5587        int pos;
5588        u16 vendor, id;
5589
5590        pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5591        while (pos) {
5592                pci_read_config_word(pdev, pos + 4, &vendor);
5593                pci_read_config_word(pdev, pos + 8, &id);
5594                if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5595                        return pos;
5596
5597                pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5598        }
5599
5600        return 0;
5601}
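/*
 * Layout sketch for the walk above (offsets follow the PCIe DVSEC
 * definition; listed here for illustration only):
 *
 *   pos + 0x0: extended capability header (capability ID 0x23 = DVSEC)
 *   pos + 0x4: DVSEC vendor ID       -- must read PCI_VENDOR_ID_INTEL
 *   pos + 0x6: DVSEC revision/length
 *   pos + 0x8: DVSEC ID              -- the value 5 identifies Scalable IOV
 *
 * intel_iommu_dev_has_feat() below treats a non-zero return as "this
 * endpoint advertises Scalable IOV".
 */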
5602
5603static bool
5604intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5605{
5606        if (feat == IOMMU_DEV_FEAT_AUX) {
5607                int ret;
5608
5609                if (!dev_is_pci(dev) || dmar_disabled ||
5610                    !scalable_mode_support() || !iommu_pasid_support())
5611                        return false;
5612
5613                ret = pci_pasid_features(to_pci_dev(dev));
5614                if (ret < 0)
5615                        return false;
5616
5617                return !!siov_find_pci_dvsec(to_pci_dev(dev));
5618        }
5619
5620        return false;
5621}
5622
5623static int
5624intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5625{
5626        if (feat == IOMMU_DEV_FEAT_AUX)
5627                return intel_iommu_enable_auxd(dev);
5628
5629        return -ENODEV;
5630}
5631
5632static int
5633intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5634{
5635        if (feat == IOMMU_DEV_FEAT_AUX)
5636                return intel_iommu_disable_auxd(dev);
5637
5638        return -ENODEV;
5639}
5640
5641static bool
5642intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5643{
5644        struct device_domain_info *info = dev->archdata.iommu;
5645
5646        if (feat == IOMMU_DEV_FEAT_AUX)
5647                return scalable_mode_support() && info && info->auxd_enabled;
5648
5649        return false;
5650}
5651
5652static int
5653intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5654{
5655        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5656
5657        return dmar_domain->default_pasid > 0 ?
5658                        dmar_domain->default_pasid : -EINVAL;
5659}
5660
5661static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5662                                           struct device *dev)
5663{
5664        return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5665}
5666
5667const struct iommu_ops intel_iommu_ops = {
5668        .capable                = intel_iommu_capable,
5669        .domain_alloc           = intel_iommu_domain_alloc,
5670        .domain_free            = intel_iommu_domain_free,
5671        .attach_dev             = intel_iommu_attach_device,
5672        .detach_dev             = intel_iommu_detach_device,
5673        .aux_attach_dev         = intel_iommu_aux_attach_device,
5674        .aux_detach_dev         = intel_iommu_aux_detach_device,
5675        .aux_get_pasid          = intel_iommu_aux_get_pasid,
5676        .map                    = intel_iommu_map,
5677        .unmap                  = intel_iommu_unmap,
5678        .iova_to_phys           = intel_iommu_iova_to_phys,
5679        .add_device             = intel_iommu_add_device,
5680        .remove_device          = intel_iommu_remove_device,
5681        .get_resv_regions       = intel_iommu_get_resv_regions,
5682        .put_resv_regions       = intel_iommu_put_resv_regions,
5683        .apply_resv_region      = intel_iommu_apply_resv_region,
5684        .device_group           = pci_device_group,
5685        .dev_has_feat           = intel_iommu_dev_has_feat,
5686        .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5687        .dev_enable_feat        = intel_iommu_dev_enable_feat,
5688        .dev_disable_feat       = intel_iommu_dev_disable_feat,
5689        .is_attach_deferred     = intel_iommu_is_attach_deferred,
5690        .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5691};
5692
5693static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5694{
5695        /* G4x/GM45 integrated gfx dmar support is totally busted. */
5696        pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5697        dmar_map_gfx = 0;
5698}
5699
5700DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5701DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5702DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5703DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5704DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5705DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5706DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5707
5708static void quirk_iommu_rwbf(struct pci_dev *dev)
5709{
5710        /*
5711         * Mobile 4 Series Chipset neglects to set RWBF capability,
5712         * but needs it. Same seems to hold for the desktop versions.
5713         */
5714        pci_info(dev, "Forcing write-buffer flush capability\n");
5715        rwbf_quirk = 1;
5716}
5717
5718DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5719DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5720DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5721DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5722DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5723DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5724DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5725
5726#define GGC 0x52
5727#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5728#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5729#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5730#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5731#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5732#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5733#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5734#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5735
5736static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5737{
5738        unsigned short ggc;
5739
5740        if (pci_read_config_word(dev, GGC, &ggc))
5741                return;
5742
5743        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5744                pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5745                dmar_map_gfx = 0;
5746        } else if (dmar_map_gfx) {
5747                /* we have to ensure the gfx device is idle before we flush */
5748                pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5749                intel_iommu_strict = 1;
5750        }
5751}
5752DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5753DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5754DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5755DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5756
5757/* On Tylersburg chipsets, some BIOSes have been known to enable the
5758   ISOCH DMAR unit for the Azalia sound device, but not give it any
5759   TLB entries, which causes it to deadlock. Check for that.  We do
5760   this in a function called from init_dmars(), instead of in a PCI
5761   quirk, because we don't want to print the obnoxious "BIOS broken"
5762   message if VT-d is actually disabled.
5763*/
5764static void __init check_tylersburg_isoch(void)
5765{
5766        struct pci_dev *pdev;
5767        uint32_t vtisochctrl;
5768
5769        /* If there's no Azalia in the system anyway, forget it. */
5770        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5771        if (!pdev)
5772                return;
5773        pci_dev_put(pdev);
5774
5775        /* System Management Registers. Might be hidden, in which case
5776           we can't do the sanity check. But that's OK, because the
5777           known-broken BIOSes _don't_ actually hide it, so far. */
5778        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5779        if (!pdev)
5780                return;
5781
5782        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5783                pci_dev_put(pdev);
5784                return;
5785        }
5786
5787        pci_dev_put(pdev);
5788
5789        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5790        if (vtisochctrl & 1)
5791                return;
5792
5793        /* Drop all bits other than the number of TLB entries */
5794        vtisochctrl &= 0x1c;
5795
5796        /* If we have the recommended number of TLB entries (16), fine. */
5797        if (vtisochctrl == 0x10)
5798                return;
5799
5800        /* Zero TLB entries means the BIOS is badly broken; work around it. */
5801        if (!vtisochctrl) {
5802                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5803                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5804                     dmi_get_system_info(DMI_BIOS_VENDOR),
5805                     dmi_get_system_info(DMI_BIOS_VERSION),
5806                     dmi_get_system_info(DMI_PRODUCT_VERSION));
5807                iommu_identity_mapping |= IDENTMAP_AZALIA;
5808                return;
5809        }
5810
5811        pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5812                vtisochctrl);
5813}
5814