linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright © 2006-2014 Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * Authors: David Woodhouse <dwmw2@infradead.org>,
  14 *          Ashok Raj <ashok.raj@intel.com>,
  15 *          Shaohua Li <shaohua.li@intel.com>,
  16 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17 *          Fenghua Yu <fenghua.yu@intel.com>
  18 *          Joerg Roedel <jroedel@suse.de>
  19 */
  20
  21#define pr_fmt(fmt)     "DMAR: " fmt
  22
  23#include <linux/init.h>
  24#include <linux/bitmap.h>
  25#include <linux/debugfs.h>
  26#include <linux/export.h>
  27#include <linux/slab.h>
  28#include <linux/irq.h>
  29#include <linux/interrupt.h>
  30#include <linux/spinlock.h>
  31#include <linux/pci.h>
  32#include <linux/dmar.h>
  33#include <linux/dma-mapping.h>
  34#include <linux/dma-direct.h>
  35#include <linux/mempool.h>
  36#include <linux/memory.h>
  37#include <linux/cpu.h>
  38#include <linux/timer.h>
  39#include <linux/io.h>
  40#include <linux/iova.h>
  41#include <linux/iommu.h>
  42#include <linux/intel-iommu.h>
  43#include <linux/syscore_ops.h>
  44#include <linux/tboot.h>
  45#include <linux/dmi.h>
  46#include <linux/pci-ats.h>
  47#include <linux/memblock.h>
  48#include <linux/dma-contiguous.h>
  50#include <linux/crash_dump.h>
  51#include <asm/irq_remapping.h>
  52#include <asm/cacheflush.h>
  53#include <asm/iommu.h>
  54
  55#include "irq_remapping.h"
  56
  57#define ROOT_SIZE               VTD_PAGE_SIZE
  58#define CONTEXT_SIZE            VTD_PAGE_SIZE
  59
  60#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  61#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  62#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  63#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  64
  65#define IOAPIC_RANGE_START      (0xfee00000)
  66#define IOAPIC_RANGE_END        (0xfeefffff)
  67#define IOVA_START_ADDR         (0x1000)
  68
  69#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  70
  71#define MAX_AGAW_WIDTH 64
  72#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  73
  74#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  75#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  76
  77/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  78   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  79#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  80                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  81#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  82
  83/* IO virtual address start page frame number */
  84#define IOVA_START_PFN          (1)
  85
  86#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  87
  88/* page table handling */
  89#define LEVEL_STRIDE            (9)
  90#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  91
  92/*
   93 * This bitmap is used to advertise the page sizes our hardware supports
  94 * to the IOMMU core, which will then use this information to split
  95 * physically contiguous memory regions it is mapping into page sizes
  96 * that we support.
  97 *
  98 * Traditionally the IOMMU core just handed us the mappings directly,
  99 * after making sure the size is an order of a 4KiB page and that the
 100 * mapping has natural alignment.
 101 *
 102 * To retain this behavior, we currently advertise that we support
 103 * all page sizes that are an order of 4KiB.
 104 *
 105 * If at some point we'd like to utilize the IOMMU core's new behavior,
 106 * we could change this to advertise the real page sizes we support.
 107 */
 108#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 109
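/*
 * AGAW (adjusted guest address width) encodes the number of page-table
 * levels: agaw 0 is a 2-level, 30-bit table, and each additional level
 * adds LEVEL_STRIDE (9) bits of address width.
 */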
 110static inline int agaw_to_level(int agaw)
 111{
 112        return agaw + 2;
 113}
 114
 115static inline int agaw_to_width(int agaw)
 116{
 117        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118}
 119
 120static inline int width_to_agaw(int width)
 121{
 122        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123}
 124
 125static inline unsigned int level_to_offset_bits(int level)
 126{
 127        return (level - 1) * LEVEL_STRIDE;
 128}
 129
 130static inline int pfn_level_offset(unsigned long pfn, int level)
 131{
 132        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133}
 134
 135static inline unsigned long level_mask(int level)
 136{
 137        return -1UL << level_to_offset_bits(level);
 138}
 139
 140static inline unsigned long level_size(int level)
 141{
 142        return 1UL << level_to_offset_bits(level);
 143}
 144
 145static inline unsigned long align_to_level(unsigned long pfn, int level)
 146{
 147        return (pfn + level_size(level) - 1) & level_mask(level);
 148}
 149
 150static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151{
 152        return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153}
 154
  155/* VT-d pages must never be _larger_ than MM pages. Otherwise things
  156   are never going to work. */
 157static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158{
 159        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160}
 161
 162static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163{
 164        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165}
 166static inline unsigned long page_to_dma_pfn(struct page *pg)
 167{
 168        return mm_to_dma_pfn(page_to_pfn(pg));
 169}
 170static inline unsigned long virt_to_dma_pfn(void *p)
 171{
 172        return page_to_dma_pfn(virt_to_page(p));
 173}
 174
 175/* global iommu list, set NULL for ignored DMAR units */
 176static struct intel_iommu **g_iommus;
 177
 178static void __init check_tylersburg_isoch(void);
 179static int rwbf_quirk;
 180
 181/*
  182 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
  183 * (used when the kernel is launched with TXT)
 184 */
 185static int force_on = 0;
 186int intel_iommu_tboot_noforce;
 187
 188/*
 189 * 0: Present
 190 * 1-11: Reserved
 191 * 12-63: Context Ptr (12 - (haw-1))
 192 * 64-127: Reserved
 193 */
 194struct root_entry {
 195        u64     lo;
 196        u64     hi;
 197};
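/* One 4KiB root table holds 256 16-byte root entries, one per PCI bus. */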
 198#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 199
 200/*
 201 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 202 * if marked present.
 203 */
 204static phys_addr_t root_entry_lctp(struct root_entry *re)
 205{
 206        if (!(re->lo & 1))
 207                return 0;
 208
 209        return re->lo & VTD_PAGE_MASK;
 210}
 211
 212/*
 213 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 214 * if marked present.
 215 */
 216static phys_addr_t root_entry_uctp(struct root_entry *re)
 217{
 218        if (!(re->hi & 1))
 219                return 0;
 220
 221        return re->hi & VTD_PAGE_MASK;
 222}
 223/*
 224 * low 64 bits:
 225 * 0: present
 226 * 1: fault processing disable
 227 * 2-3: translation type
 228 * 12-63: address space root
 229 * high 64 bits:
 230 * 0-2: address width
  231 * 3-6: available
 232 * 8-23: domain id
 233 */
 234struct context_entry {
 235        u64 lo;
 236        u64 hi;
 237};
 238
 239static inline void context_clear_pasid_enable(struct context_entry *context)
 240{
 241        context->lo &= ~(1ULL << 11);
 242}
 243
 244static inline bool context_pasid_enabled(struct context_entry *context)
 245{
 246        return !!(context->lo & (1ULL << 11));
 247}
 248
 249static inline void context_set_copied(struct context_entry *context)
 250{
 251        context->hi |= (1ull << 3);
 252}
 253
 254static inline bool context_copied(struct context_entry *context)
 255{
 256        return !!(context->hi & (1ULL << 3));
 257}
 258
 259static inline bool __context_present(struct context_entry *context)
 260{
 261        return (context->lo & 1);
 262}
 263
 264static inline bool context_present(struct context_entry *context)
 265{
 266        return context_pasid_enabled(context) ?
 267             __context_present(context) :
 268             __context_present(context) && !context_copied(context);
 269}
 270
 271static inline void context_set_present(struct context_entry *context)
 272{
 273        context->lo |= 1;
 274}
 275
 276static inline void context_set_fault_enable(struct context_entry *context)
 277{
 278        context->lo &= (((u64)-1) << 2) | 1;
 279}
 280
 281static inline void context_set_translation_type(struct context_entry *context,
 282                                                unsigned long value)
 283{
 284        context->lo &= (((u64)-1) << 4) | 3;
 285        context->lo |= (value & 3) << 2;
 286}
 287
 288static inline void context_set_address_root(struct context_entry *context,
 289                                            unsigned long value)
 290{
 291        context->lo &= ~VTD_PAGE_MASK;
 292        context->lo |= value & VTD_PAGE_MASK;
 293}
 294
 295static inline void context_set_address_width(struct context_entry *context,
 296                                             unsigned long value)
 297{
 298        context->hi |= value & 7;
 299}
 300
 301static inline void context_set_domain_id(struct context_entry *context,
 302                                         unsigned long value)
 303{
 304        context->hi |= (value & ((1 << 16) - 1)) << 8;
 305}
 306
 307static inline int context_domain_id(struct context_entry *c)
 308{
 309        return((c->hi >> 8) & 0xffff);
 310}
 311
 312static inline void context_clear_entry(struct context_entry *context)
 313{
 314        context->lo = 0;
 315        context->hi = 0;
 316}
 317
 318/*
 319 * 0: readable
 320 * 1: writable
 321 * 2-6: reserved
 322 * 7: super page
 323 * 8-10: available
 324 * 11: snoop behavior
  325 * 12-63: Host physical address
 326 */
 327struct dma_pte {
 328        u64 val;
 329};
 330
 331static inline void dma_clear_pte(struct dma_pte *pte)
 332{
 333        pte->val = 0;
 334}
 335
 336static inline u64 dma_pte_addr(struct dma_pte *pte)
 337{
 338#ifdef CONFIG_64BIT
 339        return pte->val & VTD_PAGE_MASK;
 340#else
 341        /* Must have a full atomic 64-bit read */
 342        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 343#endif
 344}
 345
 346static inline bool dma_pte_present(struct dma_pte *pte)
 347{
 348        return (pte->val & 3) != 0;
 349}
 350
 351static inline bool dma_pte_superpage(struct dma_pte *pte)
 352{
 353        return (pte->val & DMA_PTE_LARGE_PAGE);
 354}
 355
 356static inline int first_pte_in_page(struct dma_pte *pte)
 357{
 358        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 359}
 360
 361/*
  362 * This domain is a static identity mapping domain.
  363 *      1. This domain creates a static 1:1 mapping of all usable memory.
  364 *      2. It maps to each iommu if successful.
  365 *      3. Each iommu maps to this domain if successful.
 366 */
 367static struct dmar_domain *si_domain;
 368static int hw_pass_through = 1;
 369
 370/*
  371 * Domain represents a virtual machine; more than one device
  372 * across iommus may be owned by one domain, e.g. a kvm guest.
 373 */
 374#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
 375
  376/* si_domain contains multiple devices */
 377#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
 378
 379#define for_each_domain_iommu(idx, domain)                      \
 380        for (idx = 0; idx < g_num_of_iommus; idx++)             \
 381                if (domain->iommu_refcnt[idx])
 382
 383struct dmar_domain {
 384        int     nid;                    /* node id */
 385
 386        unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
 387                                        /* Refcount of devices per iommu */
 388
 389
 390        u16             iommu_did[DMAR_UNITS_SUPPORTED];
 391                                        /* Domain ids per IOMMU. Use u16 since
 392                                         * domain ids are 16 bit wide according
 393                                         * to VT-d spec, section 9.3 */
 394
 395        bool has_iotlb_device;
 396        struct list_head devices;       /* all devices' list */
 397        struct iova_domain iovad;       /* iova's that belong to this domain */
 398
 399        struct dma_pte  *pgd;           /* virtual address */
 400        int             gaw;            /* max guest address width */
 401
 402        /* adjusted guest address width, 0 is level 2 30-bit */
 403        int             agaw;
 404
 405        int             flags;          /* flags to find out type of domain */
 406
 407        int             iommu_coherency;/* indicate coherency of iommu access */
 408        int             iommu_snooping; /* indicate snooping control feature*/
 409        int             iommu_count;    /* reference count of iommu */
 410        int             iommu_superpage;/* Level of superpages supported:
 411                                           0 == 4KiB (no superpages), 1 == 2MiB,
  412                                           2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
 413        u64             max_addr;       /* maximum mapped address */
 414
 415        struct iommu_domain domain;     /* generic domain data structure for
 416                                           iommu core */
 417};
 418
 419/* PCI domain-device relationship */
 420struct device_domain_info {
 421        struct list_head link;  /* link to domain siblings */
 422        struct list_head global; /* link to global list */
 423        u8 bus;                 /* PCI bus number */
 424        u8 devfn;               /* PCI devfn number */
 425        u8 pasid_supported:3;
 426        u8 pasid_enabled:1;
 427        u8 pri_supported:1;
 428        u8 pri_enabled:1;
 429        u8 ats_supported:1;
 430        u8 ats_enabled:1;
 431        u8 ats_qdep;
 432        struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
 433        struct intel_iommu *iommu; /* IOMMU used by this device */
 434        struct dmar_domain *domain; /* pointer to domain */
 435};
 436
 437struct dmar_rmrr_unit {
 438        struct list_head list;          /* list of rmrr units   */
 439        struct acpi_dmar_header *hdr;   /* ACPI header          */
 440        u64     base_address;           /* reserved base address*/
 441        u64     end_address;            /* reserved end address */
 442        struct dmar_dev_scope *devices; /* target devices */
 443        int     devices_cnt;            /* target device count */
 444        struct iommu_resv_region *resv; /* reserved region handle */
 445};
 446
 447struct dmar_atsr_unit {
 448        struct list_head list;          /* list of ATSR units */
 449        struct acpi_dmar_header *hdr;   /* ACPI header */
 450        struct dmar_dev_scope *devices; /* target devices */
 451        int devices_cnt;                /* target device count */
 452        u8 include_all:1;               /* include all ports */
 453};
 454
 455static LIST_HEAD(dmar_atsr_units);
 456static LIST_HEAD(dmar_rmrr_units);
 457
 458#define for_each_rmrr_units(rmrr) \
 459        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 460
  461/* number of IOMMU units, used to size g_iommus[] */
 462static int g_num_of_iommus;
 463
 464static void domain_exit(struct dmar_domain *domain);
 465static void domain_remove_dev_info(struct dmar_domain *domain);
 466static void dmar_remove_one_dev_info(struct dmar_domain *domain,
 467                                     struct device *dev);
 468static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 469static void domain_context_clear(struct intel_iommu *iommu,
 470                                 struct device *dev);
 471static int domain_detach_iommu(struct dmar_domain *domain,
 472                               struct intel_iommu *iommu);
 473
 474#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 475int dmar_disabled = 0;
 476#else
 477int dmar_disabled = 1;
 478#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 479
 480int intel_iommu_enabled = 0;
 481EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 482
 483static int dmar_map_gfx = 1;
 484static int dmar_forcedac;
 485static int intel_iommu_strict;
 486static int intel_iommu_superpage = 1;
 487static int intel_iommu_ecs = 1;
 488static int intel_iommu_pasid28;
 489static int iommu_identity_mapping;
 490
 491#define IDENTMAP_ALL            1
 492#define IDENTMAP_GFX            2
 493#define IDENTMAP_AZALIA         4
 494
 495/* Broadwell and Skylake have broken ECS support — normal so-called "second
 496 * level" translation of DMA requests-without-PASID doesn't actually happen
 497 * unless you also set the NESTE bit in an extended context-entry. Which of
 498 * course means that SVM doesn't work because it's trying to do nested
 499 * translation of the physical addresses it finds in the process page tables,
 500 * through the IOVA->phys mapping found in the "second level" page tables.
 501 *
 502 * The VT-d specification was retroactively changed to change the definition
 503 * of the capability bits and pretend that Broadwell/Skylake never happened...
 504 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 505 * for some reason it was the PASID capability bit which was redefined (from
 506 * bit 28 on BDW/SKL to bit 40 in future).
 507 *
 508 * So our test for ECS needs to eschew those implementations which set the old
  509 * PASID capability bit 28, since those are the ones on which ECS is broken.
 510 * Unless we are working around the 'pasid28' limitations, that is, by putting
 511 * the device into passthrough mode for normal DMA and thus masking the bug.
 512 */
 513#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 514                            (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 515/* PASID support is thus enabled if ECS is enabled and *either* of the old
  516 * or new capability bits is set. */
 517#define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
 518                              (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 519
 520int intel_iommu_gfx_mapped;
 521EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 522
 523#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 524static DEFINE_SPINLOCK(device_domain_lock);
 525static LIST_HEAD(device_domain_list);
 526
 527const struct iommu_ops intel_iommu_ops;
 528
 529static bool translation_pre_enabled(struct intel_iommu *iommu)
 530{
 531        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 532}
 533
 534static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 535{
 536        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 537}
 538
 539static void init_translation_status(struct intel_iommu *iommu)
 540{
 541        u32 gsts;
 542
 543        gsts = readl(iommu->reg + DMAR_GSTS_REG);
 544        if (gsts & DMA_GSTS_TES)
 545                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 546}
 547
  548/* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
 549static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 550{
 551        return container_of(dom, struct dmar_domain, domain);
 552}
 553
 554static int __init intel_iommu_setup(char *str)
 555{
 556        if (!str)
 557                return -EINVAL;
 558        while (*str) {
 559                if (!strncmp(str, "on", 2)) {
 560                        dmar_disabled = 0;
 561                        pr_info("IOMMU enabled\n");
 562                } else if (!strncmp(str, "off", 3)) {
 563                        dmar_disabled = 1;
 564                        pr_info("IOMMU disabled\n");
 565                } else if (!strncmp(str, "igfx_off", 8)) {
 566                        dmar_map_gfx = 0;
 567                        pr_info("Disable GFX device mapping\n");
 568                } else if (!strncmp(str, "forcedac", 8)) {
 569                        pr_info("Forcing DAC for PCI devices\n");
 570                        dmar_forcedac = 1;
 571                } else if (!strncmp(str, "strict", 6)) {
 572                        pr_info("Disable batched IOTLB flush\n");
 573                        intel_iommu_strict = 1;
 574                } else if (!strncmp(str, "sp_off", 6)) {
 575                        pr_info("Disable supported super page\n");
 576                        intel_iommu_superpage = 0;
 577                } else if (!strncmp(str, "ecs_off", 7)) {
 578                        printk(KERN_INFO
 579                                "Intel-IOMMU: disable extended context table support\n");
 580                        intel_iommu_ecs = 0;
 581                } else if (!strncmp(str, "pasid28", 7)) {
 582                        printk(KERN_INFO
 583                                "Intel-IOMMU: enable pre-production PASID support\n");
 584                        intel_iommu_pasid28 = 1;
 585                        iommu_identity_mapping |= IDENTMAP_GFX;
 586                } else if (!strncmp(str, "tboot_noforce", 13)) {
 587                        printk(KERN_INFO
 588                                "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 589                        intel_iommu_tboot_noforce = 1;
 590                }
 591
 592                str += strcspn(str, ",");
 593                while (*str == ',')
 594                        str++;
 595        }
 596        return 0;
 597}
 598__setup("intel_iommu=", intel_iommu_setup);
 599
 600static struct kmem_cache *iommu_domain_cache;
 601static struct kmem_cache *iommu_devinfo_cache;
 602
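/*
 * Domain IDs are 16 bits wide. iommu->domains is a two-level array indexed
 * by the upper and lower 8 bits of the DID, so the 256-entry second-level
 * pages can be allocated lazily in set_iommu_domain().
 */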
 603static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 604{
 605        struct dmar_domain **domains;
 606        int idx = did >> 8;
 607
 608        domains = iommu->domains[idx];
 609        if (!domains)
 610                return NULL;
 611
 612        return domains[did & 0xff];
 613}
 614
 615static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 616                             struct dmar_domain *domain)
 617{
 618        struct dmar_domain **domains;
 619        int idx = did >> 8;
 620
 621        if (!iommu->domains[idx]) {
 622                size_t size = 256 * sizeof(struct dmar_domain *);
 623                iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 624        }
 625
 626        domains = iommu->domains[idx];
 627        if (WARN_ON(!domains))
 628                return;
 629        else
 630                domains[did & 0xff] = domain;
 631}
 632
 633static inline void *alloc_pgtable_page(int node)
 634{
 635        struct page *page;
 636        void *vaddr = NULL;
 637
 638        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 639        if (page)
 640                vaddr = page_address(page);
 641        return vaddr;
 642}
 643
 644static inline void free_pgtable_page(void *vaddr)
 645{
 646        free_page((unsigned long)vaddr);
 647}
 648
 649static inline void *alloc_domain_mem(void)
 650{
 651        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 652}
 653
 654static void free_domain_mem(void *vaddr)
 655{
 656        kmem_cache_free(iommu_domain_cache, vaddr);
 657}
 658
  659static inline void *alloc_devinfo_mem(void)
 660{
 661        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 662}
 663
 664static inline void free_devinfo_mem(void *vaddr)
 665{
 666        kmem_cache_free(iommu_devinfo_cache, vaddr);
 667}
 668
 669static inline int domain_type_is_vm(struct dmar_domain *domain)
 670{
 671        return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 672}
 673
 674static inline int domain_type_is_si(struct dmar_domain *domain)
 675{
 676        return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 677}
 678
 679static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 680{
 681        return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 682                                DOMAIN_FLAG_STATIC_IDENTITY);
 683}
 684
 685static inline int domain_pfn_supported(struct dmar_domain *domain,
 686                                       unsigned long pfn)
 687{
 688        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 689
 690        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 691}
 692
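/*
 * Find the highest agaw, no wider than max_gaw, that the iommu advertises
 * in its SAGAW capability field.
 */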
 693static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 694{
 695        unsigned long sagaw;
 696        int agaw = -1;
 697
 698        sagaw = cap_sagaw(iommu->cap);
 699        for (agaw = width_to_agaw(max_gaw);
 700             agaw >= 0; agaw--) {
 701                if (test_bit(agaw, &sagaw))
 702                        break;
 703        }
 704
 705        return agaw;
 706}
 707
 708/*
 709 * Calculate max SAGAW for each iommu.
 710 */
 711int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 712{
 713        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 714}
 715
 716/*
  717 * Calculate agaw for each iommu.
  718 * "SAGAW" may be different across iommus; use a default agaw, and
  719 * fall back to a lower supported agaw for iommus that don't support the default.
 720 */
 721int iommu_calculate_agaw(struct intel_iommu *iommu)
 722{
 723        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 724}
 725
  726/* This function only returns a single iommu in a domain */
 727static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 728{
 729        int iommu_id;
 730
 731        /* si_domain and vm domain should not get here. */
 732        BUG_ON(domain_type_is_vm_or_si(domain));
 733        for_each_domain_iommu(iommu_id, domain)
 734                break;
 735
 736        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 737                return NULL;
 738
 739        return g_iommus[iommu_id];
 740}
 741
 742static void domain_update_iommu_coherency(struct dmar_domain *domain)
 743{
 744        struct dmar_drhd_unit *drhd;
 745        struct intel_iommu *iommu;
 746        bool found = false;
 747        int i;
 748
 749        domain->iommu_coherency = 1;
 750
 751        for_each_domain_iommu(i, domain) {
 752                found = true;
 753                if (!ecap_coherent(g_iommus[i]->ecap)) {
 754                        domain->iommu_coherency = 0;
 755                        break;
 756                }
 757        }
 758        if (found)
 759                return;
 760
 761        /* No hardware attached; use lowest common denominator */
 762        rcu_read_lock();
 763        for_each_active_iommu(iommu, drhd) {
 764                if (!ecap_coherent(iommu->ecap)) {
 765                        domain->iommu_coherency = 0;
 766                        break;
 767                }
 768        }
 769        rcu_read_unlock();
 770}
 771
 772static int domain_update_iommu_snooping(struct intel_iommu *skip)
 773{
 774        struct dmar_drhd_unit *drhd;
 775        struct intel_iommu *iommu;
 776        int ret = 1;
 777
 778        rcu_read_lock();
 779        for_each_active_iommu(iommu, drhd) {
 780                if (iommu != skip) {
 781                        if (!ecap_sc_support(iommu->ecap)) {
 782                                ret = 0;
 783                                break;
 784                        }
 785                }
 786        }
 787        rcu_read_unlock();
 788
 789        return ret;
 790}
 791
 792static int domain_update_iommu_superpage(struct intel_iommu *skip)
 793{
 794        struct dmar_drhd_unit *drhd;
 795        struct intel_iommu *iommu;
 796        int mask = 0xf;
 797
 798        if (!intel_iommu_superpage) {
 799                return 0;
 800        }
 801
 802        /* set iommu_superpage to the smallest common denominator */
 803        rcu_read_lock();
 804        for_each_active_iommu(iommu, drhd) {
 805                if (iommu != skip) {
 806                        mask &= cap_super_page_val(iommu->cap);
 807                        if (!mask)
 808                                break;
 809                }
 810        }
 811        rcu_read_unlock();
 812
 813        return fls(mask);
 814}
 815
 816/* Some capabilities may be different across iommus */
 817static void domain_update_iommu_cap(struct dmar_domain *domain)
 818{
 819        domain_update_iommu_coherency(domain);
 820        domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 821        domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 822}
 823
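/*
 * Return the context entry for (bus, devfn), allocating the context table
 * page if @alloc is set. With extended context support (ECS) each entry is
 * twice as large, so a context table only covers 128 devfns and the upper
 * half of the root entry points to the table for devfns 0x80-0xff.
 */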
 824static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 825                                                       u8 bus, u8 devfn, int alloc)
 826{
 827        struct root_entry *root = &iommu->root_entry[bus];
 828        struct context_entry *context;
 829        u64 *entry;
 830
 831        entry = &root->lo;
 832        if (ecs_enabled(iommu)) {
 833                if (devfn >= 0x80) {
 834                        devfn -= 0x80;
 835                        entry = &root->hi;
 836                }
 837                devfn *= 2;
 838        }
 839        if (*entry & 1)
 840                context = phys_to_virt(*entry & VTD_PAGE_MASK);
 841        else {
 842                unsigned long phy_addr;
 843                if (!alloc)
 844                        return NULL;
 845
 846                context = alloc_pgtable_page(iommu->node);
 847                if (!context)
 848                        return NULL;
 849
 850                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 851                phy_addr = virt_to_phys((void *)context);
 852                *entry = phy_addr | 1;
 853                __iommu_flush_cache(iommu, entry, sizeof(*entry));
 854        }
 855        return &context[devfn];
 856}
 857
 858static int iommu_dummy(struct device *dev)
 859{
 860        return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 861}
 862
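/*
 * Find the IOMMU (DRHD unit) that covers @dev and report the bus/devfn to
 * use for context-table lookups. VFs are matched through their PF, ACPI
 * devices through their companion device.
 */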
 863static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 864{
 865        struct dmar_drhd_unit *drhd = NULL;
 866        struct intel_iommu *iommu;
 867        struct device *tmp;
 868        struct pci_dev *ptmp, *pdev = NULL;
 869        u16 segment = 0;
 870        int i;
 871
 872        if (iommu_dummy(dev))
 873                return NULL;
 874
 875        if (dev_is_pci(dev)) {
 876                struct pci_dev *pf_pdev;
 877
 878                pdev = to_pci_dev(dev);
 879
 880#ifdef CONFIG_X86
 881                /* VMD child devices currently cannot be handled individually */
 882                if (is_vmd(pdev->bus))
 883                        return NULL;
 884#endif
 885
 886                /* VFs aren't listed in scope tables; we need to look up
 887                 * the PF instead to find the IOMMU. */
 888                pf_pdev = pci_physfn(pdev);
 889                dev = &pf_pdev->dev;
 890                segment = pci_domain_nr(pdev->bus);
 891        } else if (has_acpi_companion(dev))
 892                dev = &ACPI_COMPANION(dev)->dev;
 893
 894        rcu_read_lock();
 895        for_each_active_iommu(iommu, drhd) {
 896                if (pdev && segment != drhd->segment)
 897                        continue;
 898
 899                for_each_active_dev_scope(drhd->devices,
 900                                          drhd->devices_cnt, i, tmp) {
 901                        if (tmp == dev) {
 902                                /* For a VF use its original BDF# not that of the PF
 903                                 * which we used for the IOMMU lookup. Strictly speaking
 904                                 * we could do this for all PCI devices; we only need to
 905                                 * get the BDF# from the scope table for ACPI matches. */
 906                                if (pdev && pdev->is_virtfn)
 907                                        goto got_pdev;
 908
 909                                *bus = drhd->devices[i].bus;
 910                                *devfn = drhd->devices[i].devfn;
 911                                goto out;
 912                        }
 913
 914                        if (!pdev || !dev_is_pci(tmp))
 915                                continue;
 916
 917                        ptmp = to_pci_dev(tmp);
 918                        if (ptmp->subordinate &&
 919                            ptmp->subordinate->number <= pdev->bus->number &&
 920                            ptmp->subordinate->busn_res.end >= pdev->bus->number)
 921                                goto got_pdev;
 922                }
 923
 924                if (pdev && drhd->include_all) {
 925                got_pdev:
 926                        *bus = pdev->bus->number;
 927                        *devfn = pdev->devfn;
 928                        goto out;
 929                }
 930        }
 931        iommu = NULL;
 932 out:
 933        rcu_read_unlock();
 934
 935        return iommu;
 936}
 937
 938static void domain_flush_cache(struct dmar_domain *domain,
 939                               void *addr, int size)
 940{
 941        if (!domain->iommu_coherency)
 942                clflush_cache_range(addr, size);
 943}
 944
 945static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 946{
 947        struct context_entry *context;
 948        int ret = 0;
 949        unsigned long flags;
 950
 951        spin_lock_irqsave(&iommu->lock, flags);
 952        context = iommu_context_addr(iommu, bus, devfn, 0);
 953        if (context)
 954                ret = context_present(context);
 955        spin_unlock_irqrestore(&iommu->lock, flags);
 956        return ret;
 957}
 958
 959static void free_context_table(struct intel_iommu *iommu)
 960{
 961        int i;
 962        unsigned long flags;
 963        struct context_entry *context;
 964
 965        spin_lock_irqsave(&iommu->lock, flags);
 966        if (!iommu->root_entry) {
 967                goto out;
 968        }
 969        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 970                context = iommu_context_addr(iommu, i, 0, 0);
 971                if (context)
 972                        free_pgtable_page(context);
 973
 974                if (!ecs_enabled(iommu))
 975                        continue;
 976
 977                context = iommu_context_addr(iommu, i, 0x80, 0);
 978                if (context)
 979                        free_pgtable_page(context);
 980
 981        }
 982        free_pgtable_page(iommu->root_entry);
 983        iommu->root_entry = NULL;
 984out:
 985        spin_unlock_irqrestore(&iommu->lock, flags);
 986}
 987
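/*
 * Walk the page table to the PTE that maps @pfn, allocating intermediate
 * levels as needed. A *target_level of 0 means "whatever level is already
 * present"; the level actually reached is written back to *target_level.
 */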
 988static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 989                                      unsigned long pfn, int *target_level)
 990{
 991        struct dma_pte *parent, *pte = NULL;
 992        int level = agaw_to_level(domain->agaw);
 993        int offset;
 994
 995        BUG_ON(!domain->pgd);
 996
 997        if (!domain_pfn_supported(domain, pfn))
 998                /* Address beyond IOMMU's addressing capabilities. */
 999                return NULL;
1000
1001        parent = domain->pgd;
1002
1003        while (1) {
1004                void *tmp_page;
1005
1006                offset = pfn_level_offset(pfn, level);
1007                pte = &parent[offset];
1008                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1009                        break;
1010                if (level == *target_level)
1011                        break;
1012
1013                if (!dma_pte_present(pte)) {
1014                        uint64_t pteval;
1015
1016                        tmp_page = alloc_pgtable_page(domain->nid);
1017
1018                        if (!tmp_page)
1019                                return NULL;
1020
1021                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1022                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1023                        if (cmpxchg64(&pte->val, 0ULL, pteval))
1024                                /* Someone else set it while we were thinking; use theirs. */
1025                                free_pgtable_page(tmp_page);
1026                        else
1027                                domain_flush_cache(domain, pte, sizeof(*pte));
1028                }
1029                if (level == 1)
1030                        break;
1031
1032                parent = phys_to_virt(dma_pte_addr(pte));
1033                level--;
1034        }
1035
1036        if (!*target_level)
1037                *target_level = level;
1038
1039        return pte;
1040}
1041
1042
1043/* return address's pte at specific level */
1044static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1045                                         unsigned long pfn,
1046                                         int level, int *large_page)
1047{
1048        struct dma_pte *parent, *pte = NULL;
1049        int total = agaw_to_level(domain->agaw);
1050        int offset;
1051
1052        parent = domain->pgd;
1053        while (level <= total) {
1054                offset = pfn_level_offset(pfn, total);
1055                pte = &parent[offset];
1056                if (level == total)
1057                        return pte;
1058
1059                if (!dma_pte_present(pte)) {
1060                        *large_page = total;
1061                        break;
1062                }
1063
1064                if (dma_pte_superpage(pte)) {
1065                        *large_page = total;
1066                        return pte;
1067                }
1068
1069                parent = phys_to_virt(dma_pte_addr(pte));
1070                total--;
1071        }
1072        return NULL;
1073}
1074
 1075/* clear last level pte; a tlb flush should follow */
1076static void dma_pte_clear_range(struct dmar_domain *domain,
1077                                unsigned long start_pfn,
1078                                unsigned long last_pfn)
1079{
1080        unsigned int large_page = 1;
1081        struct dma_pte *first_pte, *pte;
1082
1083        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1084        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1085        BUG_ON(start_pfn > last_pfn);
1086
1087        /* we don't need lock here; nobody else touches the iova range */
1088        do {
1089                large_page = 1;
1090                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1091                if (!pte) {
1092                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1093                        continue;
1094                }
1095                do {
1096                        dma_clear_pte(pte);
1097                        start_pfn += lvl_to_nr_pages(large_page);
1098                        pte++;
1099                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1100
1101                domain_flush_cache(domain, first_pte,
1102                                   (void *)pte - (void *)first_pte);
1103
1104        } while (start_pfn && start_pfn <= last_pfn);
1105}
1106
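/*
 * Helper for dma_pte_free_pagetable(): recursively free page-table pages
 * below @retain_level that are entirely covered by [start_pfn, last_pfn].
 */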
1107static void dma_pte_free_level(struct dmar_domain *domain, int level,
1108                               int retain_level, struct dma_pte *pte,
1109                               unsigned long pfn, unsigned long start_pfn,
1110                               unsigned long last_pfn)
1111{
1112        pfn = max(start_pfn, pfn);
1113        pte = &pte[pfn_level_offset(pfn, level)];
1114
1115        do {
1116                unsigned long level_pfn;
1117                struct dma_pte *level_pte;
1118
1119                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1120                        goto next;
1121
1122                level_pfn = pfn & level_mask(level);
1123                level_pte = phys_to_virt(dma_pte_addr(pte));
1124
1125                if (level > 2) {
1126                        dma_pte_free_level(domain, level - 1, retain_level,
1127                                           level_pte, level_pfn, start_pfn,
1128                                           last_pfn);
1129                }
1130
1131                /*
1132                 * Free the page table if we're below the level we want to
1133                 * retain and the range covers the entire table.
1134                 */
1135                if (level < retain_level && !(start_pfn > level_pfn ||
1136                      last_pfn < level_pfn + level_size(level) - 1)) {
1137                        dma_clear_pte(pte);
1138                        domain_flush_cache(domain, pte, sizeof(*pte));
1139                        free_pgtable_page(level_pte);
1140                }
1141next:
1142                pfn += level_size(level);
1143        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1144}
1145
1146/*
1147 * clear last level (leaf) ptes and free page table pages below the
1148 * level we wish to keep intact.
1149 */
1150static void dma_pte_free_pagetable(struct dmar_domain *domain,
1151                                   unsigned long start_pfn,
1152                                   unsigned long last_pfn,
1153                                   int retain_level)
1154{
1155        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1156        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1157        BUG_ON(start_pfn > last_pfn);
1158
1159        dma_pte_clear_range(domain, start_pfn, last_pfn);
1160
1161        /* We don't need lock here; nobody else touches the iova range */
1162        dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1163                           domain->pgd, 0, start_pfn, last_pfn);
1164
1165        /* free pgd */
1166        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1167                free_pgtable_page(domain->pgd);
1168                domain->pgd = NULL;
1169        }
1170}
1171
1172/* When a page at a given level is being unlinked from its parent, we don't
1173   need to *modify* it at all. All we need to do is make a list of all the
1174   pages which can be freed just as soon as we've flushed the IOTLB and we
1175   know the hardware page-walk will no longer touch them.
1176   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1177   be freed. */
1178static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1179                                            int level, struct dma_pte *pte,
1180                                            struct page *freelist)
1181{
1182        struct page *pg;
1183
1184        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1185        pg->freelist = freelist;
1186        freelist = pg;
1187
1188        if (level == 1)
1189                return freelist;
1190
1191        pte = page_address(pg);
1192        do {
1193                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1194                        freelist = dma_pte_list_pagetables(domain, level - 1,
1195                                                           pte, freelist);
1196                pte++;
1197        } while (!first_pte_in_page(pte));
1198
1199        return freelist;
1200}
1201
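/*
 * Clear PTEs in [start_pfn, last_pfn] and chain any page-table pages that
 * become entirely unused onto @freelist, to be freed once the IOTLB has
 * been flushed.
 */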
1202static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1203                                        struct dma_pte *pte, unsigned long pfn,
1204                                        unsigned long start_pfn,
1205                                        unsigned long last_pfn,
1206                                        struct page *freelist)
1207{
1208        struct dma_pte *first_pte = NULL, *last_pte = NULL;
1209
1210        pfn = max(start_pfn, pfn);
1211        pte = &pte[pfn_level_offset(pfn, level)];
1212
1213        do {
1214                unsigned long level_pfn;
1215
1216                if (!dma_pte_present(pte))
1217                        goto next;
1218
1219                level_pfn = pfn & level_mask(level);
1220
1221                /* If range covers entire pagetable, free it */
1222                if (start_pfn <= level_pfn &&
1223                    last_pfn >= level_pfn + level_size(level) - 1) {
 1224                        /* These subordinate page tables are going away entirely. Don't
1225                           bother to clear them; we're just going to *free* them. */
1226                        if (level > 1 && !dma_pte_superpage(pte))
1227                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1228
1229                        dma_clear_pte(pte);
1230                        if (!first_pte)
1231                                first_pte = pte;
1232                        last_pte = pte;
1233                } else if (level > 1) {
1234                        /* Recurse down into a level that isn't *entirely* obsolete */
1235                        freelist = dma_pte_clear_level(domain, level - 1,
1236                                                       phys_to_virt(dma_pte_addr(pte)),
1237                                                       level_pfn, start_pfn, last_pfn,
1238                                                       freelist);
1239                }
1240next:
1241                pfn += level_size(level);
1242        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1243
1244        if (first_pte)
1245                domain_flush_cache(domain, first_pte,
1246                                   (void *)++last_pte - (void *)first_pte);
1247
1248        return freelist;
1249}
1250
1251/* We can't just free the pages because the IOMMU may still be walking
1252   the page tables, and may have cached the intermediate levels. The
1253   pages can only be freed after the IOTLB flush has been done. */
1254static struct page *domain_unmap(struct dmar_domain *domain,
1255                                 unsigned long start_pfn,
1256                                 unsigned long last_pfn)
1257{
1258        struct page *freelist = NULL;
1259
1260        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1261        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1262        BUG_ON(start_pfn > last_pfn);
1263
1264        /* we don't need lock here; nobody else touches the iova range */
1265        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1266                                       domain->pgd, 0, start_pfn, last_pfn, NULL);
1267
1268        /* free pgd */
1269        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1270                struct page *pgd_page = virt_to_page(domain->pgd);
1271                pgd_page->freelist = freelist;
1272                freelist = pgd_page;
1273
1274                domain->pgd = NULL;
1275        }
1276
1277        return freelist;
1278}
1279
1280static void dma_free_pagelist(struct page *freelist)
1281{
1282        struct page *pg;
1283
1284        while ((pg = freelist)) {
1285                freelist = pg->freelist;
1286                free_pgtable_page(page_address(pg));
1287        }
1288}
1289
1290static void iova_entry_free(unsigned long data)
1291{
1292        struct page *freelist = (struct page *)data;
1293
1294        dma_free_pagelist(freelist);
1295}
1296
1297/* iommu handling */
1298static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1299{
1300        struct root_entry *root;
1301        unsigned long flags;
1302
1303        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1304        if (!root) {
1305                pr_err("Allocating root entry for %s failed\n",
1306                        iommu->name);
1307                return -ENOMEM;
1308        }
1309
1310        __iommu_flush_cache(iommu, root, ROOT_SIZE);
1311
1312        spin_lock_irqsave(&iommu->lock, flags);
1313        iommu->root_entry = root;
1314        spin_unlock_irqrestore(&iommu->lock, flags);
1315
1316        return 0;
1317}
1318
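/*
 * Program the root table address (setting the extended root-table flag when
 * ECS is in use) into DMAR_RTADDR_REG and latch it with the SRTP command.
 */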
1319static void iommu_set_root_entry(struct intel_iommu *iommu)
1320{
1321        u64 addr;
1322        u32 sts;
1323        unsigned long flag;
1324
1325        addr = virt_to_phys(iommu->root_entry);
1326        if (ecs_enabled(iommu))
1327                addr |= DMA_RTADDR_RTT;
1328
1329        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1330        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1331
1332        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1333
 1334        /* Make sure hardware completes it */
1335        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1336                      readl, (sts & DMA_GSTS_RTPS), sts);
1337
1338        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339}
1340
1341static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1342{
1343        u32 val;
1344        unsigned long flag;
1345
1346        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1347                return;
1348
1349        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1350        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1351
 1352        /* Make sure hardware completes it */
1353        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1354                      readl, (!(val & DMA_GSTS_WBFS)), val);
1355
1356        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1357}
1358
 1359/* return value determines if we need a write buffer flush */
1360static void __iommu_flush_context(struct intel_iommu *iommu,
1361                                  u16 did, u16 source_id, u8 function_mask,
1362                                  u64 type)
1363{
1364        u64 val = 0;
1365        unsigned long flag;
1366
1367        switch (type) {
1368        case DMA_CCMD_GLOBAL_INVL:
1369                val = DMA_CCMD_GLOBAL_INVL;
1370                break;
1371        case DMA_CCMD_DOMAIN_INVL:
1372                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1373                break;
1374        case DMA_CCMD_DEVICE_INVL:
1375                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1376                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1377                break;
1378        default:
1379                BUG();
1380        }
1381        val |= DMA_CCMD_ICC;
1382
1383        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1384        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1385
 1386        /* Make sure hardware completes it */
1387        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1388                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1389
1390        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1391}
1392
 1393/* return value determines if we need a write buffer flush */
1394static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1395                                u64 addr, unsigned int size_order, u64 type)
1396{
1397        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1398        u64 val = 0, val_iva = 0;
1399        unsigned long flag;
1400
1401        switch (type) {
1402        case DMA_TLB_GLOBAL_FLUSH:
 1403                /* global flush doesn't need to set IVA_REG */
1404                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1405                break;
1406        case DMA_TLB_DSI_FLUSH:
1407                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1408                break;
1409        case DMA_TLB_PSI_FLUSH:
1410                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1411                /* IH bit is passed in as part of address */
1412                val_iva = size_order | addr;
1413                break;
1414        default:
1415                BUG();
1416        }
1417        /* Note: set drain read/write */
1418#if 0
1419        /*
 1420         * This is probably just to be extra safe; it looks like we can
 1421         * ignore it without any impact.
1422         */
1423        if (cap_read_drain(iommu->cap))
1424                val |= DMA_TLB_READ_DRAIN;
1425#endif
1426        if (cap_write_drain(iommu->cap))
1427                val |= DMA_TLB_WRITE_DRAIN;
1428
1429        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1430        /* Note: Only uses first TLB reg currently */
1431        if (val_iva)
1432                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1433        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1434
 1435        /* Make sure hardware completes it */
1436        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1437                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1438
1439        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1440
1441        /* check IOTLB invalidation granularity */
1442        if (DMA_TLB_IAIG(val) == 0)
1443                pr_err("Flush IOTLB failed\n");
1444        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1445                pr_debug("TLB flush request %Lx, actual %Lx\n",
1446                        (unsigned long long)DMA_TLB_IIRG(type),
1447                        (unsigned long long)DMA_TLB_IAIG(val));
1448}
1449
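/*
 * Return the device_domain_info for (iommu, bus, devfn) in @domain if the
 * device supports ATS (i.e. has a device IOTLB), or NULL otherwise.
 */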
1450static struct device_domain_info *
 1451iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1452                         u8 bus, u8 devfn)
1453{
1454        struct device_domain_info *info;
1455
1456        assert_spin_locked(&device_domain_lock);
1457
1458        if (!iommu->qi)
1459                return NULL;
1460
1461        list_for_each_entry(info, &domain->devices, link)
1462                if (info->iommu == iommu && info->bus == bus &&
1463                    info->devfn == devfn) {
1464                        if (info->ats_supported && info->dev)
1465                                return info;
1466                        break;
1467                }
1468
1469        return NULL;
1470}
1471
1472static void domain_update_iotlb(struct dmar_domain *domain)
1473{
1474        struct device_domain_info *info;
1475        bool has_iotlb_device = false;
1476
1477        assert_spin_locked(&device_domain_lock);
1478
1479        list_for_each_entry(info, &domain->devices, link) {
1480                struct pci_dev *pdev;
1481
1482                if (!info->dev || !dev_is_pci(info->dev))
1483                        continue;
1484
1485                pdev = to_pci_dev(info->dev);
1486                if (pdev->ats_enabled) {
1487                        has_iotlb_device = true;
1488                        break;
1489                }
1490        }
1491
1492        domain->has_iotlb_device = has_iotlb_device;
1493}
1494
1495static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1496{
1497        struct pci_dev *pdev;
1498
1499        assert_spin_locked(&device_domain_lock);
1500
1501        if (!info || !dev_is_pci(info->dev))
1502                return;
1503
1504        pdev = to_pci_dev(info->dev);
1505
1506#ifdef CONFIG_INTEL_IOMMU_SVM
1507        /* The PCIe spec, in its wisdom, declares that the behaviour of
1508           the device if you enable PASID support after ATS support is
1509           undefined. So always enable PASID support on devices which
1510           have it, even if we can't yet know if we're ever going to
1511           use it. */
1512        if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1513                info->pasid_enabled = 1;
1514
1515        if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1516                info->pri_enabled = 1;
1517#endif
1518        if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1519                info->ats_enabled = 1;
1520                domain_update_iotlb(info->domain);
1521                info->ats_qdep = pci_ats_queue_depth(pdev);
1522        }
1523}
1524
1525static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1526{
1527        struct pci_dev *pdev;
1528
1529        assert_spin_locked(&device_domain_lock);
1530
1531        if (!dev_is_pci(info->dev))
1532                return;
1533
1534        pdev = to_pci_dev(info->dev);
1535
1536        if (info->ats_enabled) {
1537                pci_disable_ats(pdev);
1538                info->ats_enabled = 0;
1539                domain_update_iotlb(info->domain);
1540        }
1541#ifdef CONFIG_INTEL_IOMMU_SVM
1542        if (info->pri_enabled) {
1543                pci_disable_pri(pdev);
1544                info->pri_enabled = 0;
1545        }
1546        if (info->pasid_enabled) {
1547                pci_disable_pasid(pdev);
1548                info->pasid_enabled = 0;
1549        }
1550#endif
1551}
1552
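    /*
     * Issue a device-IOTLB (ATS) invalidation covering 2^mask pages at @addr
     * to every ATS-enabled device in @domain. The list walk is skipped
     * entirely when no device in the domain has ATS enabled.
     */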
1553static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1554                                  u64 addr, unsigned mask)
1555{
1556        u16 sid, qdep;
1557        unsigned long flags;
1558        struct device_domain_info *info;
1559
1560        if (!domain->has_iotlb_device)
1561                return;
1562
1563        spin_lock_irqsave(&device_domain_lock, flags);
1564        list_for_each_entry(info, &domain->devices, link) {
1565                if (!info->ats_enabled)
1566                        continue;
1567
1568                sid = info->bus << 8 | info->devfn;
1569                qdep = info->ats_qdep;
1570                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1571        }
1572        spin_unlock_irqrestore(&device_domain_lock, flags);
1573}
1574
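    /*
     * Flush the IOTLB for @pages pages starting at @pfn. A page-selective
     * invalidation (PSI) covers 2^mask pages, so e.g. pages = 3 gives
     * mask = ilog2(roundup_pow_of_two(3)) = 2, i.e. a 4-page flush. If the
     * hardware lacks PSI, or mask exceeds cap_max_amask_val(), fall back to
     * a domain-selective flush instead.
     */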
1575static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1576                                  struct dmar_domain *domain,
1577                                  unsigned long pfn, unsigned int pages,
1578                                  int ih, int map)
1579{
1580        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1581        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1582        u16 did = domain->iommu_did[iommu->seq_id];
1583
1584        BUG_ON(pages == 0);
1585
1586        if (ih)
1587                ih = 1 << 6;
1588        /*
1589         * Fallback to domain selective flush if no PSI support or the size is
1590         * too big.
1591         * PSI requires the page size to be a power of two (2 ^ x), and the base
1592         * address to be naturally aligned to that size.
1593         */
1594        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1595                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1596                                                DMA_TLB_DSI_FLUSH);
1597        else
1598                iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1599                                                DMA_TLB_PSI_FLUSH);
1600
1601        /*
1602         * In caching mode, changing a page from non-present to present requires
1603         * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1604         */
1605        if (!cap_caching_mode(iommu->cap) || !map)
1606                iommu_flush_dev_iotlb(domain, addr, mask);
1607}
1608
1609static void iommu_flush_iova(struct iova_domain *iovad)
1610{
1611        struct dmar_domain *domain;
1612        int idx;
1613
1614        domain = container_of(iovad, struct dmar_domain, iovad);
1615
1616        for_each_domain_iommu(idx, domain) {
1617                struct intel_iommu *iommu = g_iommus[idx];
1618                u16 did = domain->iommu_did[iommu->seq_id];
1619
1620                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1621
1622                if (!cap_caching_mode(iommu->cap))
1623                        iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1624                                              0, MAX_AGAW_PFN_WIDTH);
1625        }
1626}
1627
1628static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1629{
1630        u32 pmen;
1631        unsigned long flags;
1632
1633        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1635        pmen &= ~DMA_PMEN_EPM;
1636        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1637
1638        /* wait for the protected region status bit to clear */
1639        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1640                readl, !(pmen & DMA_PMEN_PRS), pmen);
1641
1642        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1643}
1644
1645static void iommu_enable_translation(struct intel_iommu *iommu)
1646{
1647        u32 sts;
1648        unsigned long flags;
1649
1650        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1651        iommu->gcmd |= DMA_GCMD_TE;
1652        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1653
1654        /* Make sure the hardware completes it */
1655        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1656                      readl, (sts & DMA_GSTS_TES), sts);
1657
1658        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1659}
1660
1661static void iommu_disable_translation(struct intel_iommu *iommu)
1662{
1663        u32 sts;
1664        unsigned long flag;
1665
1666        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1667        iommu->gcmd &= ~DMA_GCMD_TE;
1668        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1669
1670        /* Make sure the hardware completes it */
1671        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1672                      readl, (!(sts & DMA_GSTS_TES)), sts);
1673
1674        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1675}
1676
1677
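    /*
     * Set up the per-IOMMU domain bookkeeping: a bitmap of domain ids and a
     * two-level iommu->domains array holding 256 domain pointers per chunk,
     * of which only the first chunk is allocated up front. Domain id 0 is
     * reserved: it tags invalid translations in caching mode and doubles as
     * the "not allocated" marker.
     */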
1678static int iommu_init_domains(struct intel_iommu *iommu)
1679{
1680        u32 ndomains, nlongs;
1681        size_t size;
1682
1683        ndomains = cap_ndoms(iommu->cap);
1684        pr_debug("%s: Number of Domains supported <%d>\n",
1685                 iommu->name, ndomains);
1686        nlongs = BITS_TO_LONGS(ndomains);
1687
1688        spin_lock_init(&iommu->lock);
1689
1690        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1691        if (!iommu->domain_ids) {
1692                pr_err("%s: Allocating domain id array failed\n",
1693                       iommu->name);
1694                return -ENOMEM;
1695        }
1696
1697        size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1698        iommu->domains = kzalloc(size, GFP_KERNEL);
1699
1700        if (iommu->domains) {
1701                size = 256 * sizeof(struct dmar_domain *);
1702                iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1703        }
1704
1705        if (!iommu->domains || !iommu->domains[0]) {
1706                pr_err("%s: Allocating domain array failed\n",
1707                       iommu->name);
1708                kfree(iommu->domain_ids);
1709                kfree(iommu->domains);
1710                iommu->domain_ids = NULL;
1711                iommu->domains    = NULL;
1712                return -ENOMEM;
1713        }
1714
1715
1716
1717        /*
1718         * If Caching mode is set, then invalid translations are tagged
1719         * with domain-id 0, hence we need to pre-allocate it. We also
1720         * use domain-id 0 as a marker for non-allocated domain-id, so
1721         * make sure it is not used for a real domain.
1722         */
1723        set_bit(0, iommu->domain_ids);
1724
1725        return 0;
1726}
1727
1728static void disable_dmar_iommu(struct intel_iommu *iommu)
1729{
1730        struct device_domain_info *info, *tmp;
1731        unsigned long flags;
1732
1733        if (!iommu->domains || !iommu->domain_ids)
1734                return;
1735
1736again:
1737        spin_lock_irqsave(&device_domain_lock, flags);
1738        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1739                struct dmar_domain *domain;
1740
1741                if (info->iommu != iommu)
1742                        continue;
1743
1744                if (!info->dev || !info->domain)
1745                        continue;
1746
1747                domain = info->domain;
1748
1749                __dmar_remove_one_dev_info(info);
1750
1751                if (!domain_type_is_vm_or_si(domain)) {
1752                        /*
1753                         * The domain_exit() function can't be called under
1754                         * device_domain_lock, as it takes this lock itself.
1755                         * So release the lock here and re-run the loop
1756                         * afterwards.
1757                         */
1758                        spin_unlock_irqrestore(&device_domain_lock, flags);
1759                        domain_exit(domain);
1760                        goto again;
1761                }
1762        }
1763        spin_unlock_irqrestore(&device_domain_lock, flags);
1764
1765        if (iommu->gcmd & DMA_GCMD_TE)
1766                iommu_disable_translation(iommu);
1767}
1768
1769static void free_dmar_iommu(struct intel_iommu *iommu)
1770{
1771        if ((iommu->domains) && (iommu->domain_ids)) {
1772                int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1773                int i;
1774
1775                for (i = 0; i < elems; i++)
1776                        kfree(iommu->domains[i]);
1777                kfree(iommu->domains);
1778                kfree(iommu->domain_ids);
1779                iommu->domains = NULL;
1780                iommu->domain_ids = NULL;
1781        }
1782
1783        g_iommus[iommu->seq_id] = NULL;
1784
1785        /* free context mapping */
1786        free_context_table(iommu);
1787
1788#ifdef CONFIG_INTEL_IOMMU_SVM
1789        if (pasid_enabled(iommu)) {
1790                if (ecap_prs(iommu->ecap))
1791                        intel_svm_finish_prq(iommu);
1792                intel_svm_free_pasid_tables(iommu);
1793        }
1794#endif
1795}
1796
1797static struct dmar_domain *alloc_domain(int flags)
1798{
1799        struct dmar_domain *domain;
1800
1801        domain = alloc_domain_mem();
1802        if (!domain)
1803                return NULL;
1804
1805        memset(domain, 0, sizeof(*domain));
1806        domain->nid = -1;
1807        domain->flags = flags;
1808        domain->has_iotlb_device = false;
1809        INIT_LIST_HEAD(&domain->devices);
1810
1811        return domain;
1812}
1813
1814/* Must be called with device_domain_lock and iommu->lock held */
1815static int domain_attach_iommu(struct dmar_domain *domain,
1816                               struct intel_iommu *iommu)
1817{
1818        unsigned long ndomains;
1819        int num;
1820
1821        assert_spin_locked(&device_domain_lock);
1822        assert_spin_locked(&iommu->lock);
1823
1824        domain->iommu_refcnt[iommu->seq_id] += 1;
1825        domain->iommu_count += 1;
1826        if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1827                ndomains = cap_ndoms(iommu->cap);
1828                num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1829
1830                if (num >= ndomains) {
1831                        pr_err("%s: No free domain ids\n", iommu->name);
1832                        domain->iommu_refcnt[iommu->seq_id] -= 1;
1833                        domain->iommu_count -= 1;
1834                        return -ENOSPC;
1835                }
1836
1837                set_bit(num, iommu->domain_ids);
1838                set_iommu_domain(iommu, num, domain);
1839
1840                domain->iommu_did[iommu->seq_id] = num;
1841                domain->nid                      = iommu->node;
1842
1843                domain_update_iommu_cap(domain);
1844        }
1845
1846        return 0;
1847}
1848
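    /*
     * Drop @domain's reference on @iommu; once no devices on this IOMMU
     * remain attached, release the domain id back to the id bitmap.
     * Returns the domain's remaining attachment count.
     */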
1849static int domain_detach_iommu(struct dmar_domain *domain,
1850                               struct intel_iommu *iommu)
1851{
1852        int num, count = INT_MAX;
1853
1854        assert_spin_locked(&device_domain_lock);
1855        assert_spin_locked(&iommu->lock);
1856
1857        domain->iommu_refcnt[iommu->seq_id] -= 1;
1858        count = --domain->iommu_count;
1859        if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1860                num = domain->iommu_did[iommu->seq_id];
1861                clear_bit(num, iommu->domain_ids);
1862                set_iommu_domain(iommu, num, NULL);
1863
1864                domain_update_iommu_cap(domain);
1865                domain->iommu_did[iommu->seq_id] = 0;
1866        }
1867
1868        return count;
1869}
1870
1871static struct iova_domain reserved_iova_list;
1872static struct lock_class_key reserved_rbtree_key;
1873
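    /*
     * Build the global list of IOVA ranges that must never be handed out for
     * DMA: the IOAPIC MMIO window and every PCI MMIO resource, so that DMA
     * addresses never alias MMIO and trigger peer-to-peer accesses.
     */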
1874static int dmar_init_reserved_ranges(void)
1875{
1876        struct pci_dev *pdev = NULL;
1877        struct iova *iova;
1878        int i;
1879
1880        init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1881
1882        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1883                &reserved_rbtree_key);
1884
1885        /* IOAPIC ranges shouldn't be accessed by DMA */
1886        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1887                IOVA_PFN(IOAPIC_RANGE_END));
1888        if (!iova) {
1889                pr_err("Reserve IOAPIC range failed\n");
1890                return -ENODEV;
1891        }
1892
1893        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1894        for_each_pci_dev(pdev) {
1895                struct resource *r;
1896
1897                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1898                        r = &pdev->resource[i];
1899                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1900                                continue;
1901                        iova = reserve_iova(&reserved_iova_list,
1902                                            IOVA_PFN(r->start),
1903                                            IOVA_PFN(r->end));
1904                        if (!iova) {
1905                                pr_err("Reserve iova failed\n");
1906                                return -ENODEV;
1907                        }
1908                }
1909        }
1910        return 0;
1911}
1912
1913static void domain_reserve_special_ranges(struct dmar_domain *domain)
1914{
1915        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1916}
1917
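    /*
     * Round a guest address width up to the next width the page-table layout
     * can express, i.e. 12 + 9 * n bits, capped at 64. For example, gaw = 39
     * or 48 is returned unchanged, while gaw = 40 rounds up to 48.
     */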
1918static inline int guestwidth_to_adjustwidth(int gaw)
1919{
1920        int agaw;
1921        int r = (gaw - 12) % 9;
1922
1923        if (r == 0)
1924                agaw = gaw;
1925        else
1926                agaw = gaw + 9 - r;
1927        if (agaw > 64)
1928                agaw = 64;
1929        return agaw;
1930}
1931
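    /*
     * Initialize a freshly allocated DMA-API domain for use with @iommu: set
     * up the IOVA allocator and its flush queue, copy the reserved ranges,
     * pick an AGAW the hardware supports for @guest_width, and allocate the
     * top-level page directory.
     */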
1932static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1933                       int guest_width)
1934{
1935        int adjust_width, agaw;
1936        unsigned long sagaw;
1937        int err;
1938
1939        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1940
1941        err = init_iova_flush_queue(&domain->iovad,
1942                                    iommu_flush_iova, iova_entry_free);
1943        if (err)
1944                return err;
1945
1946        domain_reserve_special_ranges(domain);
1947
1948        /* calculate AGAW */
1949        if (guest_width > cap_mgaw(iommu->cap))
1950                guest_width = cap_mgaw(iommu->cap);
1951        domain->gaw = guest_width;
1952        adjust_width = guestwidth_to_adjustwidth(guest_width);
1953        agaw = width_to_agaw(adjust_width);
1954        sagaw = cap_sagaw(iommu->cap);
1955        if (!test_bit(agaw, &sagaw)) {
1956                /* hardware doesn't support it, choose a bigger one */
1957                pr_debug("Hardware doesn't support agaw %d\n", agaw);
1958                agaw = find_next_bit(&sagaw, 5, agaw);
1959                if (agaw >= 5)
1960                        return -ENODEV;
1961        }
1962        domain->agaw = agaw;
1963
1964        if (ecap_coherent(iommu->ecap))
1965                domain->iommu_coherency = 1;
1966        else
1967                domain->iommu_coherency = 0;
1968
1969        if (ecap_sc_support(iommu->ecap))
1970                domain->iommu_snooping = 1;
1971        else
1972                domain->iommu_snooping = 0;
1973
1974        if (intel_iommu_superpage)
1975                domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1976        else
1977                domain->iommu_superpage = 0;
1978
1979        domain->nid = iommu->node;
1980
1981        /* always allocate the top pgd */
1982        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1983        if (!domain->pgd)
1984                return -ENOMEM;
1985        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1986        return 0;
1987}
1988
1989static void domain_exit(struct dmar_domain *domain)
1990{
1991        struct page *freelist = NULL;
1992
1993        /* Domain 0 is reserved, so don't process it */
1994        if (!domain)
1995                return;
1996
1997        /* Remove associated devices and clear attached or cached domains */
1998        rcu_read_lock();
1999        domain_remove_dev_info(domain);
2000        rcu_read_unlock();
2001
2002        /* destroy iovas */
2003        put_iova_domain(&domain->iovad);
2004
2005        freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2006
2007        dma_free_pagelist(freelist);
2008
2009        free_domain_mem(domain);
2010}
2011
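    /*
     * Install the context entry for (@bus, @devfn) on @iommu so that it
     * points at @domain's page tables (or uses pass-through for the static
     * identity domain when hardware pass-through is in use), then flush the
     * context and IOTLB caches as required in caching mode.
     */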
2012static int domain_context_mapping_one(struct dmar_domain *domain,
2013                                      struct intel_iommu *iommu,
2014                                      u8 bus, u8 devfn)
2015{
2016        u16 did = domain->iommu_did[iommu->seq_id];
2017        int translation = CONTEXT_TT_MULTI_LEVEL;
2018        struct device_domain_info *info = NULL;
2019        struct context_entry *context;
2020        unsigned long flags;
2021        struct dma_pte *pgd;
2022        int ret, agaw;
2023
2024        WARN_ON(did == 0);
2025
2026        if (hw_pass_through && domain_type_is_si(domain))
2027                translation = CONTEXT_TT_PASS_THROUGH;
2028
2029        pr_debug("Set context mapping for %02x:%02x.%d\n",
2030                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2031
2032        BUG_ON(!domain->pgd);
2033
2034        spin_lock_irqsave(&device_domain_lock, flags);
2035        spin_lock(&iommu->lock);
2036
2037        ret = -ENOMEM;
2038        context = iommu_context_addr(iommu, bus, devfn, 1);
2039        if (!context)
2040                goto out_unlock;
2041
2042        ret = 0;
2043        if (context_present(context))
2044                goto out_unlock;
2045
2046        /*
2047         * For kdump cases, old valid entries may be cached due to the
2048         * in-flight DMA and the copied pgtable, but there is no unmapping
2049         * behaviour for them, so we need an explicit cache flush for the
2050         * newly-mapped device. For kdump, at this point, the device is
2051         * supposed to have finished reset at its driver probe stage, so no
2052         * in-flight DMA will exist, and we don't need to worry about it
2053         * hereafter.
2054         */
2055        if (context_copied(context)) {
2056                u16 did_old = context_domain_id(context);
2057
2058                if (did_old < cap_ndoms(iommu->cap)) {
2059                        iommu->flush.flush_context(iommu, did_old,
2060                                                   (((u16)bus) << 8) | devfn,
2061                                                   DMA_CCMD_MASK_NOBIT,
2062                                                   DMA_CCMD_DEVICE_INVL);
2063                        iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2064                                                 DMA_TLB_DSI_FLUSH);
2065                }
2066        }
2067
2068        pgd = domain->pgd;
2069
2070        context_clear_entry(context);
2071        context_set_domain_id(context, did);
2072
2073        /*
2074         * Skip top levels of page tables for iommu which has less agaw
2075         * than default.  Unnecessary for PT mode.
2076         */
2077        if (translation != CONTEXT_TT_PASS_THROUGH) {
2078                for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2079                        ret = -ENOMEM;
2080                        pgd = phys_to_virt(dma_pte_addr(pgd));
2081                        if (!dma_pte_present(pgd))
2082                                goto out_unlock;
2083                }
2084
2085                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2086                if (info && info->ats_supported)
2087                        translation = CONTEXT_TT_DEV_IOTLB;
2088                else
2089                        translation = CONTEXT_TT_MULTI_LEVEL;
2090
2091                context_set_address_root(context, virt_to_phys(pgd));
2092                context_set_address_width(context, iommu->agaw);
2093        } else {
2094                /*
2095                 * In pass through mode, AW must be programmed to
2096                 * indicate the largest AGAW value supported by
2097                 * hardware. And ASR is ignored by hardware.
2098                 */
2099                context_set_address_width(context, iommu->msagaw);
2100        }
2101
2102        context_set_translation_type(context, translation);
2103        context_set_fault_enable(context);
2104        context_set_present(context);
2105        domain_flush_cache(domain, context, sizeof(*context));
2106
2107        /*
2108         * It's a non-present to present mapping. If hardware doesn't cache
2109         * non-present entries we only need to flush the write-buffer. If it
2110         * _does_ cache non-present entries, then it does so in the special
2111         * domain #0, which we have to flush:
2112         */
2113        if (cap_caching_mode(iommu->cap)) {
2114                iommu->flush.flush_context(iommu, 0,
2115                                           (((u16)bus) << 8) | devfn,
2116                                           DMA_CCMD_MASK_NOBIT,
2117                                           DMA_CCMD_DEVICE_INVL);
2118                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2119        } else {
2120                iommu_flush_write_buffer(iommu);
2121        }
2122        iommu_enable_dev_iotlb(info);
2123
2124        ret = 0;
2125
2126out_unlock:
2127        spin_unlock(&iommu->lock);
2128        spin_unlock_irqrestore(&device_domain_lock, flags);
2129
2130        return ret;
2131}
2132
2133struct domain_context_mapping_data {
2134        struct dmar_domain *domain;
2135        struct intel_iommu *iommu;
2136};
2137
2138static int domain_context_mapping_cb(struct pci_dev *pdev,
2139                                     u16 alias, void *opaque)
2140{
2141        struct domain_context_mapping_data *data = opaque;
2142
2143        return domain_context_mapping_one(data->domain, data->iommu,
2144                                          PCI_BUS_NUM(alias), alias & 0xff);
2145}
2146
2147static int
2148domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2149{
2150        struct intel_iommu *iommu;
2151        u8 bus, devfn;
2152        struct domain_context_mapping_data data;
2153
2154        iommu = device_to_iommu(dev, &bus, &devfn);
2155        if (!iommu)
2156                return -ENODEV;
2157
2158        if (!dev_is_pci(dev))
2159                return domain_context_mapping_one(domain, iommu, bus, devfn);
2160
2161        data.domain = domain;
2162        data.iommu = iommu;
2163
2164        return pci_for_each_dma_alias(to_pci_dev(dev),
2165                                      &domain_context_mapping_cb, &data);
2166}
2167
2168static int domain_context_mapped_cb(struct pci_dev *pdev,
2169                                    u16 alias, void *opaque)
2170{
2171        struct intel_iommu *iommu = opaque;
2172
2173        return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2174}
2175
2176static int domain_context_mapped(struct device *dev)
2177{
2178        struct intel_iommu *iommu;
2179        u8 bus, devfn;
2180
2181        iommu = device_to_iommu(dev, &bus, &devfn);
2182        if (!iommu)
2183                return -ENODEV;
2184
2185        if (!dev_is_pci(dev))
2186                return device_context_mapped(iommu, bus, devfn);
2187
2188        return !pci_for_each_dma_alias(to_pci_dev(dev),
2189                                       domain_context_mapped_cb, iommu);
2190}
2191
2192/* Returns a number of VTD pages, but aligned to MM page size */
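    /* e.g. with 4KiB MM pages, offset 0x100 and size 0x1000 give
       PAGE_ALIGN(0x1100) = 0x2000, i.e. two 4KiB VT-d pages. */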
2193static inline unsigned long aligned_nrpages(unsigned long host_addr,
2194                                            size_t size)
2195{
2196        host_addr &= ~PAGE_MASK;
2197        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2198}
2199
2200/* Return largest possible superpage level for a given mapping */
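    /* Level 1 is 4KiB pages, level 2 is 2MiB and level 3 is 1GiB. A higher
       level is only returned when the IOMMU supports it, both iov_pfn and
       phy_pfn are aligned to that superpage size, and enough pages remain
       to fill at least one superpage. */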
2201static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202                                          unsigned long iov_pfn,
2203                                          unsigned long phy_pfn,
2204                                          unsigned long pages)
2205{
2206        int support, level = 1;
2207        unsigned long pfnmerge;
2208
2209        support = domain->iommu_superpage;
2210
2211        /* To use a large page, the virtual *and* physical addresses
2212           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213           of them will mean we have to use smaller pages. So just
2214           merge them and check both at once. */
2215        pfnmerge = iov_pfn | phy_pfn;
2216
2217        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218                pages >>= VTD_STRIDE_SHIFT;
2219                if (!pages)
2220                        break;
2221                pfnmerge >>= VTD_STRIDE_SHIFT;
2222                level++;
2223                support--;
2224        }
2225        return level;
2226}
2227
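    /*
     * Core mapping routine: map @nr_pages VT-d pages starting at @iov_pfn,
     * either from the scatterlist @sg or, when @sg is NULL, from the
     * contiguous physical range starting at @phys_pfn. Superpages are used
     * whenever size and alignment allow, and the written PTEs are flushed
     * whenever a PTE page is filled or the mapping completes.
     */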
2228static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2229                            struct scatterlist *sg, unsigned long phys_pfn,
2230                            unsigned long nr_pages, int prot)
2231{
2232        struct dma_pte *first_pte = NULL, *pte = NULL;
2233        phys_addr_t uninitialized_var(pteval);
2234        unsigned long sg_res = 0;
2235        unsigned int largepage_lvl = 0;
2236        unsigned long lvl_pages = 0;
2237
2238        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2239
2240        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2241                return -EINVAL;
2242
2243        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2244
2245        if (!sg) {
2246                sg_res = nr_pages;
2247                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2248        }
2249
2250        while (nr_pages > 0) {
2251                uint64_t tmp;
2252
2253                if (!sg_res) {
2254                        unsigned int pgoff = sg->offset & ~PAGE_MASK;
2255
2256                        sg_res = aligned_nrpages(sg->offset, sg->length);
2257                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2258                        sg->dma_length = sg->length;
2259                        pteval = (sg_phys(sg) - pgoff) | prot;
2260                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
2261                }
2262
2263                if (!pte) {
2264                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2265
2266                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2267                        if (!pte)
2268                                return -ENOMEM;
2269                        /* It is a large page */
2270                        if (largepage_lvl > 1) {
2271                                unsigned long nr_superpages, end_pfn;
2272
2273                                pteval |= DMA_PTE_LARGE_PAGE;
2274                                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2275
2276                                nr_superpages = sg_res / lvl_pages;
2277                                end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2278
2279                                /*
2280                                 * Ensure that old small page tables are
2281                                 * removed to make room for superpage(s).
2282                                 * We're adding new large pages, so make sure
2283                                 * we don't remove their parent tables.
2284                                 */
2285                                dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2286                                                       largepage_lvl + 1);
2287                        } else {
2288                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289                        }
2290
2291                }
2292                /* We don't need a lock here; nobody else
2293                 * touches this iova range.
2294                 */
2295                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2296                if (tmp) {
2297                        static int dumps = 5;
2298                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2299                                iov_pfn, tmp, (unsigned long long)pteval);
2300                        if (dumps) {
2301                                dumps--;
2302                                debug_dma_dump_mappings(NULL);
2303                        }
2304                        WARN_ON(1);
2305                }
2306
2307                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2308
2309                BUG_ON(nr_pages < lvl_pages);
2310                BUG_ON(sg_res < lvl_pages);
2311
2312                nr_pages -= lvl_pages;
2313                iov_pfn += lvl_pages;
2314                phys_pfn += lvl_pages;
2315                pteval += lvl_pages * VTD_PAGE_SIZE;
2316                sg_res -= lvl_pages;
2317
2318                /* If the next PTE would be the first in a new page, then we
2319                   need to flush the cache on the entries we've just written.
2320                   And then we'll need to recalculate 'pte', so clear it and
2321                   let it get set again in the if (!pte) block above.
2322
2323                   If we're done (!nr_pages) we need to flush the cache too.
2324
2325                   Also if we've been setting superpages, we may need to
2326                   recalculate 'pte' and switch back to smaller pages for the
2327                   end of the mapping, if the trailing size is not enough to
2328                   use another superpage (i.e. sg_res < lvl_pages). */
2329                pte++;
2330                if (!nr_pages || first_pte_in_page(pte) ||
2331                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2332                        domain_flush_cache(domain, first_pte,
2333                                           (void *)pte - (void *)first_pte);
2334                        pte = NULL;
2335                }
2336
2337                if (!sg_res && nr_pages)
2338                        sg = sg_next(sg);
2339        }
2340        return 0;
2341}
2342
2343static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2344                                    struct scatterlist *sg, unsigned long nr_pages,
2345                                    int prot)
2346{
2347        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2348}
2349
2350static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2351                                     unsigned long phys_pfn, unsigned long nr_pages,
2352                                     int prot)
2353{
2354        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2355}
2356
2357static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2358{
2359        unsigned long flags;
2360        struct context_entry *context;
2361        u16 did_old;
2362
2363        if (!iommu)
2364                return;
2365
2366        spin_lock_irqsave(&iommu->lock, flags);
2367        context = iommu_context_addr(iommu, bus, devfn, 0);
2368        if (!context) {
2369                spin_unlock_irqrestore(&iommu->lock, flags);
2370                return;
2371        }
2372        did_old = context_domain_id(context);
2373        context_clear_entry(context);
2374        __iommu_flush_cache(iommu, context, sizeof(*context));
2375        spin_unlock_irqrestore(&iommu->lock, flags);
2376        iommu->flush.flush_context(iommu,
2377                                   did_old,
2378                                   (((u16)bus) << 8) | devfn,
2379                                   DMA_CCMD_MASK_NOBIT,
2380                                   DMA_CCMD_DEVICE_INVL);
2381        iommu->flush.flush_iotlb(iommu,
2382                                 did_old,
2383                                 0,
2384                                 0,
2385                                 DMA_TLB_DSI_FLUSH);
2386}
2387
2388static inline void unlink_domain_info(struct device_domain_info *info)
2389{
2390        assert_spin_locked(&device_domain_lock);
2391        list_del(&info->link);
2392        list_del(&info->global);
2393        if (info->dev)
2394                info->dev->archdata.iommu = NULL;
2395}
2396
2397static void domain_remove_dev_info(struct dmar_domain *domain)
2398{
2399        struct device_domain_info *info, *tmp;
2400        unsigned long flags;
2401
2402        spin_lock_irqsave(&device_domain_lock, flags);
2403        list_for_each_entry_safe(info, tmp, &domain->devices, link)
2404                __dmar_remove_one_dev_info(info);
2405        spin_unlock_irqrestore(&device_domain_lock, flags);
2406}
2407
2408/*
2409 * find_domain
2410 * Note: we use struct device->archdata.iommu to store the domain info
2411 */
2412static struct dmar_domain *find_domain(struct device *dev)
2413{
2414        struct device_domain_info *info;
2415
2416        /* No lock here; we assume no domain exit in the normal case */
2417        info = dev->archdata.iommu;
2418        if (likely(info))
2419                return info->domain;
2420        return NULL;
2421}
2422
2423static inline struct device_domain_info *
2424dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2425{
2426        struct device_domain_info *info;
2427
2428        list_for_each_entry(info, &device_domain_list, global)
2429                if (info->iommu->segment == segment && info->bus == bus &&
2430                    info->devfn == devfn)
2431                        return info;
2432
2433        return NULL;
2434}
2435
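    /*
     * Bind @dev (identified by @bus/@devfn on @iommu) to @domain: allocate
     * and link its device_domain_info, attach the domain to the IOMMU, and
     * set up the context mapping. If a domain is already bound to this
     * device or requester ID, that existing domain is returned instead and
     * the caller must free the one it passed in.
     */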
2436static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2437                                                    int bus, int devfn,
2438                                                    struct device *dev,
2439                                                    struct dmar_domain *domain)
2440{
2441        struct dmar_domain *found = NULL;
2442        struct device_domain_info *info;
2443        unsigned long flags;
2444        int ret;
2445
2446        info = alloc_devinfo_mem();
2447        if (!info)
2448                return NULL;
2449
2450        info->bus = bus;
2451        info->devfn = devfn;
2452        info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2453        info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2454        info->ats_qdep = 0;
2455        info->dev = dev;
2456        info->domain = domain;
2457        info->iommu = iommu;
2458
2459        if (dev && dev_is_pci(dev)) {
2460                struct pci_dev *pdev = to_pci_dev(info->dev);
2461
2462                if (ecap_dev_iotlb_support(iommu->ecap) &&
2463                    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2464                    dmar_find_matched_atsr_unit(pdev))
2465                        info->ats_supported = 1;
2466
2467                if (ecs_enabled(iommu)) {
2468                        if (pasid_enabled(iommu)) {
2469                                int features = pci_pasid_features(pdev);
2470                                if (features >= 0)
2471                                        info->pasid_supported = features | 1;
2472                        }
2473
2474                        if (info->ats_supported && ecap_prs(iommu->ecap) &&
2475                            pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2476                                info->pri_supported = 1;
2477                }
2478        }
2479
2480        spin_lock_irqsave(&device_domain_lock, flags);
2481        if (dev)
2482                found = find_domain(dev);
2483
2484        if (!found) {
2485                struct device_domain_info *info2;
2486                info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2487                if (info2) {
2488                        found      = info2->domain;
2489                        info2->dev = dev;
2490                }
2491        }
2492
2493        if (found) {
2494                spin_unlock_irqrestore(&device_domain_lock, flags);
2495                free_devinfo_mem(info);
2496                /* Caller must free the original domain */
2497                return found;
2498        }
2499
2500        spin_lock(&iommu->lock);
2501        ret = domain_attach_iommu(domain, iommu);
2502        spin_unlock(&iommu->lock);
2503
2504        if (ret) {
2505                spin_unlock_irqrestore(&device_domain_lock, flags);
2506                free_devinfo_mem(info);
2507                return NULL;
2508        }
2509
2510        list_add(&info->link, &domain->devices);
2511        list_add(&info->global, &device_domain_list);
2512        if (dev)
2513                dev->archdata.iommu = info;
2514        spin_unlock_irqrestore(&device_domain_lock, flags);
2515
2516        if (dev && domain_context_mapping(domain, dev)) {
2517                pr_err("Domain context map for %s failed\n", dev_name(dev));
2518                dmar_remove_one_dev_info(domain, dev);
2519                return NULL;
2520        }
2521
2522        return domain;
2523}
2524
2525static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2526{
2527        *(u16 *)opaque = alias;
2528        return 0;
2529}
2530
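    /*
     * Find the domain already used by @dev's DMA alias, if any; otherwise
     * allocate and initialize a new domain with address width @gaw.
     */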
2531static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2532{
2533        struct device_domain_info *info = NULL;
2534        struct dmar_domain *domain = NULL;
2535        struct intel_iommu *iommu;
2536        u16 req_id, dma_alias;
2537        unsigned long flags;
2538        u8 bus, devfn;
2539
2540        iommu = device_to_iommu(dev, &bus, &devfn);
2541        if (!iommu)
2542                return NULL;
2543
2544        req_id = ((u16)bus << 8) | devfn;
2545
2546        if (dev_is_pci(dev)) {
2547                struct pci_dev *pdev = to_pci_dev(dev);
2548
2549                pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2550
2551                spin_lock_irqsave(&device_domain_lock, flags);
2552                info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2553                                                      PCI_BUS_NUM(dma_alias),
2554                                                      dma_alias & 0xff);
2555                if (info) {
2556                        iommu = info->iommu;
2557                        domain = info->domain;
2558                }
2559                spin_unlock_irqrestore(&device_domain_lock, flags);
2560
2561                /* DMA alias already has a domain, use it */
2562                if (info)
2563                        goto out;
2564        }
2565
2566        /* Allocate and initialize new domain for the device */
2567        domain = alloc_domain(0);
2568        if (!domain)
2569                return NULL;
2570        if (domain_init(domain, iommu, gaw)) {
2571                domain_exit(domain);
2572                return NULL;
2573        }
2574
2575out:
2576
2577        return domain;
2578}
2579
2580static struct dmar_domain *set_domain_for_dev(struct device *dev,
2581                                              struct dmar_domain *domain)
2582{
2583        struct intel_iommu *iommu;
2584        struct dmar_domain *tmp;
2585        u16 req_id, dma_alias;
2586        u8 bus, devfn;
2587
2588        iommu = device_to_iommu(dev, &bus, &devfn);
2589        if (!iommu)
2590                return NULL;
2591
2592        req_id = ((u16)bus << 8) | devfn;
2593
2594        if (dev_is_pci(dev)) {
2595                struct pci_dev *pdev = to_pci_dev(dev);
2596
2597                pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2598
2599                /* register PCI DMA alias device */
2600                if (req_id != dma_alias) {
2601                        tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2602                                        dma_alias & 0xff, NULL, domain);
2603
2604                        if (!tmp || tmp != domain)
2605                                return tmp;
2606                }
2607        }
2608
2609        tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2610        if (!tmp || tmp != domain)
2611                return tmp;
2612
2613        return domain;
2614}
2615
2616static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2617{
2618        struct dmar_domain *domain, *tmp;
2619
2620        domain = find_domain(dev);
2621        if (domain)
2622                goto out;
2623
2624        domain = find_or_alloc_domain(dev, gaw);
2625        if (!domain)
2626                goto out;
2627
2628        tmp = set_domain_for_dev(dev, domain);
2629        if (!tmp || domain != tmp) {
2630                domain_exit(domain);
2631                domain = tmp;
2632        }
2633
2634out:
2635
2636        return domain;
2637}
2638
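    /*
     * Reserve the IOVA range and install a 1:1 mapping (DMA address equals
     * physical address) for [start, end] in @domain.
     */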
2639static int iommu_domain_identity_map(struct dmar_domain *domain,
2640                                     unsigned long long start,
2641                                     unsigned long long end)
2642{
2643        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2644        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2645
2646        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2647                          dma_to_mm_pfn(last_vpfn))) {
2648                pr_err("Reserving iova failed\n");
2649                return -ENOMEM;
2650        }
2651
2652        pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2653        /*
2654         * RMRR range might have overlap with physical memory range,
2655         * clear it first
2656         */
2657        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2658
2659        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2660                                  last_vpfn - first_vpfn + 1,
2661                                  DMA_PTE_READ|DMA_PTE_WRITE);
2662}
2663
2664static int domain_prepare_identity_map(struct device *dev,
2665                                       struct dmar_domain *domain,
2666                                       unsigned long long start,
2667                                       unsigned long long end)
2668{
2669        /* For _hardware_ passthrough, don't bother. But for software
2670           passthrough, we do it anyway -- it may indicate a memory
2671           range which is reserved in E820 and so didn't get set
2672           up in si_domain to start with */
2673        if (domain == si_domain && hw_pass_through) {
2674                pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2675                        dev_name(dev), start, end);
2676                return 0;
2677        }
2678
2679        pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2680                dev_name(dev), start, end);
2681
2682        if (end < start) {
2683                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2684                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2685                        dmi_get_system_info(DMI_BIOS_VENDOR),
2686                        dmi_get_system_info(DMI_BIOS_VERSION),
2687                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2688                return -EIO;
2689        }
2690
2691        if (end >> agaw_to_width(domain->agaw)) {
2692                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2693                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2694                     agaw_to_width(domain->agaw),
2695                     dmi_get_system_info(DMI_BIOS_VENDOR),
2696                     dmi_get_system_info(DMI_BIOS_VERSION),
2697                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2698                return -EIO;
2699        }
2700
2701        return iommu_domain_identity_map(domain, start, end);
2702}
2703
2704static int iommu_prepare_identity_map(struct device *dev,
2705                                      unsigned long long start,
2706                                      unsigned long long end)
2707{
2708        struct dmar_domain *domain;
2709        int ret;
2710
2711        domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2712        if (!domain)
2713                return -ENOMEM;
2714
2715        ret = domain_prepare_identity_map(dev, domain, start, end);
2716        if (ret)
2717                domain_exit(domain);
2718
2719        return ret;
2720}
2721
2722static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2723                                         struct device *dev)
2724{
2725        if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2726                return 0;
2727        return iommu_prepare_identity_map(dev, rmrr->base_address,
2728                                          rmrr->end_address);
2729}
2730
2731#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2732static inline void iommu_prepare_isa(void)
2733{
2734        struct pci_dev *pdev;
2735        int ret;
2736
2737        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2738        if (!pdev)
2739                return;
2740
2741        pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2742        ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2743
2744        if (ret)
2745                pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2746
2747        pci_dev_put(pdev);
2748}
2749#else
2750static inline void iommu_prepare_isa(void)
2751{
2752        return;
2753}
2754#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2755
2756static int md_domain_init(struct dmar_domain *domain, int guest_width);
2757
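    /*
     * Create the static identity (si) domain. With hardware pass-through the
     * identity mapping is done by the hardware itself, so no page tables need
     * to be populated; otherwise every usable RAM range is mapped 1:1 so that
     * devices in this domain can DMA anywhere in memory.
     */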
2758static int __init si_domain_init(int hw)
2759{
2760        int nid, ret = 0;
2761
2762        si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2763        if (!si_domain)
2764                return -EFAULT;
2765
2766        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2767                domain_exit(si_domain);
2768                return -EFAULT;
2769        }
2770
2771        pr_debug("Identity mapping domain allocated\n");
2772
2773        if (hw)
2774                return 0;
2775
2776        for_each_online_node(nid) {
2777                unsigned long start_pfn, end_pfn;
2778                int i;
2779
2780                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2781                        ret = iommu_domain_identity_map(si_domain,
2782                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2783                        if (ret)
2784                                return ret;
2785                }
2786        }
2787
2788        return 0;
2789}
2790
2791static int identity_mapping(struct device *dev)
2792{
2793        struct device_domain_info *info;
2794
2795        if (likely(!iommu_identity_mapping))
2796                return 0;
2797
2798        info = dev->archdata.iommu;
2799        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2800                return (info->domain == si_domain);
2801
2802        return 0;
2803}
2804
2805static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2806{
2807        struct dmar_domain *ndomain;
2808        struct intel_iommu *iommu;
2809        u8 bus, devfn;
2810
2811        iommu = device_to_iommu(dev, &bus, &devfn);
2812        if (!iommu)
2813                return -ENODEV;
2814
2815        ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816        if (ndomain != domain)
2817                return -EBUSY;
2818
2819        return 0;
2820}
2821
2822static bool device_has_rmrr(struct device *dev)
2823{
2824        struct dmar_rmrr_unit *rmrr;
2825        struct device *tmp;
2826        int i;
2827
2828        rcu_read_lock();
2829        for_each_rmrr_units(rmrr) {
2830                /*
2831                 * Return TRUE if this RMRR contains the device that
2832                 * is passed in.
2833                 */
2834                for_each_active_dev_scope(rmrr->devices,
2835                                          rmrr->devices_cnt, i, tmp)
2836                        if (tmp == dev) {
2837                                rcu_read_unlock();
2838                                return true;
2839                        }
2840        }
2841        rcu_read_unlock();
2842        return false;
2843}
2844
2845/*
2846 * There are a couple cases where we need to restrict the functionality of
2847 * devices associated with RMRRs.  The first is when evaluating a device for
2848 * identity mapping because problems exist when devices are moved in and out
2849 * of domains and their respective RMRR information is lost.  This means that
2850 * a device with associated RMRRs will never be in a "passthrough" domain.
2851 * The second is use of the device through the IOMMU API.  This interface
2852 * expects to have full control of the IOVA space for the device.  We cannot
2853 * satisfy both the requirement that RMRR access is maintained and have an
2854 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2855 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2856 * We therefore prevent devices associated with an RMRR from participating in
2857 * the IOMMU API, which eliminates them from device assignment.
2858 *
2859 * In both cases we assume that PCI USB devices with RMRRs have them largely
2860 * for historical reasons and that the RMRR space is not actively used post
2861 * boot.  This exclusion may change if vendors begin to abuse it.
2862 *
2863 * The same exception is made for graphics devices, with the requirement that
2864 * any use of the RMRR regions will be torn down before assigning the device
2865 * to a guest.
2866 */
2867static bool device_is_rmrr_locked(struct device *dev)
2868{
2869        if (!device_has_rmrr(dev))
2870                return false;
2871
2872        if (dev_is_pci(dev)) {
2873                struct pci_dev *pdev = to_pci_dev(dev);
2874
2875                if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2876                        return false;
2877        }
2878
2879        return true;
2880}
2881
2882static int iommu_should_identity_map(struct device *dev, int startup)
2883{
2884
2885        if (dev_is_pci(dev)) {
2886                struct pci_dev *pdev = to_pci_dev(dev);
2887
2888                if (device_is_rmrr_locked(dev))
2889                        return 0;
2890
2891                if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2892                        return 1;
2893
2894                if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2895                        return 1;
2896
2897                if (!(iommu_identity_mapping & IDENTMAP_ALL))
2898                        return 0;
2899
2900                /*
2901                 * We want to start off with all devices in the 1:1 domain, and
2902                 * take them out later if we find they can't access all of memory.
2903                 *
2904                 * However, we can't do this for PCI devices behind bridges,
2905                 * because all PCI devices behind the same bridge will end up
2906                 * with the same source-id on their transactions.
2907                 *
2908                 * Practically speaking, we can't change things around for these
2909                 * devices at run-time, because we can't be sure there'll be no
2910                 * DMA transactions in flight for any of their siblings.
2911                 *
2912                 * So PCI devices (unless they're on the root bus) as well as
2913                 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2914                 * the 1:1 domain, just in _case_ one of their siblings turns out
2915                 * not to be able to map all of memory.
2916                 */
2917                if (!pci_is_pcie(pdev)) {
2918                        if (!pci_is_root_bus(pdev->bus))
2919                                return 0;
2920                        if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2921                                return 0;
2922                } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2923                        return 0;
2924        } else {
2925                if (device_has_rmrr(dev))
2926                        return 0;
2927        }
2928
2929        /*
2930         * At boot time, we don't yet know if devices will be 64-bit capable.
2931         * Assume that they will — if they turn out not to be, then we can
2932         * take them out of the 1:1 domain later.
2933         */
2934        if (!startup) {
2935                /*
2936                 * If the device's dma_mask is less than the system's memory
2937                 * size then this is not a candidate for identity mapping.
2938                 */
2939                u64 dma_mask = *dev->dma_mask;
2940
2941                if (dev->coherent_dma_mask &&
2942                    dev->coherent_dma_mask < dma_mask)
2943                        dma_mask = dev->coherent_dma_mask;
2944
2945                return dma_mask >= dma_get_required_mask(dev);
2946        }
2947
2948        return 1;
2949}
2950
2951static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2952{
2953        int ret;
2954
2955        if (!iommu_should_identity_map(dev, 1))
2956                return 0;
2957
2958        ret = domain_add_dev_info(si_domain, dev);
2959        if (!ret)
2960                pr_info("%s identity mapping for device %s\n",
2961                        hw ? "Hardware" : "Software", dev_name(dev));
2962        else if (ret == -ENODEV)
2963                /* device not associated with an iommu */
2964                ret = 0;
2965
2966        return ret;
2967}
2968
2969
2970static int __init iommu_prepare_static_identity_mapping(int hw)
2971{
2972        struct pci_dev *pdev = NULL;
2973        struct dmar_drhd_unit *drhd;
2974        struct intel_iommu *iommu;
2975        struct device *dev;
2976        int i;
2977        int ret = 0;
2978
2979        for_each_pci_dev(pdev) {
2980                ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2981                if (ret)
2982                        return ret;
2983        }
2984
2985        for_each_active_iommu(iommu, drhd)
2986                for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2987                        struct acpi_device_physical_node *pn;
2988                        struct acpi_device *adev;
2989
2990                        if (dev->bus != &acpi_bus_type)
2991                                continue;
2992
2993                        adev = to_acpi_device(dev);
2994                        mutex_lock(&adev->physical_node_lock);
2995                        list_for_each_entry(pn, &adev->physical_node_list, node) {
2996                                ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2997                                if (ret)
2998                                        break;
2999                        }
3000                        mutex_unlock(&adev->physical_node_lock);
3001                        if (ret)
3002                                return ret;
3003                }
3004
3005        return 0;
3006}
3007
3008static void intel_iommu_init_qi(struct intel_iommu *iommu)
3009{
3010        /*
3011         * Start from a sane iommu hardware state.
3012         * If queued invalidation was already initialized by us
3013         * (for example, while enabling interrupt-remapping) then
3014         * things are already rolling from a sane state.
3015         */
3016        if (!iommu->qi) {
3017                /*
3018                 * Clear any previous faults.
3019                 */
3020                dmar_fault(-1, iommu);
3021                /*
3022                 * Disable queued invalidation if supported and already enabled
3023                 * before OS handover.
3024                 */
3025                dmar_disable_qi(iommu);
3026        }
3027
3028        if (dmar_enable_qi(iommu)) {
3029                /*
3030                 * Queued invalidation not enabled, use register-based invalidation
3031                 */
3032                iommu->flush.flush_context = __iommu_flush_context;
3033                iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3034                pr_info("%s: Using Register based invalidation\n",
3035                        iommu->name);
3036        } else {
3037                iommu->flush.flush_context = qi_flush_context;
3038                iommu->flush.flush_iotlb = qi_flush_iotlb;
3039                pr_info("%s: Using Queued invalidation\n", iommu->name);
3040        }
3041}
3042
3043static int copy_context_table(struct intel_iommu *iommu,
3044                              struct root_entry *old_re,
3045                              struct context_entry **tbl,
3046                              int bus, bool ext)
3047{
3048        int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3049        struct context_entry *new_ce = NULL, ce;
3050        struct context_entry *old_ce = NULL;
3051        struct root_entry re;
3052        phys_addr_t old_ce_phys;
3053
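    	/*
    	 * In extended-context mode each context entry is twice the legacy
    	 * size, so one 4K context table only covers 128 device functions:
    	 * each bus takes two slots in 'tbl', and devfns 0x00-0x7f are
    	 * reached through the lower context-table pointer while 0x80-0xff
    	 * go through the upper one (see the root_entry_lctp() /
    	 * root_entry_uctp() split below).
    	 */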
3054        tbl_idx = ext ? bus * 2 : bus;
3055        memcpy(&re, old_re, sizeof(re));
3056
3057        for (devfn = 0; devfn < 256; devfn++) {
3058                /* First calculate the correct index */
3059                idx = (ext ? devfn * 2 : devfn) % 256;
3060
3061                if (idx == 0) {
3062                        /* First save what we may have and clean up */
3063                        if (new_ce) {
3064                                tbl[tbl_idx] = new_ce;
3065                                __iommu_flush_cache(iommu, new_ce,
3066                                                    VTD_PAGE_SIZE);
3067                                pos = 1;
3068                        }
3069
3070                        if (old_ce)
3071                                iounmap(old_ce);
3072
3073                        ret = 0;
3074                        if (devfn < 0x80)
3075                                old_ce_phys = root_entry_lctp(&re);
3076                        else
3077                                old_ce_phys = root_entry_uctp(&re);
3078
3079                        if (!old_ce_phys) {
3080                                if (ext && devfn == 0) {
3081                                        /* No LCTP, try UCTP */
3082                                        devfn = 0x7f;
3083                                        continue;
3084                                } else {
3085                                        goto out;
3086                                }
3087                        }
3088
3089                        ret = -ENOMEM;
3090                        old_ce = memremap(old_ce_phys, PAGE_SIZE,
3091                                        MEMREMAP_WB);
3092                        if (!old_ce)
3093                                goto out;
3094
3095                        new_ce = alloc_pgtable_page(iommu->node);
3096                        if (!new_ce)
3097                                goto out_unmap;
3098
3099                        ret = 0;
3100                }
3101
3102                /* Now copy the context entry */
3103                memcpy(&ce, old_ce + idx, sizeof(ce));
3104
3105                if (!__context_present(&ce))
3106                        continue;
3107
3108                did = context_domain_id(&ce);
3109                if (did >= 0 && did < cap_ndoms(iommu->cap))
3110                        set_bit(did, iommu->domain_ids);
3111
3112                /*
3113                 * We need a marker for copied context entries. This
3114                 * marker needs to work for the old format as well as
3115                 * for extended context entries.
3116                 *
3117                 * Bit 67 of the context entry is used. In the old
3118                 * format this bit is available to software, in the
3119                 * extended format it is the PGE bit, but PGE is ignored
3120                 * by HW if PASIDs are disabled (and thus still
3121                 * available).
3122                 *
3123                 * So disable PASIDs first and then mark the entry
3124                 * copied. This means that we don't copy PASID
3125                 * translations from the old kernel, but this is fine as
3126                 * faults there are not fatal.
3127                 */
3128                context_clear_pasid_enable(&ce);
3129                context_set_copied(&ce);
3130
3131                new_ce[idx] = ce;
3132        }
3133
3134        tbl[tbl_idx + pos] = new_ce;
3135
3136        __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3137
3138out_unmap:
3139        memunmap(old_ce);
3140
3141out:
3142        return ret;
3143}
3144
3145static int copy_translation_tables(struct intel_iommu *iommu)
3146{
3147        struct context_entry **ctxt_tbls;
3148        struct root_entry *old_rt;
3149        phys_addr_t old_rt_phys;
3150        int ctxt_table_entries;
3151        unsigned long flags;
3152        u64 rtaddr_reg;
3153        int bus, ret;
3154        bool new_ext, ext;
3155
3156        rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3157        ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3158        new_ext    = !!ecap_ecs(iommu->ecap);
3159
3160        /*
3161         * The RTT bit can only be changed when translation is disabled,
3162         * but disabling translation means to open a window for data
3163         * corruption. So bail out and don't copy anything if we would
3164         * have to change the bit.
3165         */
3166        if (new_ext != ext)
3167                return -EINVAL;
3168
3169        old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3170        if (!old_rt_phys)
3171                return -EINVAL;
3172
3173        old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3174        if (!old_rt)
3175                return -ENOMEM;
3176
3177        /* This is too big for the stack - allocate it from the slab */
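    	/* extended mode needs two context tables per bus, see copy_context_table() */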
3178        ctxt_table_entries = ext ? 512 : 256;
3179        ret = -ENOMEM;
3180        ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3181        if (!ctxt_tbls)
3182                goto out_unmap;
3183
3184        for (bus = 0; bus < 256; bus++) {
3185                ret = copy_context_table(iommu, &old_rt[bus],
3186                                         ctxt_tbls, bus, ext);
3187                if (ret) {
3188                        pr_err("%s: Failed to copy context table for bus %d\n",
3189                                iommu->name, bus);
3190                        continue;
3191                }
3192        }
3193
3194        spin_lock_irqsave(&iommu->lock, flags);
3195
3196        /* Context tables are copied, now write them to the root_entry table */
3197        for (bus = 0; bus < 256; bus++) {
3198                int idx = ext ? bus * 2 : bus;
3199                u64 val;
3200
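    		/*
    		 * Bit 0 of each root-entry half is its present bit, so OR-ing
    		 * 1 into the table address marks the copied context table as
    		 * present.
    		 */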
3201                if (ctxt_tbls[idx]) {
3202                        val = virt_to_phys(ctxt_tbls[idx]) | 1;
3203                        iommu->root_entry[bus].lo = val;
3204                }
3205
3206                if (!ext || !ctxt_tbls[idx + 1])
3207                        continue;
3208
3209                val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3210                iommu->root_entry[bus].hi = val;
3211        }
3212
3213        spin_unlock_irqrestore(&iommu->lock, flags);
3214
3215        kfree(ctxt_tbls);
3216
3217        __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3218
3219        ret = 0;
3220
3221out_unmap:
3222        memunmap(old_rt);
3223
3224        return ret;
3225}
3226
3227static int __init init_dmars(void)
3228{
3229        struct dmar_drhd_unit *drhd;
3230        struct dmar_rmrr_unit *rmrr;
3231        bool copied_tables = false;
3232        struct device *dev;
3233        struct intel_iommu *iommu;
3234        int i, ret;
3235
3236        /*
3237         * for each drhd
3238         *    allocate root
3239         *    initialize and program root entry to not present
3240         * endfor
3241         */
3242        for_each_drhd_unit(drhd) {
3243                /*
3244                 * lock not needed as this is only incremented in the
3245                 * single-threaded kernel __init code path; all other
3246                 * accesses are read-only
3247                 */
3248                if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3249                        g_num_of_iommus++;
3250                        continue;
3251                }
3252                pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3253        }
3254
3255        /* Preallocate enough resources for IOMMU hot-addition */
3256        if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3257                g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3258
3259        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3260                        GFP_KERNEL);
3261        if (!g_iommus) {
3262                pr_err("Allocating global iommu array failed\n");
3263                ret = -ENOMEM;
3264                goto error;
3265        }
3266
3267        for_each_active_iommu(iommu, drhd) {
3268                g_iommus[iommu->seq_id] = iommu;
3269
3270                intel_iommu_init_qi(iommu);
3271
3272                ret = iommu_init_domains(iommu);
3273                if (ret)
3274                        goto free_iommu;
3275
3276                init_translation_status(iommu);
3277
3278                if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3279                        iommu_disable_translation(iommu);
3280                        clear_translation_pre_enabled(iommu);
3281                        pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3282                                iommu->name);
3283                }
3284
3285                /*
3286                 * TBD:
3287                 * we could share the same root & context tables
3288                 * among all IOMMUs. Need to split this out later.
3289                 */
3290                ret = iommu_alloc_root_entry(iommu);
3291                if (ret)
3292                        goto free_iommu;
3293
3294                if (translation_pre_enabled(iommu)) {
3295                        pr_info("Translation already enabled - trying to copy translation structures\n");
3296
3297                        ret = copy_translation_tables(iommu);
3298                        if (ret) {
3299                                /*
3300                                 * We found the IOMMU with translation
3301                                 * enabled - but failed to copy over the
3302                                 * old root-entry table. Try to proceed
3303                                 * by disabling translation now and
3304                                 * allocating a clean root-entry table.
3305                                 * This might cause DMAR faults, but
3306                                 * probably the dump will still succeed.
3307                                 */
3308                                pr_err("Failed to copy translation tables from previous kernel for %s\n",
3309                                       iommu->name);
3310                                iommu_disable_translation(iommu);
3311                                clear_translation_pre_enabled(iommu);
3312                        } else {
3313                                pr_info("Copied translation tables from previous kernel for %s\n",
3314                                        iommu->name);
3315                                copied_tables = true;
3316                        }
3317                }
3318
3319                if (!ecap_pass_through(iommu->ecap))
3320                        hw_pass_through = 0;
3321#ifdef CONFIG_INTEL_IOMMU_SVM
3322                if (pasid_enabled(iommu))
3323                        intel_svm_alloc_pasid_tables(iommu);
3324#endif
3325        }
3326
3327        /*
3328         * Now that qi is enabled on all iommus, set the root entry and flush
3329         * caches. This is required on some Intel X58 chipsets, otherwise the
3330         * flush_context function will loop forever and the boot hangs.
3331         */
3332        for_each_active_iommu(iommu, drhd) {
3333                iommu_flush_write_buffer(iommu);
3334                iommu_set_root_entry(iommu);
3335                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3336                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3337        }
3338
3339        if (iommu_pass_through)
3340                iommu_identity_mapping |= IDENTMAP_ALL;
3341
3342#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3343        iommu_identity_mapping |= IDENTMAP_GFX;
3344#endif
3345
3346        check_tylersburg_isoch();
3347
3348        if (iommu_identity_mapping) {
3349                ret = si_domain_init(hw_pass_through);
3350                if (ret)
3351                        goto free_iommu;
3352        }
3353
3354
3355        /*
3356         * If we copied translations from a previous kernel in the kdump
3357         * case, we can not assign the devices to domains now, as that
3358         * would eliminate the old mappings. So skip this part and defer
3359         * the assignment to device driver initialization time.
3360         */
3361        if (copied_tables)
3362                goto domains_done;
3363
3364        /*
3365         * If pass through is not set or not enabled, set up context entries
3366         * for identity mappings for rmrr, gfx and isa devices, and fall back
3367         * to static identity mapping if iommu_identity_mapping is set.
3368         */
3369        if (iommu_identity_mapping) {
3370                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3371                if (ret) {
3372                        pr_crit("Failed to setup IOMMU pass-through\n");
3373                        goto free_iommu;
3374                }
3375        }
3376        /*
3377         * For each rmrr
3378         *   for each dev attached to rmrr
3379         *   do
3380         *     locate drhd for dev, alloc domain for dev
3381         *     allocate free domain
3382         *     allocate page table entries for rmrr
3383         *     if context not allocated for bus
3384         *           allocate and init context
3385         *           set present in root table for this bus
3386         *     init context with domain, translation etc
3387         *    endfor
3388         * endfor
3389         */
3390        pr_info("Setting RMRR:\n");
3391        for_each_rmrr_units(rmrr) {
3392                /* some BIOSes list non-existent devices in the DMAR table. */
3393                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3394                                          i, dev) {
3395                        ret = iommu_prepare_rmrr_dev(rmrr, dev);
3396                        if (ret)
3397                                pr_err("Mapping reserved region failed\n");
3398                }
3399        }
3400
3401        iommu_prepare_isa();
3402
3403domains_done:
3404
3405        /*
3406         * for each drhd
3407         *   enable fault log
3408         *   global invalidate context cache
3409         *   global invalidate iotlb
3410         *   enable translation
3411         */
3412        for_each_iommu(iommu, drhd) {
3413                if (drhd->ignored) {
3414                        /*
3415                         * we always have to disable PMRs or DMA may fail on
3416                         * this device
3417                         */
3418                        if (force_on)
3419                                iommu_disable_protect_mem_regions(iommu);
3420                        continue;
3421                }
3422
3423                iommu_flush_write_buffer(iommu);
3424
3425#ifdef CONFIG_INTEL_IOMMU_SVM
3426                if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3427                        ret = intel_svm_enable_prq(iommu);
3428                        if (ret)
3429                                goto free_iommu;
3430                }
3431#endif
3432                ret = dmar_set_interrupt(iommu);
3433                if (ret)
3434                        goto free_iommu;
3435
3436                if (!translation_pre_enabled(iommu))
3437                        iommu_enable_translation(iommu);
3438
3439                iommu_disable_protect_mem_regions(iommu);
3440        }
3441
3442        return 0;
3443
3444free_iommu:
3445        for_each_active_iommu(iommu, drhd) {
3446                disable_dmar_iommu(iommu);
3447                free_dmar_iommu(iommu);
3448        }
3449
3450        kfree(g_iommus);
3451
3452error:
3453        return ret;
3454}
3455
3456/* This takes a number of _MM_ pages, not VTD pages */
3457static unsigned long intel_alloc_iova(struct device *dev,
3458                                     struct dmar_domain *domain,
3459                                     unsigned long nrpages, uint64_t dma_mask)
3460{
3461        unsigned long iova_pfn = 0;
3462
3463        /* Restrict dma_mask to the width that the iommu can handle */
3464        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3465        /* Ensure we reserve the whole size-aligned region */
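            /* e.g. a 5-page request is rounded up to, and reserves, 8 pages */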
3466        nrpages = __roundup_pow_of_two(nrpages);
3467
3468        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3469                /*
3470                 * First try to allocate an io virtual address in
3471                 * DMA_BIT_MASK(32) and if that fails then try allocating
3472                 * from higher range
3473                 */
3474                iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3475                                           IOVA_PFN(DMA_BIT_MASK(32)), false);
3476                if (iova_pfn)
3477                        return iova_pfn;
3478        }
3479        iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3480                                   IOVA_PFN(dma_mask), true);
3481        if (unlikely(!iova_pfn)) {
3482                pr_err("Allocating %ld-page iova for %s failed\n",
3483                       nrpages, dev_name(dev));
3484                return 0;
3485        }
3486
3487        return iova_pfn;
3488}
3489
3490static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3491{
3492        struct dmar_domain *domain, *tmp;
3493        struct dmar_rmrr_unit *rmrr;
3494        struct device *i_dev;
3495        int i, ret;
3496
3497        domain = find_domain(dev);
3498        if (domain)
3499                goto out;
3500
3501        domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3502        if (!domain)
3503                goto out;
3504
3505        /* We have a new domain - setup possible RMRRs for the device */
3506        rcu_read_lock();
3507        for_each_rmrr_units(rmrr) {
3508                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3509                                          i, i_dev) {
3510                        if (i_dev != dev)
3511                                continue;
3512
3513                        ret = domain_prepare_identity_map(dev, domain,
3514                                                          rmrr->base_address,
3515                                                          rmrr->end_address);
3516                        if (ret)
3517                                dev_err(dev, "Mapping reserved region failed\n");
3518                }
3519        }
3520        rcu_read_unlock();
3521
3522        tmp = set_domain_for_dev(dev, domain);
3523        if (!tmp || domain != tmp) {
3524                domain_exit(domain);
3525                domain = tmp;
3526        }
3527
3528out:
3529
3530        if (!domain)
3531                pr_err("Allocating domain for %s failed\n", dev_name(dev));
3532
3533
3534        return domain;
3535}
3536
3537/* Check if the dev needs to go through the non-identity map and unmap process. */
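    /*
     * Returns 1 if DMA for this device bypasses the IOMMU (dummy device or
     * identity mapped), 0 if it has to go through normal DMA translation.
     */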
3538static int iommu_no_mapping(struct device *dev)
3539{
3540        int found;
3541
3542        if (iommu_dummy(dev))
3543                return 1;
3544
3545        if (!iommu_identity_mapping)
3546                return 0;
3547
3548        found = identity_mapping(dev);
3549        if (found) {
3550                if (iommu_should_identity_map(dev, 0))
3551                        return 1;
3552                else {
3553                        /*
3554                         * A device limited to 32-bit DMA is removed from
3555                         * si_domain and falls back to non-identity mapping.
3556                         */
3557                        dmar_remove_one_dev_info(si_domain, dev);
3558                        pr_info("32bit %s uses non-identity mapping\n",
3559                                dev_name(dev));
3560                        return 0;
3561                }
3562        } else {
3563                /*
3564                 * If a 64-bit DMA capable device was detached from a VM, the
3565                 * device is put back into si_domain for identity mapping.
3566                 */
3567                if (iommu_should_identity_map(dev, 0)) {
3568                        int ret;
3569                        ret = domain_add_dev_info(si_domain, dev);
3570                        if (!ret) {
3571                                pr_info("64bit %s uses identity mapping\n",
3572                                        dev_name(dev));
3573                                return 1;
3574                        }
3575                }
3576        }
3577
3578        return 0;
3579}
3580
3581static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3582                                     size_t size, int dir, u64 dma_mask)
3583{
3584        struct dmar_domain *domain;
3585        phys_addr_t start_paddr;
3586        unsigned long iova_pfn;
3587        int prot = 0;
3588        int ret;
3589        struct intel_iommu *iommu;
3590        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3591
3592        BUG_ON(dir == DMA_NONE);
3593
3594        if (iommu_no_mapping(dev))
3595                return paddr;
3596
3597        domain = get_valid_domain_for_dev(dev);
3598        if (!domain)
3599                return 0;
3600
3601        iommu = domain_get_iommu(domain);
3602        size = aligned_nrpages(paddr, size);
3603
3604        iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3605        if (!iova_pfn)
3606                goto error;
3607
3608        /*
3609         * Check if DMAR supports zero-length reads on write-only
3610         * mappings.
3611         */
3612        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3613                        !cap_zlr(iommu->cap))
3614                prot |= DMA_PTE_READ;
3615        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3616                prot |= DMA_PTE_WRITE;
3617        /*
3618         * The range paddr .. paddr + size might cover partial pages, so we
3619         * map whole pages.  Note: if two parts of one page are mapped
3620         * separately, we might have two IOVAs mapping to the same host
3621         * paddr, but this is not a big problem.
3622         */
3623        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3624                                 mm_to_dma_pfn(paddr_pfn), size, prot);
3625        if (ret)
3626                goto error;
3627
3628        /* it's a non-present to present mapping. Only flush if caching mode */
3629        if (cap_caching_mode(iommu->cap))
3630                iommu_flush_iotlb_psi(iommu, domain,
3631                                      mm_to_dma_pfn(iova_pfn),
3632                                      size, 0, 1);
3633        else
3634                iommu_flush_write_buffer(iommu);
3635
3636        start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3637        start_paddr += paddr & ~PAGE_MASK;
3638        return start_paddr;
3639
3640error:
3641        if (iova_pfn)
3642                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3643        pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3644                dev_name(dev), size, (unsigned long long)paddr, dir);
3645        return 0;
3646}
3647
3648static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3649                                 unsigned long offset, size_t size,
3650                                 enum dma_data_direction dir,
3651                                 unsigned long attrs)
3652{
3653        return __intel_map_single(dev, page_to_phys(page) + offset, size,
3654                                  dir, *dev->dma_mask);
3655}
3656
3657static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3658{
3659        struct dmar_domain *domain;
3660        unsigned long start_pfn, last_pfn;
3661        unsigned long nrpages;
3662        unsigned long iova_pfn;
3663        struct intel_iommu *iommu;
3664        struct page *freelist;
3665
3666        if (iommu_no_mapping(dev))
3667                return;
3668
3669        domain = find_domain(dev);
3670        BUG_ON(!domain);
3671
3672        iommu = domain_get_iommu(domain);
3673
3674        iova_pfn = IOVA_PFN(dev_addr);
3675
3676        nrpages = aligned_nrpages(dev_addr, size);
3677        start_pfn = mm_to_dma_pfn(iova_pfn);
3678        last_pfn = start_pfn + nrpages - 1;
3679
3680        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3681                 dev_name(dev), start_pfn, last_pfn);
3682
3683        freelist = domain_unmap(domain, start_pfn, last_pfn);
3684
3685        if (intel_iommu_strict) {
3686                iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3687                                      nrpages, !freelist, 0);
3688                /* free iova */
3689                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3690                dma_free_pagelist(freelist);
3691        } else {
3692                queue_iova(&domain->iovad, iova_pfn, nrpages,
3693                           (unsigned long)freelist);
3694                /*
3695                 * Queue up the release of the unmap to save roughly 1/6th of
3696                 * the CPU time otherwise spent on the iotlb flush operation.
3697                 */
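    		/*
    		 * The deferred entries are drained later by the IOVA flush
    		 * queue, which performs the IOTLB flush in batches and then
    		 * frees the queued page lists.
    		 */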
3698        }
3699}
3700
3701static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3702                             size_t size, enum dma_data_direction dir,
3703                             unsigned long attrs)
3704{
3705        intel_unmap(dev, dev_addr, size);
3706}
3707
3708static void *intel_alloc_coherent(struct device *dev, size_t size,
3709                                  dma_addr_t *dma_handle, gfp_t flags,
3710                                  unsigned long attrs)
3711{
3712        void *vaddr;
3713
3714        vaddr = dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3715        if (iommu_no_mapping(dev) || !vaddr)
3716                return vaddr;
3717
3718        *dma_handle = __intel_map_single(dev, virt_to_phys(vaddr),
3719                        PAGE_ALIGN(size), DMA_BIDIRECTIONAL,
3720                        dev->coherent_dma_mask);
3721        if (!*dma_handle)
3722                goto out_free_pages;
3723        return vaddr;
3724
3725out_free_pages:
3726        dma_direct_free(dev, size, vaddr, *dma_handle, attrs);
3727        return NULL;
3728}
3729
3730static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3731                                dma_addr_t dma_handle, unsigned long attrs)
3732{
3733        if (!iommu_no_mapping(dev))
3734                intel_unmap(dev, dma_handle, PAGE_ALIGN(size));
3735        dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3736}
3737
3738static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3739                           int nelems, enum dma_data_direction dir,
3740                           unsigned long attrs)
3741{
3742        dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3743        unsigned long nrpages = 0;
3744        struct scatterlist *sg;
3745        int i;
3746
3747        for_each_sg(sglist, sg, nelems, i) {
3748                nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3749        }
3750
3751        intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3752}
3753
3754static int intel_nontranslate_map_sg(struct device *hddev,
3755        struct scatterlist *sglist, int nelems, int dir)
3756{
3757        int i;
3758        struct scatterlist *sg;
3759
3760        for_each_sg(sglist, sg, nelems, i) {
3761                BUG_ON(!sg_page(sg));
3762                sg->dma_address = sg_phys(sg);
3763                sg->dma_length = sg->length;
3764        }
3765        return nelems;
3766}
3767
3768static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3769                        enum dma_data_direction dir, unsigned long attrs)
3770{
3771        int i;
3772        struct dmar_domain *domain;
3773        size_t size = 0;
3774        int prot = 0;
3775        unsigned long iova_pfn;
3776        int ret;
3777        struct scatterlist *sg;
3778        unsigned long start_vpfn;
3779        struct intel_iommu *iommu;
3780
3781        BUG_ON(dir == DMA_NONE);
3782        if (iommu_no_mapping(dev))
3783                return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3784
3785        domain = get_valid_domain_for_dev(dev);
3786        if (!domain)
3787                return 0;
3788
3789        iommu = domain_get_iommu(domain);
3790
3791        for_each_sg(sglist, sg, nelems, i)
3792                size += aligned_nrpages(sg->offset, sg->length);
3793
3794        iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3795                                *dev->dma_mask);
3796        if (!iova_pfn) {
3797                sglist->dma_length = 0;
3798                return 0;
3799        }
3800
3801        /*
3802         * Check if DMAR supports zero-length reads on write-only
3803         * mappings.
3804         */
3805        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3806                        !cap_zlr(iommu->cap))
3807                prot |= DMA_PTE_READ;
3808        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3809                prot |= DMA_PTE_WRITE;
3810
3811        start_vpfn = mm_to_dma_pfn(iova_pfn);
3812
3813        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3814        if (unlikely(ret)) {
3815                dma_pte_free_pagetable(domain, start_vpfn,
3816                                       start_vpfn + size - 1,
3817                                       agaw_to_level(domain->agaw) + 1);
3818                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3819                return 0;
3820        }
3821
3822        /* it's a non-present to present mapping. Only flush if caching mode */
3823        if (cap_caching_mode(iommu->cap))
3824                iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3825        else
3826                iommu_flush_write_buffer(iommu);
3827
3828        return nelems;
3829}
3830
3831static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3832{
3833        return !dma_addr;
3834}
3835
3836const struct dma_map_ops intel_dma_ops = {
3837        .alloc = intel_alloc_coherent,
3838        .free = intel_free_coherent,
3839        .map_sg = intel_map_sg,
3840        .unmap_sg = intel_unmap_sg,
3841        .map_page = intel_map_page,
3842        .unmap_page = intel_unmap_page,
3843        .mapping_error = intel_mapping_error,
3844#ifdef CONFIG_X86
3845        .dma_supported = dma_direct_supported,
3846#endif
3847};
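    /*
     * Illustrative note (not part of the original source): with these ops
     * installed for a device, generic DMA API calls are routed here, e.g.
     *
     *     dma_addr_t handle = dma_map_page(dev, page, 0, size, DMA_TO_DEVICE);
     *
     * ends up in intel_map_page() above, and the matching dma_unmap_page()
     * call reaches intel_unmap_page().
     */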
3848
3849static inline int iommu_domain_cache_init(void)
3850{
3851        int ret = 0;
3852
3853        iommu_domain_cache = kmem_cache_create("iommu_domain",
3854                                         sizeof(struct dmar_domain),
3855                                         0,
3856                                         SLAB_HWCACHE_ALIGN,
3858                                         NULL);
3859        if (!iommu_domain_cache) {
3860                pr_err("Couldn't create iommu_domain cache\n");
3861                ret = -ENOMEM;
3862        }
3863
3864        return ret;
3865}
3866
3867static inline int iommu_devinfo_cache_init(void)
3868{
3869        int ret = 0;
3870
3871        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3872                                         sizeof(struct device_domain_info),
3873                                         0,
3874                                         SLAB_HWCACHE_ALIGN,
3875                                         NULL);
3876        if (!iommu_devinfo_cache) {
3877                pr_err("Couldn't create devinfo cache\n");
3878                ret = -ENOMEM;
3879        }
3880
3881        return ret;
3882}
3883
3884static int __init iommu_init_mempool(void)
3885{
3886        int ret;
3887        ret = iova_cache_get();
3888        if (ret)
3889                return ret;
3890
3891        ret = iommu_domain_cache_init();
3892        if (ret)
3893                goto domain_error;
3894
3895        ret = iommu_devinfo_cache_init();
3896        if (!ret)
3897                return ret;
3898
3899        kmem_cache_destroy(iommu_domain_cache);
3900domain_error:
3901        iova_cache_put();
3902
3903        return -ENOMEM;
3904}
3905
3906static void __init iommu_exit_mempool(void)
3907{
3908        kmem_cache_destroy(iommu_devinfo_cache);
3909        kmem_cache_destroy(iommu_domain_cache);
3910        iova_cache_put();
3911}
3912
3913static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3914{
3915        struct dmar_drhd_unit *drhd;
3916        u32 vtbar;
3917        int rc;
3918
3919        /* We know that this device on this chipset has its own IOMMU.
3920         * If we find it under a different IOMMU, then the BIOS is lying
3921         * to us. Hope that the IOMMU for this device is actually
3922         * disabled, and it needs no translation...
3923         */
3924        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3925        if (rc) {
3926                /* "can't" happen */
3927                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3928                return;
3929        }
3930        vtbar &= 0xffff0000;
3931
3932        /* we know that this iommu should be at offset 0xa000 from vtbar */
3933        drhd = dmar_find_matched_drhd_unit(pdev);
3934        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3935                            TAINT_FIRMWARE_WORKAROUND,
3936                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3937                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3938}
3939DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3940
3941static void __init init_no_remapping_devices(void)
3942{
3943        struct dmar_drhd_unit *drhd;
3944        struct device *dev;
3945        int i;
3946
3947        for_each_drhd_unit(drhd) {
3948                if (!drhd->include_all) {
3949                        for_each_active_dev_scope(drhd->devices,
3950                                                  drhd->devices_cnt, i, dev)
3951                                break;
3952                        /* ignore DMAR unit if no devices exist */
3953                        if (i == drhd->devices_cnt)
3954                                drhd->ignored = 1;
3955                }
3956        }
3957
3958        for_each_active_drhd_unit(drhd) {
3959                if (drhd->include_all)
3960                        continue;
3961
3962                for_each_active_dev_scope(drhd->devices,
3963                                          drhd->devices_cnt, i, dev)
3964                        if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3965                                break;
3966                if (i < drhd->devices_cnt)
3967                        continue;
3968
3969                /* This IOMMU has *only* gfx devices. Either bypass it or
3970                   set the gfx_mapped flag, as appropriate */
3971                if (dmar_map_gfx) {
3972                        intel_iommu_gfx_mapped = 1;
3973                } else {
3974                        drhd->ignored = 1;
3975                        for_each_active_dev_scope(drhd->devices,
3976                                                  drhd->devices_cnt, i, dev)
3977                                dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3978                }
3979        }
3980}
3981
3982#ifdef CONFIG_SUSPEND
3983static int init_iommu_hw(void)
3984{
3985        struct dmar_drhd_unit *drhd;
3986        struct intel_iommu *iommu = NULL;
3987
3988        for_each_active_iommu(iommu, drhd)
3989                if (iommu->qi)
3990                        dmar_reenable_qi(iommu);
3991
3992        for_each_iommu(iommu, drhd) {
3993                if (drhd->ignored) {
3994                        /*
3995                         * we always have to disable PMRs or DMA may fail on
3996                         * this device
3997                         */
3998                        if (force_on)
3999                                iommu_disable_protect_mem_regions(iommu);
4000                        continue;
4001                }
4002
4003                iommu_flush_write_buffer(iommu);
4004
4005                iommu_set_root_entry(iommu);
4006
4007                iommu->flush.flush_context(iommu, 0, 0, 0,
4008                                           DMA_CCMD_GLOBAL_INVL);
4009                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4010                iommu_enable_translation(iommu);
4011                iommu_disable_protect_mem_regions(iommu);
4012        }
4013
4014        return 0;
4015}
4016
4017static void iommu_flush_all(void)
4018{
4019        struct dmar_drhd_unit *drhd;
4020        struct intel_iommu *iommu;
4021
4022        for_each_active_iommu(iommu, drhd) {
4023                iommu->flush.flush_context(iommu, 0, 0, 0,
4024                                           DMA_CCMD_GLOBAL_INVL);
4025                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4026                                         DMA_TLB_GLOBAL_FLUSH);
4027        }
4028}
4029
4030static int iommu_suspend(void)
4031{
4032        struct dmar_drhd_unit *drhd;
4033        struct intel_iommu *iommu = NULL;
4034        unsigned long flag;
4035
4036        for_each_active_iommu(iommu, drhd) {
4037                iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4038                                                 GFP_ATOMIC);
4039                if (!iommu->iommu_state)
4040                        goto nomem;
4041        }
4042
4043        iommu_flush_all();
4044
4045        for_each_active_iommu(iommu, drhd) {
4046                iommu_disable_translation(iommu);
4047
4048                raw_spin_lock_irqsave(&iommu->register_lock, flag);
4049
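    		/*
    		 * Only the fault-event interrupt registers need to be saved
    		 * here; root entry and translation enable are reprogrammed by
    		 * init_iommu_hw() on resume.
    		 */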
4050                iommu->iommu_state[SR_DMAR_FECTL_REG] =
4051                        readl(iommu->reg + DMAR_FECTL_REG);
4052                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4053                        readl(iommu->reg + DMAR_FEDATA_REG);
4054                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4055                        readl(iommu->reg + DMAR_FEADDR_REG);
4056                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4057                        readl(iommu->reg + DMAR_FEUADDR_REG);
4058
4059                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4060        }
4061        return 0;
4062
4063nomem:
4064        for_each_active_iommu(iommu, drhd)
4065                kfree(iommu->iommu_state);
4066
4067        return -ENOMEM;
4068}
4069
4070static void iommu_resume(void)
4071{
4072        struct dmar_drhd_unit *drhd;
4073        struct intel_iommu *iommu = NULL;
4074        unsigned long flag;
4075
4076        if (init_iommu_hw()) {
4077                if (force_on)
4078                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4079                else
4080                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4081                return;
4082        }
4083
4084        for_each_active_iommu(iommu, drhd) {
4085
4086                raw_spin_lock_irqsave(&iommu->register_lock, flag);
4087
4088                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4089                        iommu->reg + DMAR_FECTL_REG);
4090                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4091                        iommu->reg + DMAR_FEDATA_REG);
4092                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4093                        iommu->reg + DMAR_FEADDR_REG);
4094                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4095                        iommu->reg + DMAR_FEUADDR_REG);
4096
4097                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4098        }
4099
4100        for_each_active_iommu(iommu, drhd)
4101                kfree(iommu->iommu_state);
4102}
4103
4104static struct syscore_ops iommu_syscore_ops = {
4105        .resume         = iommu_resume,
4106        .suspend        = iommu_suspend,
4107};
4108
4109static void __init init_iommu_pm_ops(void)
4110{
4111        register_syscore_ops(&iommu_syscore_ops);
4112}
4113
4114#else
4115static inline void init_iommu_pm_ops(void) {}
4116#endif  /* CONFIG_SUSPEND */
4117
4118
4119int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4120{
4121        struct acpi_dmar_reserved_memory *rmrr;
4122        int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4123        struct dmar_rmrr_unit *rmrru;
4124        size_t length;
4125
4126        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4127        if (!rmrru)
4128                goto out;
4129
4130        rmrru->hdr = header;
4131        rmrr = (struct acpi_dmar_reserved_memory *)header;
4132        rmrru->base_address = rmrr->base_address;
4133        rmrru->end_address = rmrr->end_address;
4134
4135        length = rmrr->end_address - rmrr->base_address + 1;
4136        rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4137                                              IOMMU_RESV_DIRECT);
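    	/*
    	 * The reserved region is kept so that the IOMMU core can report this
    	 * RMRR range as a direct-mapped reserved region for the devices it
    	 * covers (via the driver's get_resv_regions() callback).
    	 */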
4138        if (!rmrru->resv)
4139                goto free_rmrru;
4140
4141        rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4142                                ((void *)rmrr) + rmrr->header.length,
4143                                &rmrru->devices_cnt);
4144        if (rmrru->devices_cnt && rmrru->devices == NULL)
4145                goto free_all;
4146
4147        list_add(&rmrru->list, &dmar_rmrr_units);
4148
4149        return 0;
4150free_all:
4151        kfree(rmrru->resv);
4152free_rmrru:
4153        kfree(rmrru);
4154out:
4155        return -ENOMEM;
4156}
4157
4158static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4159{
4160        struct dmar_atsr_unit *atsru;
4161        struct acpi_dmar_atsr *tmp;
4162
4163        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4164                tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4165                if (atsr->segment != tmp->segment)
4166                        continue;
4167                if (atsr->header.length != tmp->header.length)
4168                        continue;
4169                if (memcmp(atsr, tmp, atsr->header.length) == 0)
4170                        return atsru;
4171        }
4172
4173        return NULL;
4174}
4175
4176int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4177{
4178        struct acpi_dmar_atsr *atsr;
4179        struct dmar_atsr_unit *atsru;
4180
4181        if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4182                return 0;
4183
4184        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4185        atsru = dmar_find_atsr(atsr);
4186        if (atsru)
4187                return 0;
4188
4189        atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4190        if (!atsru)
4191                return -ENOMEM;
4192
4193        /*
4194         * If the memory was allocated from the slab by an ACPI _DSM
4195         * method, we need to copy its content because the memory buffer
4196         * will be freed on return.
4197         */
4198        atsru->hdr = (void *)(atsru + 1);
4199        memcpy(atsru->hdr, hdr, hdr->length);
4200        atsru->include_all = atsr->flags & 0x1;
4201        if (!atsru->include_all) {
4202                atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4203                                (void *)atsr + atsr->header.length,
4204                                &atsru->devices_cnt);
4205                if (atsru->devices_cnt && atsru->devices == NULL) {
4206                        kfree(atsru);
4207                        return -ENOMEM;
4208                }
4209        }
4210
4211        list_add_rcu(&atsru->list, &dmar_atsr_units);
4212
4213        return 0;
4214}
4215
4216static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4217{
4218        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4219        kfree(atsru);
4220}
4221
4222int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4223{
4224        struct acpi_dmar_atsr *atsr;
4225        struct dmar_atsr_unit *atsru;
4226
4227        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4228        atsru = dmar_find_atsr(atsr);
4229        if (atsru) {
4230                list_del_rcu(&atsru->list);
4231                synchronize_rcu();
4232                intel_iommu_free_atsr(atsru);
4233        }
4234
4235        return 0;
4236}
4237
4238int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4239{
4240        int i;
4241        struct device *dev;
4242        struct acpi_dmar_atsr *atsr;
4243        struct dmar_atsr_unit *atsru;
4244
4245        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4246        atsru = dmar_find_atsr(atsr);
4247        if (!atsru)
4248                return 0;
4249
4250        if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4251                for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4252                                          i, dev)
4253                        return -EBUSY;
4254        }
4255
4256        return 0;
4257}
4258
4259static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4260{
4261        int sp, ret = 0;
4262        struct intel_iommu *iommu = dmaru->iommu;
4263
4264        if (g_iommus[iommu->seq_id])
4265                return 0;
4266
4267        if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4268                pr_warn("%s: Doesn't support hardware pass through.\n",
4269                        iommu->name);
4270                return -ENXIO;
4271        }
4272        if (!ecap_sc_support(iommu->ecap) &&
4273            domain_update_iommu_snooping(iommu)) {
4274                pr_warn("%s: Doesn't support snooping.\n",
4275                        iommu->name);
4276                return -ENXIO;
4277        }
4278        sp = domain_update_iommu_superpage(iommu) - 1;
4279        if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4280                pr_warn("%s: Doesn't support large page.\n",
4281                        iommu->name);
4282                return -ENXIO;
4283        }
4284
4285        /*
4286         * Disable translation if already enabled prior to OS handover.
4287         */
4288        if (iommu->gcmd & DMA_GCMD_TE)
4289                iommu_disable_translation(iommu);
4290
4291        g_iommus[iommu->seq_id] = iommu;
4292        ret = iommu_init_domains(iommu);
4293        if (ret == 0)
4294                ret = iommu_alloc_root_entry(iommu);
4295        if (ret)
4296                goto out;
4297
4298#ifdef CONFIG_INTEL_IOMMU_SVM
4299        if (pasid_enabled(iommu))
4300                intel_svm_alloc_pasid_tables(iommu);
4301#endif
4302
4303        if (dmaru->ignored) {
4304                /*
4305                 * we always have to disable PMRs or DMA may fail on this device
4306                 */
4307                if (force_on)
4308                        iommu_disable_protect_mem_regions(iommu);
4309                return 0;
4310        }
4311
4312        intel_iommu_init_qi(iommu);
4313        iommu_flush_write_buffer(iommu);
4314
4315#ifdef CONFIG_INTEL_IOMMU_SVM
4316        if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4317                ret = intel_svm_enable_prq(iommu);
4318                if (ret)
4319                        goto disable_iommu;
4320        }
4321#endif
4322        ret = dmar_set_interrupt(iommu);
4323        if (ret)
4324                goto disable_iommu;
4325
4326        iommu_set_root_entry(iommu);
4327        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4328        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4329        iommu_enable_translation(iommu);
4330
4331        iommu_disable_protect_mem_regions(iommu);
4332        return 0;
4333
4334disable_iommu:
4335        disable_dmar_iommu(iommu);
4336out:
4337        free_dmar_iommu(iommu);
4338        return ret;
4339}
4340
4341int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4342{
4343        int ret = 0;
4344        struct intel_iommu *iommu = dmaru->iommu;
4345
4346        if (!intel_iommu_enabled)
4347                return 0;
4348        if (iommu == NULL)
4349                return -EINVAL;
4350
4351        if (insert) {
4352                ret = intel_iommu_add(dmaru);
4353        } else {
4354                disable_dmar_iommu(iommu);
4355                free_dmar_iommu(iommu);
4356        }
4357
4358        return ret;
4359}
4360
4361static void intel_iommu_free_dmars(void)
4362{
4363        struct dmar_rmrr_unit *rmrru, *rmrr_n;
4364        struct dmar_atsr_unit *atsru, *atsr_n;
4365
4366        list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4367                list_del(&rmrru->list);
4368                dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4369                kfree(rmrru->resv);
4370                kfree(rmrru);
4371        }
4372
4373        list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4374                list_del(&atsru->list);
4375                intel_iommu_free_atsr(atsru);
4376        }
4377}
4378
4379int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4380{
4381        int i, ret = 1;
4382        struct pci_bus *bus;
4383        struct pci_dev *bridge = NULL;
4384        struct device *tmp;
4385        struct acpi_dmar_atsr *atsr;
4386        struct dmar_atsr_unit *atsru;
4387
4388        dev = pci_physfn(dev);
4389        for (bus = dev->bus; bus; bus = bus->parent) {
4390                bridge = bus->self;
4391                /* If it's an integrated device, allow ATS */
4392                if (!bridge)
4393                        return 1;
4394                /* Connected via non-PCIe: no ATS */
4395                if (!pci_is_pcie(bridge) ||
4396                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4397                        return 0;
4398                /* If we found the root port, look it up in the ATSR */
4399                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4400                        break;
4401        }
4402
4403        rcu_read_lock();
4404        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4405                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4406                if (atsr->segment != pci_domain_nr(dev->bus))
4407                        continue;
4408
4409                for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4410                        if (tmp == &bridge->dev)
4411                                goto out;
4412
4413                if (atsru->include_all)
4414                        goto out;
4415        }
4416        ret = 0;
4417out:
4418        rcu_read_unlock();
4419
4420        return ret;
4421}
4422
4423int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4424{
4425        int ret = 0;
4426        struct dmar_rmrr_unit *rmrru;
4427        struct dmar_atsr_unit *atsru;
4428        struct acpi_dmar_atsr *atsr;
4429        struct acpi_dmar_reserved_memory *rmrr;
4430
4431        if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4432                return 0;
4433
4434        list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4435                rmrr = container_of(rmrru->hdr,
4436                                    struct acpi_dmar_reserved_memory, header);
4437                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4438                        ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4439                                ((void *)rmrr) + rmrr->header.length,
4440                                rmrr->segment, rmrru->devices,
4441                                rmrru->devices_cnt);
4442                        if (ret < 0)
4443                                return ret;
4444                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4445                        dmar_remove_dev_scope(info, rmrr->segment,
4446                                rmrru->devices, rmrru->devices_cnt);
4447                }
4448        }
4449
4450        list_for_each_entry(atsru, &dmar_atsr_units, list) {
4451                if (atsru->include_all)
4452                        continue;
4453
4454                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4455                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4456                        ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4457                                        (void *)atsr + atsr->header.length,
4458                                        atsr->segment, atsru->devices,
4459                                        atsru->devices_cnt);
4460                        if (ret > 0)
4461                                break;
4462                        else if (ret < 0)
4463                                return ret;
4464                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4465                        if (dmar_remove_dev_scope(info, atsr->segment,
4466                                        atsru->devices, atsru->devices_cnt))
4467                                break;
4468                }
4469        }
4470
4471        return 0;
4472}
4473
4474/*
4475 * Here we only respond to action of unbound device from driver.
4476 *
4477 * Added device is not attached to its DMAR domain here yet. That will happen
4478 * when mapping the device to iova.
4479 */
4480static int device_notifier(struct notifier_block *nb,
4481                                  unsigned long action, void *data)
4482{
4483        struct device *dev = data;
4484        struct dmar_domain *domain;
4485
4486        if (iommu_dummy(dev))
4487                return 0;
4488
4489        if (action != BUS_NOTIFY_REMOVED_DEVICE)
4490                return 0;
4491
4492        domain = find_domain(dev);
4493        if (!domain)
4494                return 0;
4495
4496        dmar_remove_one_dev_info(domain, dev);
4497        if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4498                domain_exit(domain);
4499
4500        return 0;
4501}
4502
4503static struct notifier_block device_nb = {
4504        .notifier_call = device_notifier,
4505};
4506
4507static int intel_iommu_memory_notifier(struct notifier_block *nb,
4508                                       unsigned long val, void *v)
4509{
4510        struct memory_notify *mhp = v;
4511        unsigned long long start, end;
4512        unsigned long start_vpfn, last_vpfn;
4513
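    	/*
    	 * Keep the static identity (si) domain in sync with memory hotplug:
    	 * map newly onlined ranges 1:1 and tear down mappings for ranges
    	 * going offline.
    	 */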
4514        switch (val) {
4515        case MEM_GOING_ONLINE:
4516                start = mhp->start_pfn << PAGE_SHIFT;
4517                end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4518                if (iommu_domain_identity_map(si_domain, start, end)) {
4519                        pr_warn("Failed to build identity map for [%llx-%llx]\n",
4520                                start, end);
4521                        return NOTIFY_BAD;
4522                }
4523                break;
4524
4525        case MEM_OFFLINE:
4526        case MEM_CANCEL_ONLINE:
4527                start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4528                last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4529                while (start_vpfn <= last_vpfn) {
4530                        struct iova *iova;
4531                        struct dmar_drhd_unit *drhd;
4532                        struct intel_iommu *iommu;
4533                        struct page *freelist;
4534
4535                        iova = find_iova(&si_domain->iovad, start_vpfn);
4536                        if (iova == NULL) {
4537                                pr_debug("Failed get IOVA for PFN %lx\n",
4538                                         start_vpfn);
4539                                break;
4540                        }
4541
4542                        iova = split_and_remove_iova(&si_domain->iovad, iova,
4543                                                     start_vpfn, last_vpfn);
4544                        if (iova == NULL) {
4545                                pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4546                                        start_vpfn, last_vpfn);
4547                                return NOTIFY_BAD;
4548                        }
4549
4550                        freelist = domain_unmap(si_domain, iova->pfn_lo,
4551                                               iova->pfn_hi);
4552
4553                        rcu_read_lock();
4554                        for_each_active_iommu(iommu, drhd)
4555                                iommu_flush_iotlb_psi(iommu, si_domain,
4556                                        iova->pfn_lo, iova_size(iova),
4557                                        !freelist, 0);
4558                        rcu_read_unlock();
4559                        dma_free_pagelist(freelist);
4560
4561                        start_vpfn = iova->pfn_hi + 1;
4562                        free_iova_mem(iova);
4563                }
4564                break;
4565        }
4566
4567        return NOTIFY_OK;
4568}
4569
4570static struct notifier_block intel_iommu_memory_nb = {
4571        .notifier_call = intel_iommu_memory_notifier,
4572        .priority = 0
4573};
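/*
 * This notifier keeps si_domain's identity map in step with memory hotplug:
 * newly onlined ranges are identity-mapped, and offlined ranges are unmapped
 * and their IOTLB entries flushed. It is only registered in intel_iommu_init()
 * below when si_domain exists and hardware pass-through is not in use.
 */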
4574
4575static void free_all_cpu_cached_iovas(unsigned int cpu)
4576{
4577        int i;
4578
4579        for (i = 0; i < g_num_of_iommus; i++) {
4580                struct intel_iommu *iommu = g_iommus[i];
4581                struct dmar_domain *domain;
4582                int did;
4583
4584                if (!iommu)
4585                        continue;
4586
4587                for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4588                        domain = get_iommu_domain(iommu, (u16)did);
4589
4590                        if (!domain)
4591                                continue;
4592                        free_cpu_cached_iovas(cpu, &domain->iovad);
4593                }
4594        }
4595}
4596
4597static int intel_iommu_cpu_dead(unsigned int cpu)
4598{
4599        free_all_cpu_cached_iovas(cpu);
4600        return 0;
4601}
4602
4603static void intel_disable_iommus(void)
4604{
4605        struct intel_iommu *iommu = NULL;
4606        struct dmar_drhd_unit *drhd;
4607
4608        for_each_iommu(iommu, drhd)
4609                iommu_disable_translation(iommu);
4610}
4611
4612static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4613{
4614        struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4615
4616        return container_of(iommu_dev, struct intel_iommu, iommu);
4617}
4618
4619static ssize_t intel_iommu_show_version(struct device *dev,
4620                                        struct device_attribute *attr,
4621                                        char *buf)
4622{
4623        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4624        u32 ver = readl(iommu->reg + DMAR_VER_REG);
4625        return sprintf(buf, "%d:%d\n",
4626                       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4627}
4628static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4629
4630static ssize_t intel_iommu_show_address(struct device *dev,
4631                                        struct device_attribute *attr,
4632                                        char *buf)
4633{
4634        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4635        return sprintf(buf, "%llx\n", iommu->reg_phys);
4636}
4637static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4638
4639static ssize_t intel_iommu_show_cap(struct device *dev,
4640                                    struct device_attribute *attr,
4641                                    char *buf)
4642{
4643        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4644        return sprintf(buf, "%llx\n", iommu->cap);
4645}
4646static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4647
4648static ssize_t intel_iommu_show_ecap(struct device *dev,
4649                                    struct device_attribute *attr,
4650                                    char *buf)
4651{
4652        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4653        return sprintf(buf, "%llx\n", iommu->ecap);
4654}
4655static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4656
4657static ssize_t intel_iommu_show_ndoms(struct device *dev,
4658                                      struct device_attribute *attr,
4659                                      char *buf)
4660{
4661        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4662        return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4663}
4664static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4665
4666static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4667                                           struct device_attribute *attr,
4668                                           char *buf)
4669{
4670        struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4671        return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4672                                                  cap_ndoms(iommu->cap)));
4673}
4674static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4675
4676static struct attribute *intel_iommu_attrs[] = {
4677        &dev_attr_version.attr,
4678        &dev_attr_address.attr,
4679        &dev_attr_cap.attr,
4680        &dev_attr_ecap.attr,
4681        &dev_attr_domains_supported.attr,
4682        &dev_attr_domains_used.attr,
4683        NULL,
4684};
4685
4686static struct attribute_group intel_iommu_group = {
4687        .name = "intel-iommu",
4688        .attrs = intel_iommu_attrs,
4689};
4690
4691const struct attribute_group *intel_iommu_groups[] = {
4692        &intel_iommu_group,
4693        NULL,
4694};
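/*
 * intel_iommu_groups is handed to iommu_device_sysfs_add() in
 * intel_iommu_init() below, so each remapping unit is expected to expose
 * these attributes under the iommu class, e.g. (paths illustrative, the
 * unit name comes from iommu->name):
 *
 *   /sys/class/iommu/dmar0/intel-iommu/version
 *   /sys/class/iommu/dmar0/intel-iommu/address
 *   /sys/class/iommu/dmar0/intel-iommu/cap
 *   /sys/class/iommu/dmar0/intel-iommu/ecap
 *   /sys/class/iommu/dmar0/intel-iommu/domains_supported
 *   /sys/class/iommu/dmar0/intel-iommu/domains_used
 */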
4695
4696int __init intel_iommu_init(void)
4697{
4698        int ret = -ENODEV;
4699        struct dmar_drhd_unit *drhd;
4700        struct intel_iommu *iommu;
4701
4702        /* VT-d is required for a TXT/tboot launch, so enforce that */
4703        force_on = tboot_force_iommu();
4704
4705        if (iommu_init_mempool()) {
4706                if (force_on)
4707                        panic("tboot: Failed to initialize iommu memory\n");
4708                return -ENOMEM;
4709        }
4710
4711        down_write(&dmar_global_lock);
4712        if (dmar_table_init()) {
4713                if (force_on)
4714                        panic("tboot: Failed to initialize DMAR table\n");
4715                goto out_free_dmar;
4716        }
4717
4718        if (dmar_dev_scope_init() < 0) {
4719                if (force_on)
4720                        panic("tboot: Failed to initialize DMAR device scope\n");
4721                goto out_free_dmar;
4722        }
4723
4724        up_write(&dmar_global_lock);
4725
4726        /*
4727         * The bus notifier takes the dmar_global_lock, so lockdep will
4728         * complain later when we register it under the lock.
4729         */
4730        dmar_register_bus_notifier();
4731
4732        down_write(&dmar_global_lock);
4733
4734        if (no_iommu || dmar_disabled) {
4735                /*
4736                 * We exit the function here to ensure that the IOMMU's
4737                 * remapping and mempool aren't set up, which means the
4738                 * IOMMU's PMRs won't be disabled via the call to
4739                 * init_dmars(). So disable them explicitly here. The PMRs
4740                 * were set up by tboot prior to calling SENTER, but the
4741                 * kernel is expected to reset/tear them down.
4742                 */
4743                if (intel_iommu_tboot_noforce) {
4744                        for_each_iommu(iommu, drhd)
4745                                iommu_disable_protect_mem_regions(iommu);
4746                }
4747
4748                /*
4749                 * Make sure the IOMMUs are switched off, even when we
4750                 * boot into a kexec kernel and the previous kernel left
4751                 * them enabled.
4752                 */
4753                intel_disable_iommus();
4754                goto out_free_dmar;
4755        }
4756
4757        if (list_empty(&dmar_rmrr_units))
4758                pr_info("No RMRR found\n");
4759
4760        if (list_empty(&dmar_atsr_units))
4761                pr_info("No ATSR found\n");
4762
4763        if (dmar_init_reserved_ranges()) {
4764                if (force_on)
4765                        panic("tboot: Failed to reserve iommu ranges\n");
4766                goto out_free_reserved_range;
4767        }
4768
4769        init_no_remapping_devices();
4770
4771        ret = init_dmars();
4772        if (ret) {
4773                if (force_on)
4774                        panic("tboot: Failed to initialize DMARs\n");
4775                pr_err("Initialization failed\n");
4776                goto out_free_reserved_range;
4777        }
4778        up_write(&dmar_global_lock);
4779        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4780
4781#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4782        swiotlb = 0;
4783#endif
4784        dma_ops = &intel_dma_ops;
4785
4786        init_iommu_pm_ops();
4787
4788        for_each_active_iommu(iommu, drhd) {
4789                iommu_device_sysfs_add(&iommu->iommu, NULL,
4790                                       intel_iommu_groups,
4791                                       "%s", iommu->name);
4792                iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4793                iommu_device_register(&iommu->iommu);
4794        }
4795
4796        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4797        bus_register_notifier(&pci_bus_type, &device_nb);
4798        if (si_domain && !hw_pass_through)
4799                register_memory_notifier(&intel_iommu_memory_nb);
4800        cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4801                          intel_iommu_cpu_dead);
4802        intel_iommu_enabled = 1;
4803
4804        return 0;
4805
4806out_free_reserved_range:
4807        put_iova_domain(&reserved_iova_list);
4808out_free_dmar:
4809        intel_iommu_free_dmars();
4810        up_write(&dmar_global_lock);
4811        iommu_exit_mempool();
4812        return ret;
4813}
4814
4815static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4816{
4817        struct intel_iommu *iommu = opaque;
4818
4819        domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4820        return 0;
4821}
4822
4823/*
4824 * NB - intel-iommu lacks any sort of reference counting for the users of
4825 * dependent devices.  If multiple endpoints have intersecting dependent
4826 * devices, unbinding the driver from any one of them will possibly leave
4827 * the others unable to operate.
4828 */
4829static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4830{
4831        if (!iommu || !dev || !dev_is_pci(dev))
4832                return;
4833
4834        pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4835}
4836
4837static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4838{
4839        struct intel_iommu *iommu;
4840        unsigned long flags;
4841
4842        assert_spin_locked(&device_domain_lock);
4843
4844        if (WARN_ON(!info))
4845                return;
4846
4847        iommu = info->iommu;
4848
4849        if (info->dev) {
4850                iommu_disable_dev_iotlb(info);
4851                domain_context_clear(iommu, info->dev);
4852        }
4853
4854        unlink_domain_info(info);
4855
4856        spin_lock_irqsave(&iommu->lock, flags);
4857        domain_detach_iommu(info->domain, iommu);
4858        spin_unlock_irqrestore(&iommu->lock, flags);
4859
4860        free_devinfo_mem(info);
4861}
4862
4863static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4864                                     struct device *dev)
4865{
4866        struct device_domain_info *info;
4867        unsigned long flags;
4868
4869        spin_lock_irqsave(&device_domain_lock, flags);
4870        info = dev->archdata.iommu;
4871        __dmar_remove_one_dev_info(info);
4872        spin_unlock_irqrestore(&device_domain_lock, flags);
4873}
4874
4875static int md_domain_init(struct dmar_domain *domain, int guest_width)
4876{
4877        int adjust_width;
4878
4879        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4880        domain_reserve_special_ranges(domain);
4881
4882        /* calculate AGAW */
4883        domain->gaw = guest_width;
4884        adjust_width = guestwidth_to_adjustwidth(guest_width);
4885        domain->agaw = width_to_agaw(adjust_width);
4886
4887        domain->iommu_coherency = 0;
4888        domain->iommu_snooping = 0;
4889        domain->iommu_superpage = 0;
4890        domain->max_addr = 0;
4891
4892        /* always allocate the top pgd */
4893        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4894        if (!domain->pgd)
4895                return -ENOMEM;
4896        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4897        return 0;
4898}
4899
4900static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4901{
4902        struct dmar_domain *dmar_domain;
4903        struct iommu_domain *domain;
4904
4905        if (type != IOMMU_DOMAIN_UNMANAGED)
4906                return NULL;
4907
4908        dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4909        if (!dmar_domain) {
4910                pr_err("Can't allocate dmar_domain\n");
4911                return NULL;
4912        }
4913        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4914                pr_err("Domain initialization failed\n");
4915                domain_exit(dmar_domain);
4916                return NULL;
4917        }
4918        domain_update_iommu_cap(dmar_domain);
4919
4920        domain = &dmar_domain->domain;
4921        domain->geometry.aperture_start = 0;
4922        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4923        domain->geometry.force_aperture = true;
4924
4925        return domain;
4926}
4927
4928static void intel_iommu_domain_free(struct iommu_domain *domain)
4929{
4930        domain_exit(to_dmar_domain(domain));
4931}
4932
4933static int intel_iommu_attach_device(struct iommu_domain *domain,
4934                                     struct device *dev)
4935{
4936        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4937        struct intel_iommu *iommu;
4938        int addr_width;
4939        u8 bus, devfn;
4940
4941        if (device_is_rmrr_locked(dev)) {
4942                dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4943                return -EPERM;
4944        }
4945
4946        /* normally dev is not mapped */
4947        if (unlikely(domain_context_mapped(dev))) {
4948                struct dmar_domain *old_domain;
4949
4950                old_domain = find_domain(dev);
4951                if (old_domain) {
4952                        rcu_read_lock();
4953                        dmar_remove_one_dev_info(old_domain, dev);
4954                        rcu_read_unlock();
4955
4956                        if (!domain_type_is_vm_or_si(old_domain) &&
4957                             list_empty(&old_domain->devices))
4958                                domain_exit(old_domain);
4959                }
4960        }
4961
4962        iommu = device_to_iommu(dev, &bus, &devfn);
4963        if (!iommu)
4964                return -ENODEV;
4965
4966        /* check if this iommu agaw is sufficient for max mapped address */
4967        addr_width = agaw_to_width(iommu->agaw);
4968        if (addr_width > cap_mgaw(iommu->cap))
4969                addr_width = cap_mgaw(iommu->cap);
4970
4971        if (dmar_domain->max_addr > (1LL << addr_width)) {
4972                pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4973                       __func__, addr_width,
4974                       dmar_domain->max_addr);
4975                return -EFAULT;
4976        }
4977        dmar_domain->gaw = addr_width;
4978
4979        /*
4980         * Knock out extra levels of page tables if necessary
4981         */
4982        while (iommu->agaw < dmar_domain->agaw) {
4983                struct dma_pte *pte;
4984
4985                pte = dmar_domain->pgd;
4986                if (dma_pte_present(pte)) {
4987                        dmar_domain->pgd = (struct dma_pte *)
4988                                phys_to_virt(dma_pte_addr(pte));
4989                        free_pgtable_page(pte);
4990                }
4991                dmar_domain->agaw--;
4992        }
4993
4994        return domain_add_dev_info(dmar_domain, dev);
4995}
4996
4997static void intel_iommu_detach_device(struct iommu_domain *domain,
4998                                      struct device *dev)
4999{
5000        dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5001}
5002
5003static int intel_iommu_map(struct iommu_domain *domain,
5004                           unsigned long iova, phys_addr_t hpa,
5005                           size_t size, int iommu_prot)
5006{
5007        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5008        u64 max_addr;
5009        int prot = 0;
5010        int ret;
5011
5012        if (iommu_prot & IOMMU_READ)
5013                prot |= DMA_PTE_READ;
5014        if (iommu_prot & IOMMU_WRITE)
5015                prot |= DMA_PTE_WRITE;
5016        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5017                prot |= DMA_PTE_SNP;
5018
5019        max_addr = iova + size;
5020        if (dmar_domain->max_addr < max_addr) {
5021                u64 end;
5022
5023                /* check if minimum agaw is sufficient for mapped address */
5024                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5025                if (end < max_addr) {
5026                        pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5027                               __func__, dmar_domain->gaw,
5028                               max_addr);
5029                        return -EFAULT;
5030                }
5031                dmar_domain->max_addr = max_addr;
5032        }
5033        /* Round up size to the next multiple of PAGE_SIZE, if it and
5034           the low bits of hpa would take us onto the next page. */
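        /*
         * Worked example (4KiB pages and the usual aligned_nrpages()
         * helper assumed): hpa = 0x1001 with size = 0x1000 straddles a
         * page boundary, so aligned_nrpages() returns 2 and two page
         * frames are mapped below instead of one.
         */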
5035        size = aligned_nrpages(hpa, size);
5036        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5037                                 hpa >> VTD_PAGE_SHIFT, size, prot);
5038        return ret;
5039}
5040
5041static size_t intel_iommu_unmap(struct iommu_domain *domain,
5042                                unsigned long iova, size_t size)
5043{
5044        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5045        struct page *freelist = NULL;
5046        unsigned long start_pfn, last_pfn;
5047        unsigned int npages;
5048        int iommu_id, level = 0;
5049
5050        /* Cope with horrid API which requires us to unmap more than the
5051           size argument if it happens to be a large-page mapping. */
5052        BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5053
5054        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5055                size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5056
5057        start_pfn = iova >> VTD_PAGE_SHIFT;
5058        last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5059
5060        freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5061
5062        npages = last_pfn - start_pfn + 1;
5063
5064        for_each_domain_iommu(iommu_id, dmar_domain)
5065                iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5066                                      start_pfn, npages, !freelist, 0);
5067
5068        dma_free_pagelist(freelist);
5069
5070        if (dmar_domain->max_addr == iova + size)
5071                dmar_domain->max_addr = iova;
5072
5073        return size;
5074}
5075
5076static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5077                                            dma_addr_t iova)
5078{
5079        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5080        struct dma_pte *pte;
5081        int level = 0;
5082        u64 phys = 0;
5083
5084        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5085        if (pte)
5086                phys = dma_pte_addr(pte);
5087
5088        return phys;
5089}
5090
5091static bool intel_iommu_capable(enum iommu_cap cap)
5092{
5093        if (cap == IOMMU_CAP_CACHE_COHERENCY)
5094                return domain_update_iommu_snooping(NULL) == 1;
5095        if (cap == IOMMU_CAP_INTR_REMAP)
5096                return irq_remapping_enabled == 1;
5097
5098        return false;
5099}
5100
5101static int intel_iommu_add_device(struct device *dev)
5102{
5103        struct intel_iommu *iommu;
5104        struct iommu_group *group;
5105        u8 bus, devfn;
5106
5107        iommu = device_to_iommu(dev, &bus, &devfn);
5108        if (!iommu)
5109                return -ENODEV;
5110
5111        iommu_device_link(&iommu->iommu, dev);
5112
5113        group = iommu_group_get_for_dev(dev);
5114
5115        if (IS_ERR(group))
5116                return PTR_ERR(group);
5117
5118        iommu_group_put(group);
5119        return 0;
5120}
5121
5122static void intel_iommu_remove_device(struct device *dev)
5123{
5124        struct intel_iommu *iommu;
5125        u8 bus, devfn;
5126
5127        iommu = device_to_iommu(dev, &bus, &devfn);
5128        if (!iommu)
5129                return;
5130
5131        iommu_group_remove_device(dev);
5132
5133        iommu_device_unlink(&iommu->iommu, dev);
5134}
5135
5136static void intel_iommu_get_resv_regions(struct device *device,
5137                                         struct list_head *head)
5138{
5139        struct iommu_resv_region *reg;
5140        struct dmar_rmrr_unit *rmrr;
5141        struct device *i_dev;
5142        int i;
5143
5144        rcu_read_lock();
5145        for_each_rmrr_units(rmrr) {
5146                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5147                                          i, i_dev) {
5148                        if (i_dev != device)
5149                                continue;
5150
5151                        list_add_tail(&rmrr->resv->list, head);
5152                }
5153        }
5154        rcu_read_unlock();
5155
5156        reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5157                                      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5158                                      0, IOMMU_RESV_MSI);
5159        if (!reg)
5160                return;
5161        list_add_tail(&reg->list, head);
5162}
5163
5164static void intel_iommu_put_resv_regions(struct device *dev,
5165                                         struct list_head *head)
5166{
5167        struct iommu_resv_region *entry, *next;
5168
5169        list_for_each_entry_safe(entry, next, head, list) {
5170                if (entry->type == IOMMU_RESV_RESERVED)
5171                        kfree(entry);
5172        }
5173}
5174
5175#ifdef CONFIG_INTEL_IOMMU_SVM
5176#define MAX_NR_PASID_BITS (20)
5177static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5178{
5179        /*
5180         * Convert ecap_pss to the extended context entry pts encoding, also
5181         * respect the soft pasid_max value set by the iommu.
5182         * - number of PASID bits = ecap_pss + 1
5183         * - number of PASID table entries = 2^(pts + 5)
5184         * Therefore, pts = ecap_pss - 4
5185         * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5186         */
5187        if (ecap_pss(iommu->ecap) < 5)
5188                return 0;
5189
5190        /* pasid_max is encoded as the actual number of entries, not in bits */
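        /*
         * Worked example (pasid_max value assumed): with iommu->pasid_max
         * set to 0x8000, i.e. 2^15 table entries, find_first_bit() returns
         * 15 and this helper yields pts = 10, so the context entry
         * advertises 2^(10 + 5) = 2^15 PASID table entries, matching the
         * soft limit.
         */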
5191        return find_first_bit((unsigned long *)&iommu->pasid_max,
5192                        MAX_NR_PASID_BITS) - 5;
5193}
5194
5195int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5196{
5197        struct device_domain_info *info;
5198        struct context_entry *context;
5199        struct dmar_domain *domain;
5200        unsigned long flags;
5201        u64 ctx_lo;
5202        int ret;
5203
5204        domain = get_valid_domain_for_dev(sdev->dev);
5205        if (!domain)
5206                return -EINVAL;
5207
5208        spin_lock_irqsave(&device_domain_lock, flags);
5209        spin_lock(&iommu->lock);
5210
5211        ret = -EINVAL;
5212        info = sdev->dev->archdata.iommu;
5213        if (!info || !info->pasid_supported)
5214                goto out;
5215
5216        context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5217        if (WARN_ON(!context))
5218                goto out;
5219
5220        ctx_lo = context[0].lo;
5221
5222        sdev->did = domain->iommu_did[iommu->seq_id];
5223        sdev->sid = PCI_DEVID(info->bus, info->devfn);
5224
5225        if (!(ctx_lo & CONTEXT_PASIDE)) {
5226                if (iommu->pasid_state_table)
5227                        context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5228                context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5229                        intel_iommu_get_pts(iommu);
5230
5231                wmb();
5232                /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5233                 * extended to permit requests-with-PASID if the PASIDE bit
5234                 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
5235                 * however, the PASIDE bit is ignored and requests-with-PASID
5236                 * are unconditionally blocked. Which makes less sense.
5237                 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5238                 * "guest mode" translation types depending on whether ATS
5239                 * is available or not. Annoyingly, we can't use the new
5240                 * modes *unless* PASIDE is set. */
5241                if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5242                        ctx_lo &= ~CONTEXT_TT_MASK;
5243                        if (info->ats_supported)
5244                                ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5245                        else
5246                                ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5247                }
5248                ctx_lo |= CONTEXT_PASIDE;
5249                if (iommu->pasid_state_table)
5250                        ctx_lo |= CONTEXT_DINVE;
5251                if (info->pri_supported)
5252                        ctx_lo |= CONTEXT_PRS;
5253                context[0].lo = ctx_lo;
5254                wmb();
5255                iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5256                                           DMA_CCMD_MASK_NOBIT,
5257                                           DMA_CCMD_DEVICE_INVL);
5258        }
5259
5260        /* Enable PASID support in the device, if it wasn't already */
5261        if (!info->pasid_enabled)
5262                iommu_enable_dev_iotlb(info);
5263
5264        if (info->ats_enabled) {
5265                sdev->dev_iotlb = 1;
5266                sdev->qdep = info->ats_qdep;
5267                if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5268                        sdev->qdep = 0;
5269        }
5270        ret = 0;
5271
5272 out:
5273        spin_unlock(&iommu->lock);
5274        spin_unlock_irqrestore(&device_domain_lock, flags);
5275
5276        return ret;
5277}
5278
5279struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5280{
5281        struct intel_iommu *iommu;
5282        u8 bus, devfn;
5283
5284        if (iommu_dummy(dev)) {
5285                dev_warn(dev,
5286                         "No IOMMU translation for device; cannot enable SVM\n");
5287                return NULL;
5288        }
5289
5290        iommu = device_to_iommu(dev, &bus, &devfn);
5291        if (!iommu) {
5292                dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5293                return NULL;
5294        }
5295
5296        if (!iommu->pasid_table) {
5297                dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5298                return NULL;
5299        }
5300
5301        return iommu;
5302}
5303#endif /* CONFIG_INTEL_IOMMU_SVM */
5304
5305const struct iommu_ops intel_iommu_ops = {
5306        .capable                = intel_iommu_capable,
5307        .domain_alloc           = intel_iommu_domain_alloc,
5308        .domain_free            = intel_iommu_domain_free,
5309        .attach_dev             = intel_iommu_attach_device,
5310        .detach_dev             = intel_iommu_detach_device,
5311        .map                    = intel_iommu_map,
5312        .unmap                  = intel_iommu_unmap,
5313        .map_sg                 = default_iommu_map_sg,
5314        .iova_to_phys           = intel_iommu_iova_to_phys,
5315        .add_device             = intel_iommu_add_device,
5316        .remove_device          = intel_iommu_remove_device,
5317        .get_resv_regions       = intel_iommu_get_resv_regions,
5318        .put_resv_regions       = intel_iommu_put_resv_regions,
5319        .device_group           = pci_device_group,
5320        .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5321};
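/*
 * A minimal sketch of how these callbacks are reached through the generic
 * IOMMU API (error handling trimmed; "dev", "page", the IOVA and the sizes
 * are illustrative placeholders, not values used by this driver):
 *
 *     struct iommu_domain *domain = iommu_domain_alloc(&pci_bus_type);
 *
 *     iommu_attach_device(domain, dev);          -> intel_iommu_attach_device()
 *     iommu_map(domain, 0x100000, page_to_phys(page), PAGE_SIZE,
 *               IOMMU_READ | IOMMU_WRITE);       -> intel_iommu_map()
 *     iommu_unmap(domain, 0x100000, PAGE_SIZE);  -> intel_iommu_unmap()
 *     iommu_detach_device(domain, dev);          -> intel_iommu_detach_device()
 *     iommu_domain_free(domain);                 -> intel_iommu_domain_free()
 */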
5322
5323static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5324{
5325        /* G4x/GM45 integrated gfx dmar support is totally busted. */
5326        pr_info("Disabling IOMMU for graphics on this chipset\n");
5327        dmar_map_gfx = 0;
5328}
5329
5330DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5331DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5332DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5333DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5334DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5335DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5336DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5337
5338static void quirk_iommu_rwbf(struct pci_dev *dev)
5339{
5340        /*
5341         * Mobile 4 Series Chipset neglects to set RWBF capability,
5342         * but needs it. Same seems to hold for the desktop versions.
5343         */
5344        pr_info("Forcing write-buffer flush capability\n");
5345        rwbf_quirk = 1;
5346}
5347
5348DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5349DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5350DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5351DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5352DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5353DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5354DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5355
5356#define GGC 0x52
5357#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5358#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5359#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5360#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5361#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5362#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5363#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5364#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5365
5366static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5367{
5368        unsigned short ggc;
5369
5370        if (pci_read_config_word(dev, GGC, &ggc))
5371                return;
5372
5373        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5374                pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5375                dmar_map_gfx = 0;
5376        } else if (dmar_map_gfx) {
5377                /* we have to ensure the gfx device is idle before we flush */
5378                pr_info("Disabling batched IOTLB flush on Ironlake\n");
5379                intel_iommu_strict = 1;
5380        }
5381}
5382DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5383DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5384DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5385DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5386
5387/* On Tylersburg chipsets, some BIOSes have been known to enable the
5388   ISOCH DMAR unit for the Azalia sound device, but not give it any
5389   TLB entries, which causes it to deadlock. Check for that.  We do
5390   this in a function called from init_dmars(), instead of in a PCI
5391   quirk, because we don't want to print the obnoxious "BIOS broken"
5392   message if VT-d is actually disabled.
5393*/
5394static void __init check_tylersburg_isoch(void)
5395{
5396        struct pci_dev *pdev;
5397        uint32_t vtisochctrl;
5398
5399        /* If there's no Azalia in the system anyway, forget it. */
5400        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5401        if (!pdev)
5402                return;
5403        pci_dev_put(pdev);
5404
5405        /* System Management Registers. Might be hidden, in which case
5406           we can't do the sanity check. But that's OK, because the
5407           known-broken BIOSes _don't_ actually hide it, so far. */
5408        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5409        if (!pdev)
5410                return;
5411
5412        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5413                pci_dev_put(pdev);
5414                return;
5415        }
5416
5417        pci_dev_put(pdev);
5418
5419        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5420        if (vtisochctrl & 1)
5421                return;
5422
5423        /* Drop all bits other than the number of TLB entries */
5424        vtisochctrl &= 0x1c;
5425
5426        /* If we have the recommended number of TLB entries (16), fine. */
5427        if (vtisochctrl == 0x10)
5428                return;
5429
5430        /* Zero TLB entries? You get to ride the short bus to school. */
5431        if (!vtisochctrl) {
5432                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5433                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5434                     dmi_get_system_info(DMI_BIOS_VENDOR),
5435                     dmi_get_system_info(DMI_BIOS_VERSION),
5436                     dmi_get_system_info(DMI_PRODUCT_VERSION));
5437                iommu_identity_mapping |= IDENTMAP_AZALIA;
5438                return;
5439        }
5440
5441        pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5442               vtisochctrl);
5443}
5444