linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright © 2006-2014 Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * Authors: David Woodhouse <dwmw2@infradead.org>,
  14 *          Ashok Raj <ashok.raj@intel.com>,
  15 *          Shaohua Li <shaohua.li@intel.com>,
  16 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17 *          Fenghua Yu <fenghua.yu@intel.com>
  18 *          Joerg Roedel <jroedel@suse.de>
  19 */
  20
  21#define pr_fmt(fmt)     "DMAR: " fmt
  22
  23#include <linux/init.h>
  24#include <linux/bitmap.h>
  25#include <linux/debugfs.h>
  26#include <linux/export.h>
  27#include <linux/slab.h>
  28#include <linux/irq.h>
  29#include <linux/interrupt.h>
  30#include <linux/spinlock.h>
  31#include <linux/pci.h>
  32#include <linux/dmar.h>
  33#include <linux/dma-mapping.h>
  34#include <linux/mempool.h>
  35#include <linux/memory.h>
  36#include <linux/cpu.h>
  37#include <linux/timer.h>
  38#include <linux/io.h>
  39#include <linux/iova.h>
  40#include <linux/iommu.h>
  41#include <linux/intel-iommu.h>
  42#include <linux/syscore_ops.h>
  43#include <linux/tboot.h>
  44#include <linux/dmi.h>
  45#include <linux/pci-ats.h>
  46#include <linux/memblock.h>
  47#include <linux/dma-contiguous.h>
  48#include <linux/crash_dump.h>
  49#include <asm/irq_remapping.h>
  50#include <asm/cacheflush.h>
  51#include <asm/iommu.h>
  52
  53#include "irq_remapping.h"
  54
  55#define ROOT_SIZE               VTD_PAGE_SIZE
  56#define CONTEXT_SIZE            VTD_PAGE_SIZE
  57
  58#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  59#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  60#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  61#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  62
  63#define IOAPIC_RANGE_START      (0xfee00000)
  64#define IOAPIC_RANGE_END        (0xfeefffff)
  65#define IOVA_START_ADDR         (0x1000)
  66
  67#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  68
  69#define MAX_AGAW_WIDTH 64
  70#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  71
  72#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  73#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  74
  75/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  76   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  77#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  78                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  79#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
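
/*
 * Worked example (for illustration only): with gaw = 48 and
 * VTD_PAGE_SHIFT = 12, __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1. On a
 * 64-bit kernel that fits in an unsigned long, so DOMAIN_MAX_PFN(48)
 * is 2^36 - 1; on 32-bit it is clamped to ULONG_MAX. DOMAIN_MAX_ADDR(48)
 * is (2^36 - 1) << 12, the base address of the last representable
 * VT-d page frame.
 */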
  80
  81/* IO virtual address start page frame number */
  82#define IOVA_START_PFN          (1)
  83
  84#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  85#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  86#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  87
  88/* page table handling */
  89#define LEVEL_STRIDE            (9)
  90#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  91
  92/*
   93 * This bitmap is used to advertise the page sizes our hardware supports
  94 * to the IOMMU core, which will then use this information to split
  95 * physically contiguous memory regions it is mapping into page sizes
  96 * that we support.
  97 *
  98 * Traditionally the IOMMU core just handed us the mappings directly,
  99 * after making sure the size is an order of a 4KiB page and that the
 100 * mapping has natural alignment.
 101 *
 102 * To retain this behavior, we currently advertise that we support
 103 * all page sizes that are an order of 4KiB.
 104 *
 105 * If at some point we'd like to utilize the IOMMU core's new behavior,
 106 * we could change this to advertise the real page sizes we support.
 107 */
 108#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
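
/*
 * A minimal sketch of what the mask above advertises (assuming 4KiB
 * VT-d pages): ~0xFFFUL has every bit >= 12 set, so any power-of-two,
 * naturally aligned size of at least 4KiB is accepted, e.g.
 *
 *        SZ_4K & INTEL_IOMMU_PGSIZES  -> 0x1000
 *        SZ_2M & INTEL_IOMMU_PGSIZES  -> 0x200000
 *        SZ_1G & INTEL_IOMMU_PGSIZES  -> 0x40000000
 */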
 109
 110static inline int agaw_to_level(int agaw)
 111{
 112        return agaw + 2;
 113}
 114
 115static inline int agaw_to_width(int agaw)
 116{
 117        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118}
 119
 120static inline int width_to_agaw(int width)
 121{
 122        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123}
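
/*
 * Worked example of the agaw helpers above (illustrative): with
 * LEVEL_STRIDE = 9,
 *
 *        agaw 0 -> 30-bit width -> 2-level table
 *        agaw 1 -> 39-bit width -> 3-level table
 *        agaw 2 -> 48-bit width -> 4-level table
 *        agaw 3 -> 57-bit width -> 5-level table
 *
 * e.g. width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2 and
 * agaw_to_level(2) = 4.
 */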
 124
 125static inline unsigned int level_to_offset_bits(int level)
 126{
 127        return (level - 1) * LEVEL_STRIDE;
 128}
 129
 130static inline int pfn_level_offset(unsigned long pfn, int level)
 131{
 132        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133}
 134
 135static inline unsigned long level_mask(int level)
 136{
 137        return -1UL << level_to_offset_bits(level);
 138}
 139
 140static inline unsigned long level_size(int level)
 141{
 142        return 1UL << level_to_offset_bits(level);
 143}
 144
 145static inline unsigned long align_to_level(unsigned long pfn, int level)
 146{
 147        return (pfn + level_size(level) - 1) & level_mask(level);
 148}
 149
 150static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151{
 152        return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153}
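
/*
 * Illustrative use of the level helpers above: at level 2,
 * level_to_offset_bits() is 9, so pfn_level_offset(pfn, 2) picks bits
 * 9..17 of the VT-d pfn, level_size(2) is 512 pages (2MiB of IOVA),
 * and level_mask(2) clears the low 9 bits, i.e. aligns a pfn down to
 * a 2MiB boundary.
 */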
 154
 155/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 156   are never going to work. */
 157static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158{
 159        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160}
 161
 162static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163{
 164        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165}
 166static inline unsigned long page_to_dma_pfn(struct page *pg)
 167{
 168        return mm_to_dma_pfn(page_to_pfn(pg));
 169}
 170static inline unsigned long virt_to_dma_pfn(void *p)
 171{
 172        return page_to_dma_pfn(virt_to_page(p));
 173}
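
/*
 * Illustrative note (assuming 4KiB kernel pages): when PAGE_SHIFT ==
 * VTD_PAGE_SHIFT == 12 the pfn conversions above are identities; they
 * only shift when the CPU page size is larger, e.g. hypothetical
 * 64KiB kernel pages would make one mm pfn cover 16 VT-d pfns.
 */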
 174
 175/* global iommu list, set NULL for ignored DMAR units */
 176static struct intel_iommu **g_iommus;
 177
 178static void __init check_tylersburg_isoch(void);
 179static int rwbf_quirk;
 180
 181/*
 182 * set to 1 to panic kernel if can't successfully enable VT-d
 183 * (used when kernel is launched w/ TXT)
 184 */
 185static int force_on = 0;
 186
 187/*
 188 * 0: Present
 189 * 1-11: Reserved
 190 * 12-63: Context Ptr (12 - (haw-1))
 191 * 64-127: Reserved
 192 */
 193struct root_entry {
 194        u64     lo;
 195        u64     hi;
 196};
 197#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 198
 199/*
 200 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 201 * if marked present.
 202 */
 203static phys_addr_t root_entry_lctp(struct root_entry *re)
 204{
 205        if (!(re->lo & 1))
 206                return 0;
 207
 208        return re->lo & VTD_PAGE_MASK;
 209}
 210
 211/*
 212 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 213 * if marked present.
 214 */
 215static phys_addr_t root_entry_uctp(struct root_entry *re)
 216{
 217        if (!(re->hi & 1))
 218                return 0;
 219
 220        return re->hi & VTD_PAGE_MASK;
 221}
 222/*
 223 * low 64 bits:
 224 * 0: present
 225 * 1: fault processing disable
 226 * 2-3: translation type
 227 * 12-63: address space root
 228 * high 64 bits:
 229 * 0-2: address width
  230 * 3-6: avail
 231 * 8-23: domain id
 232 */
 233struct context_entry {
 234        u64 lo;
 235        u64 hi;
 236};
 237
 238static inline void context_clear_pasid_enable(struct context_entry *context)
 239{
 240        context->lo &= ~(1ULL << 11);
 241}
 242
 243static inline bool context_pasid_enabled(struct context_entry *context)
 244{
 245        return !!(context->lo & (1ULL << 11));
 246}
 247
 248static inline void context_set_copied(struct context_entry *context)
 249{
 250        context->hi |= (1ull << 3);
 251}
 252
 253static inline bool context_copied(struct context_entry *context)
 254{
 255        return !!(context->hi & (1ULL << 3));
 256}
 257
 258static inline bool __context_present(struct context_entry *context)
 259{
 260        return (context->lo & 1);
 261}
 262
 263static inline bool context_present(struct context_entry *context)
 264{
 265        return context_pasid_enabled(context) ?
 266             __context_present(context) :
 267             __context_present(context) && !context_copied(context);
 268}
 269
 270static inline void context_set_present(struct context_entry *context)
 271{
 272        context->lo |= 1;
 273}
 274
 275static inline void context_set_fault_enable(struct context_entry *context)
 276{
 277        context->lo &= (((u64)-1) << 2) | 1;
 278}
 279
 280static inline void context_set_translation_type(struct context_entry *context,
 281                                                unsigned long value)
 282{
 283        context->lo &= (((u64)-1) << 4) | 3;
 284        context->lo |= (value & 3) << 2;
 285}
 286
 287static inline void context_set_address_root(struct context_entry *context,
 288                                            unsigned long value)
 289{
 290        context->lo &= ~VTD_PAGE_MASK;
 291        context->lo |= value & VTD_PAGE_MASK;
 292}
 293
 294static inline void context_set_address_width(struct context_entry *context,
 295                                             unsigned long value)
 296{
 297        context->hi |= value & 7;
 298}
 299
 300static inline void context_set_domain_id(struct context_entry *context,
 301                                         unsigned long value)
 302{
 303        context->hi |= (value & ((1 << 16) - 1)) << 8;
 304}
 305
 306static inline int context_domain_id(struct context_entry *c)
 307{
 308        return((c->hi >> 8) & 0xffff);
 309}
 310
 311static inline void context_clear_entry(struct context_entry *context)
 312{
 313        context->lo = 0;
 314        context->hi = 0;
 315}
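
/*
 * A minimal sketch of how the helpers above are combined when a
 * context entry "ce" is programmed (illustrative only; the real flow
 * is in domain_context_mapping_one() later in this file):
 *
 *        context_clear_entry(ce);
 *        context_set_domain_id(ce, did);
 *        context_set_address_width(ce, agaw);
 *        context_set_address_root(ce, virt_to_phys(domain->pgd));
 *        context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *        context_set_fault_enable(ce);
 *        context_set_present(ce);
 */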
 316
 317/*
 318 * 0: readable
 319 * 1: writable
 320 * 2-6: reserved
 321 * 7: super page
 322 * 8-10: available
 323 * 11: snoop behavior
  324 * 12-63: Host physical address
 325 */
 326struct dma_pte {
 327        u64 val;
 328};
 329
 330static inline void dma_clear_pte(struct dma_pte *pte)
 331{
 332        pte->val = 0;
 333}
 334
 335static inline u64 dma_pte_addr(struct dma_pte *pte)
 336{
 337#ifdef CONFIG_64BIT
 338        return pte->val & VTD_PAGE_MASK;
 339#else
 340        /* Must have a full atomic 64-bit read */
 341        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 342#endif
 343}
 344
 345static inline bool dma_pte_present(struct dma_pte *pte)
 346{
 347        return (pte->val & 3) != 0;
 348}
 349
 350static inline bool dma_pte_superpage(struct dma_pte *pte)
 351{
 352        return (pte->val & DMA_PTE_LARGE_PAGE);
 353}
 354
 355static inline int first_pte_in_page(struct dma_pte *pte)
 356{
 357        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 358}
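
/*
 * Illustrative only: a present leaf PTE mapping host pfn "hpfn"
 * read/write would look like
 *
 *        pte->val = ((u64)hpfn << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 *
 * dma_pte_present() then sees the low R/W bits set, and dma_pte_addr()
 * masks the attribute bits back off to recover the physical address.
 */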
 359
 360/*
  361 * This domain is a static identity mapping domain.
  362 *      1. This domain creates a static 1:1 mapping to all usable memory.
  363 *      2. It maps to each iommu if successful.
  364 *      3. Each iommu maps to this domain if successful.
 365 */
 366static struct dmar_domain *si_domain;
 367static int hw_pass_through = 1;
 368
 369/*
  370 * Domain represents a virtual machine; more than one device
  371 * across iommus may be owned by one domain, e.g. a kvm guest.
 372 */
 373#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
 374
  375/* si_domain contains multiple devices */
 376#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
 377
 378#define for_each_domain_iommu(idx, domain)                      \
 379        for (idx = 0; idx < g_num_of_iommus; idx++)             \
 380                if (domain->iommu_refcnt[idx])
 381
 382struct dmar_domain {
 383        int     nid;                    /* node id */
 384
 385        unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
 386                                        /* Refcount of devices per iommu */
 387
 388
 389        u16             iommu_did[DMAR_UNITS_SUPPORTED];
 390                                        /* Domain ids per IOMMU. Use u16 since
 391                                         * domain ids are 16 bit wide according
 392                                         * to VT-d spec, section 9.3 */
 393
 394        bool has_iotlb_device;
 395        struct list_head devices;       /* all devices' list */
 396        struct iova_domain iovad;       /* iova's that belong to this domain */
 397
 398        struct dma_pte  *pgd;           /* virtual address */
 399        int             gaw;            /* max guest address width */
 400
 401        /* adjusted guest address width, 0 is level 2 30-bit */
 402        int             agaw;
 403
 404        int             flags;          /* flags to find out type of domain */
 405
 406        int             iommu_coherency;/* indicate coherency of iommu access */
 407        int             iommu_snooping; /* indicate snooping control feature*/
 408        int             iommu_count;    /* reference count of iommu */
 409        int             iommu_superpage;/* Level of superpages supported:
 410                                           0 == 4KiB (no superpages), 1 == 2MiB,
 411                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 412        u64             max_addr;       /* maximum mapped address */
 413
 414        struct iommu_domain domain;     /* generic domain data structure for
 415                                           iommu core */
 416};
 417
 418/* PCI domain-device relationship */
 419struct device_domain_info {
 420        struct list_head link;  /* link to domain siblings */
 421        struct list_head global; /* link to global list */
 422        u8 bus;                 /* PCI bus number */
 423        u8 devfn;               /* PCI devfn number */
 424        u8 pasid_supported:3;
 425        u8 pasid_enabled:1;
 426        u8 pri_supported:1;
 427        u8 pri_enabled:1;
 428        u8 ats_supported:1;
 429        u8 ats_enabled:1;
 430        u8 ats_qdep;
 431        struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
 432        struct intel_iommu *iommu; /* IOMMU used by this device */
 433        struct dmar_domain *domain; /* pointer to domain */
 434};
 435
 436struct dmar_rmrr_unit {
 437        struct list_head list;          /* list of rmrr units   */
 438        struct acpi_dmar_header *hdr;   /* ACPI header          */
 439        u64     base_address;           /* reserved base address*/
 440        u64     end_address;            /* reserved end address */
 441        struct dmar_dev_scope *devices; /* target devices */
 442        int     devices_cnt;            /* target device count */
 443};
 444
 445struct dmar_atsr_unit {
 446        struct list_head list;          /* list of ATSR units */
 447        struct acpi_dmar_header *hdr;   /* ACPI header */
 448        struct dmar_dev_scope *devices; /* target devices */
 449        int devices_cnt;                /* target device count */
 450        u8 include_all:1;               /* include all ports */
 451};
 452
 453static LIST_HEAD(dmar_atsr_units);
 454static LIST_HEAD(dmar_rmrr_units);
 455
 456#define for_each_rmrr_units(rmrr) \
 457        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 458
 459static void flush_unmaps_timeout(unsigned long data);
 460
 461struct deferred_flush_entry {
 462        unsigned long iova_pfn;
 463        unsigned long nrpages;
 464        struct dmar_domain *domain;
 465        struct page *freelist;
 466};
 467
 468#define HIGH_WATER_MARK 250
 469struct deferred_flush_table {
 470        int next;
 471        struct deferred_flush_entry entries[HIGH_WATER_MARK];
 472};
 473
 474struct deferred_flush_data {
 475        spinlock_t lock;
 476        int timer_on;
 477        struct timer_list timer;
 478        long size;
 479        struct deferred_flush_table *tables;
 480};
 481
 482DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
 483
  484/* number of IOMMUs in the system, used to size per-IOMMU arrays */
 485static int g_num_of_iommus;
 486
 487static void domain_exit(struct dmar_domain *domain);
 488static void domain_remove_dev_info(struct dmar_domain *domain);
 489static void dmar_remove_one_dev_info(struct dmar_domain *domain,
 490                                     struct device *dev);
 491static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 492static void domain_context_clear(struct intel_iommu *iommu,
 493                                 struct device *dev);
 494static int domain_detach_iommu(struct dmar_domain *domain,
 495                               struct intel_iommu *iommu);
 496
 497#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 498int dmar_disabled = 0;
 499#else
 500int dmar_disabled = 1;
 501#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 502
 503int intel_iommu_enabled = 0;
 504EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 505
 506static int dmar_map_gfx = 1;
 507static int dmar_forcedac;
 508static int intel_iommu_strict;
 509static int intel_iommu_superpage = 1;
 510static int intel_iommu_ecs = 1;
 511static int intel_iommu_pasid28;
 512static int iommu_identity_mapping;
 513
 514#define IDENTMAP_ALL            1
 515#define IDENTMAP_GFX            2
 516#define IDENTMAP_AZALIA         4
 517
 518/* Broadwell and Skylake have broken ECS support — normal so-called "second
 519 * level" translation of DMA requests-without-PASID doesn't actually happen
 520 * unless you also set the NESTE bit in an extended context-entry. Which of
 521 * course means that SVM doesn't work because it's trying to do nested
 522 * translation of the physical addresses it finds in the process page tables,
 523 * through the IOVA->phys mapping found in the "second level" page tables.
 524 *
 525 * The VT-d specification was retroactively changed to change the definition
 526 * of the capability bits and pretend that Broadwell/Skylake never happened...
 527 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 528 * for some reason it was the PASID capability bit which was redefined (from
 529 * bit 28 on BDW/SKL to bit 40 in future).
 530 *
 531 * So our test for ECS needs to eschew those implementations which set the old
  532 * PASID capability bit 28, since those are the ones on which ECS is broken.
 533 * Unless we are working around the 'pasid28' limitations, that is, by putting
 534 * the device into passthrough mode for normal DMA and thus masking the bug.
 535 */
 536#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 537                            (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 538/* PASID support is thus enabled if ECS is enabled and *either* of the old
 539 * or new capability bits are set. */
 540#define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
 541                              (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 542
 543int intel_iommu_gfx_mapped;
 544EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 545
 546#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 547static DEFINE_SPINLOCK(device_domain_lock);
 548static LIST_HEAD(device_domain_list);
 549
 550static const struct iommu_ops intel_iommu_ops;
 551
 552static bool translation_pre_enabled(struct intel_iommu *iommu)
 553{
 554        return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 555}
 556
 557static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 558{
 559        iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 560}
 561
 562static void init_translation_status(struct intel_iommu *iommu)
 563{
 564        u32 gsts;
 565
 566        gsts = readl(iommu->reg + DMAR_GSTS_REG);
 567        if (gsts & DMA_GSTS_TES)
 568                iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 569}
 570
  571/* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
 572static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 573{
 574        return container_of(dom, struct dmar_domain, domain);
 575}
 576
 577static int __init intel_iommu_setup(char *str)
 578{
 579        if (!str)
 580                return -EINVAL;
 581        while (*str) {
 582                if (!strncmp(str, "on", 2)) {
 583                        dmar_disabled = 0;
 584                        pr_info("IOMMU enabled\n");
 585                } else if (!strncmp(str, "off", 3)) {
 586                        dmar_disabled = 1;
 587                        pr_info("IOMMU disabled\n");
 588                } else if (!strncmp(str, "igfx_off", 8)) {
 589                        dmar_map_gfx = 0;
 590                        pr_info("Disable GFX device mapping\n");
 591                } else if (!strncmp(str, "forcedac", 8)) {
 592                        pr_info("Forcing DAC for PCI devices\n");
 593                        dmar_forcedac = 1;
 594                } else if (!strncmp(str, "strict", 6)) {
 595                        pr_info("Disable batched IOTLB flush\n");
 596                        intel_iommu_strict = 1;
 597                } else if (!strncmp(str, "sp_off", 6)) {
 598                        pr_info("Disable supported super page\n");
 599                        intel_iommu_superpage = 0;
 600                } else if (!strncmp(str, "ecs_off", 7)) {
 601                        printk(KERN_INFO
 602                                "Intel-IOMMU: disable extended context table support\n");
 603                        intel_iommu_ecs = 0;
 604                } else if (!strncmp(str, "pasid28", 7)) {
 605                        printk(KERN_INFO
 606                                "Intel-IOMMU: enable pre-production PASID support\n");
 607                        intel_iommu_pasid28 = 1;
 608                        iommu_identity_mapping |= IDENTMAP_GFX;
 609                }
 610
 611                str += strcspn(str, ",");
 612                while (*str == ',')
 613                        str++;
 614        }
 615        return 0;
 616}
 617__setup("intel_iommu=", intel_iommu_setup);
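
/*
 * Illustrative usage: booting with
 *
 *        intel_iommu=on,strict,sp_off
 *
 * walks the comma-separated options above and sets dmar_disabled = 0,
 * intel_iommu_strict = 1 and intel_iommu_superpage = 0.
 */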
 618
 619static struct kmem_cache *iommu_domain_cache;
 620static struct kmem_cache *iommu_devinfo_cache;
 621
 622static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 623{
 624        struct dmar_domain **domains;
 625        int idx = did >> 8;
 626
 627        domains = iommu->domains[idx];
 628        if (!domains)
 629                return NULL;
 630
 631        return domains[did & 0xff];
 632}
 633
 634static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 635                             struct dmar_domain *domain)
 636{
 637        struct dmar_domain **domains;
 638        int idx = did >> 8;
 639
 640        if (!iommu->domains[idx]) {
 641                size_t size = 256 * sizeof(struct dmar_domain *);
 642                iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 643        }
 644
 645        domains = iommu->domains[idx];
 646        if (WARN_ON(!domains))
 647                return;
 648        else
 649                domains[did & 0xff] = domain;
 650}
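
/*
 * Sketch of the lookup scheme above (illustrative): a 16-bit domain id
 * is split into a 256-entry outer index and a 256-entry inner index,
 * so e.g. did 0x1234 lives at iommu->domains[0x12][0x34]; the outer
 * pages are only allocated on first use by set_iommu_domain().
 */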
 651
 652static inline void *alloc_pgtable_page(int node)
 653{
 654        struct page *page;
 655        void *vaddr = NULL;
 656
 657        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 658        if (page)
 659                vaddr = page_address(page);
 660        return vaddr;
 661}
 662
 663static inline void free_pgtable_page(void *vaddr)
 664{
 665        free_page((unsigned long)vaddr);
 666}
 667
 668static inline void *alloc_domain_mem(void)
 669{
 670        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 671}
 672
 673static void free_domain_mem(void *vaddr)
 674{
 675        kmem_cache_free(iommu_domain_cache, vaddr);
 676}
 677
 678static inline void * alloc_devinfo_mem(void)
 679{
 680        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 681}
 682
 683static inline void free_devinfo_mem(void *vaddr)
 684{
 685        kmem_cache_free(iommu_devinfo_cache, vaddr);
 686}
 687
 688static inline int domain_type_is_vm(struct dmar_domain *domain)
 689{
 690        return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 691}
 692
 693static inline int domain_type_is_si(struct dmar_domain *domain)
 694{
 695        return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 696}
 697
 698static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 699{
 700        return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 701                                DOMAIN_FLAG_STATIC_IDENTITY);
 702}
 703
 704static inline int domain_pfn_supported(struct dmar_domain *domain,
 705                                       unsigned long pfn)
 706{
 707        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 708
 709        return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 710}
 711
 712static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 713{
 714        unsigned long sagaw;
 715        int agaw = -1;
 716
 717        sagaw = cap_sagaw(iommu->cap);
 718        for (agaw = width_to_agaw(max_gaw);
 719             agaw >= 0; agaw--) {
 720                if (test_bit(agaw, &sagaw))
 721                        break;
 722        }
 723
 724        return agaw;
 725}
 726
 727/*
 728 * Calculate max SAGAW for each iommu.
 729 */
 730int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 731{
 732        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 733}
 734
 735/*
  736 * Calculate the agaw for each iommu.
  737 * "SAGAW" may differ across iommus: use a default agaw, and fall
  738 * back to a smaller supported agaw for iommus that don't support it.
 739 */
 740int iommu_calculate_agaw(struct intel_iommu *iommu)
 741{
 742        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 743}
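
/*
 * Worked example (illustrative): if cap_sagaw() reports 0x4 (only bit
 * 2 set, i.e. only 4-level tables supported), the loop above starts at
 * width_to_agaw(48) == 2, finds bit 2 set immediately, and both
 * iommu_calculate_agaw() and iommu_calculate_max_sagaw() return 2; an
 * unsupported starting width simply keeps decrementing until a
 * supported agaw (or -1) is found.
 */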
 744
  745/* This function only returns a single iommu in a domain */
 746static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 747{
 748        int iommu_id;
 749
 750        /* si_domain and vm domain should not get here. */
 751        BUG_ON(domain_type_is_vm_or_si(domain));
 752        for_each_domain_iommu(iommu_id, domain)
 753                break;
 754
 755        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 756                return NULL;
 757
 758        return g_iommus[iommu_id];
 759}
 760
 761static void domain_update_iommu_coherency(struct dmar_domain *domain)
 762{
 763        struct dmar_drhd_unit *drhd;
 764        struct intel_iommu *iommu;
 765        bool found = false;
 766        int i;
 767
 768        domain->iommu_coherency = 1;
 769
 770        for_each_domain_iommu(i, domain) {
 771                found = true;
 772                if (!ecap_coherent(g_iommus[i]->ecap)) {
 773                        domain->iommu_coherency = 0;
 774                        break;
 775                }
 776        }
 777        if (found)
 778                return;
 779
 780        /* No hardware attached; use lowest common denominator */
 781        rcu_read_lock();
 782        for_each_active_iommu(iommu, drhd) {
 783                if (!ecap_coherent(iommu->ecap)) {
 784                        domain->iommu_coherency = 0;
 785                        break;
 786                }
 787        }
 788        rcu_read_unlock();
 789}
 790
 791static int domain_update_iommu_snooping(struct intel_iommu *skip)
 792{
 793        struct dmar_drhd_unit *drhd;
 794        struct intel_iommu *iommu;
 795        int ret = 1;
 796
 797        rcu_read_lock();
 798        for_each_active_iommu(iommu, drhd) {
 799                if (iommu != skip) {
 800                        if (!ecap_sc_support(iommu->ecap)) {
 801                                ret = 0;
 802                                break;
 803                        }
 804                }
 805        }
 806        rcu_read_unlock();
 807
 808        return ret;
 809}
 810
 811static int domain_update_iommu_superpage(struct intel_iommu *skip)
 812{
 813        struct dmar_drhd_unit *drhd;
 814        struct intel_iommu *iommu;
 815        int mask = 0xf;
 816
 817        if (!intel_iommu_superpage) {
 818                return 0;
 819        }
 820
 821        /* set iommu_superpage to the smallest common denominator */
 822        rcu_read_lock();
 823        for_each_active_iommu(iommu, drhd) {
 824                if (iommu != skip) {
 825                        mask &= cap_super_page_val(iommu->cap);
 826                        if (!mask)
 827                                break;
 828                }
 829        }
 830        rcu_read_unlock();
 831
 832        return fls(mask);
 833}
 834
 835/* Some capabilities may be different across iommus */
 836static void domain_update_iommu_cap(struct dmar_domain *domain)
 837{
 838        domain_update_iommu_coherency(domain);
 839        domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 840        domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 841}
 842
 843static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 844                                                       u8 bus, u8 devfn, int alloc)
 845{
 846        struct root_entry *root = &iommu->root_entry[bus];
 847        struct context_entry *context;
 848        u64 *entry;
 849
 850        entry = &root->lo;
 851        if (ecs_enabled(iommu)) {
 852                if (devfn >= 0x80) {
 853                        devfn -= 0x80;
 854                        entry = &root->hi;
 855                }
 856                devfn *= 2;
 857        }
 858        if (*entry & 1)
 859                context = phys_to_virt(*entry & VTD_PAGE_MASK);
 860        else {
 861                unsigned long phy_addr;
 862                if (!alloc)
 863                        return NULL;
 864
 865                context = alloc_pgtable_page(iommu->node);
 866                if (!context)
 867                        return NULL;
 868
 869                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 870                phy_addr = virt_to_phys((void *)context);
 871                *entry = phy_addr | 1;
 872                __iommu_flush_cache(iommu, entry, sizeof(*entry));
 873        }
 874        return &context[devfn];
 875}
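
/*
 * Illustrative example of the extended-context layout handled above:
 * with ECS enabled, root->lo holds the context table for devfns
 * 0x00-0x7f and root->hi the one for 0x80-0xff, with each device
 * using two 128-bit slots; so the lookup for devfn 0x85 lands at
 * slot (0x85 - 0x80) * 2 = 10 of the upper table.
 */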
 876
 877static int iommu_dummy(struct device *dev)
 878{
 879        return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 880}
 881
 882static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 883{
 884        struct dmar_drhd_unit *drhd = NULL;
 885        struct intel_iommu *iommu;
 886        struct device *tmp;
 887        struct pci_dev *ptmp, *pdev = NULL;
 888        u16 segment = 0;
 889        int i;
 890
 891        if (iommu_dummy(dev))
 892                return NULL;
 893
 894        if (dev_is_pci(dev)) {
 895                pdev = to_pci_dev(dev);
 896                segment = pci_domain_nr(pdev->bus);
 897        } else if (has_acpi_companion(dev))
 898                dev = &ACPI_COMPANION(dev)->dev;
 899
 900        rcu_read_lock();
 901        for_each_active_iommu(iommu, drhd) {
 902                if (pdev && segment != drhd->segment)
 903                        continue;
 904
 905                for_each_active_dev_scope(drhd->devices,
 906                                          drhd->devices_cnt, i, tmp) {
 907                        if (tmp == dev) {
 908                                *bus = drhd->devices[i].bus;
 909                                *devfn = drhd->devices[i].devfn;
 910                                goto out;
 911                        }
 912
 913                        if (!pdev || !dev_is_pci(tmp))
 914                                continue;
 915
 916                        ptmp = to_pci_dev(tmp);
 917                        if (ptmp->subordinate &&
 918                            ptmp->subordinate->number <= pdev->bus->number &&
 919                            ptmp->subordinate->busn_res.end >= pdev->bus->number)
 920                                goto got_pdev;
 921                }
 922
 923                if (pdev && drhd->include_all) {
 924                got_pdev:
 925                        *bus = pdev->bus->number;
 926                        *devfn = pdev->devfn;
 927                        goto out;
 928                }
 929        }
 930        iommu = NULL;
 931 out:
 932        rcu_read_unlock();
 933
 934        return iommu;
 935}
 936
 937static void domain_flush_cache(struct dmar_domain *domain,
 938                               void *addr, int size)
 939{
 940        if (!domain->iommu_coherency)
 941                clflush_cache_range(addr, size);
 942}
 943
 944static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 945{
 946        struct context_entry *context;
 947        int ret = 0;
 948        unsigned long flags;
 949
 950        spin_lock_irqsave(&iommu->lock, flags);
 951        context = iommu_context_addr(iommu, bus, devfn, 0);
 952        if (context)
 953                ret = context_present(context);
 954        spin_unlock_irqrestore(&iommu->lock, flags);
 955        return ret;
 956}
 957
 958static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 959{
 960        struct context_entry *context;
 961        unsigned long flags;
 962
 963        spin_lock_irqsave(&iommu->lock, flags);
 964        context = iommu_context_addr(iommu, bus, devfn, 0);
 965        if (context) {
 966                context_clear_entry(context);
 967                __iommu_flush_cache(iommu, context, sizeof(*context));
 968        }
 969        spin_unlock_irqrestore(&iommu->lock, flags);
 970}
 971
 972static void free_context_table(struct intel_iommu *iommu)
 973{
 974        int i;
 975        unsigned long flags;
 976        struct context_entry *context;
 977
 978        spin_lock_irqsave(&iommu->lock, flags);
 979        if (!iommu->root_entry) {
 980                goto out;
 981        }
 982        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 983                context = iommu_context_addr(iommu, i, 0, 0);
 984                if (context)
 985                        free_pgtable_page(context);
 986
 987                if (!ecs_enabled(iommu))
 988                        continue;
 989
 990                context = iommu_context_addr(iommu, i, 0x80, 0);
 991                if (context)
 992                        free_pgtable_page(context);
 993
 994        }
 995        free_pgtable_page(iommu->root_entry);
 996        iommu->root_entry = NULL;
 997out:
 998        spin_unlock_irqrestore(&iommu->lock, flags);
 999}
1000
1001static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1002                                      unsigned long pfn, int *target_level)
1003{
1004        struct dma_pte *parent, *pte = NULL;
1005        int level = agaw_to_level(domain->agaw);
1006        int offset;
1007
1008        BUG_ON(!domain->pgd);
1009
1010        if (!domain_pfn_supported(domain, pfn))
1011                /* Address beyond IOMMU's addressing capabilities. */
1012                return NULL;
1013
1014        parent = domain->pgd;
1015
1016        while (1) {
1017                void *tmp_page;
1018
1019                offset = pfn_level_offset(pfn, level);
1020                pte = &parent[offset];
1021                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1022                        break;
1023                if (level == *target_level)
1024                        break;
1025
1026                if (!dma_pte_present(pte)) {
1027                        uint64_t pteval;
1028
1029                        tmp_page = alloc_pgtable_page(domain->nid);
1030
1031                        if (!tmp_page)
1032                                return NULL;
1033
1034                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1035                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1036                        if (cmpxchg64(&pte->val, 0ULL, pteval))
1037                                /* Someone else set it while we were thinking; use theirs. */
1038                                free_pgtable_page(tmp_page);
1039                        else
1040                                domain_flush_cache(domain, pte, sizeof(*pte));
1041                }
1042                if (level == 1)
1043                        break;
1044
1045                parent = phys_to_virt(dma_pte_addr(pte));
1046                level--;
1047        }
1048
1049        if (!*target_level)
1050                *target_level = level;
1051
1052        return pte;
1053}
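
/*
 * Worked example of the walk above (illustrative, 4-level/agaw-2
 * case): for dma pfn 0x12345, level 4 uses pfn bits 27-35 (0x000),
 * level 3 bits 18-26 (0x000), level 2 bits 9-17 (0x091) and level 1
 * bits 0-8 (0x145). With *target_level == 1 the loop descends all the
 * way down, allocating any missing page-table page and installing it
 * with cmpxchg64() so that concurrent mappers cannot install two
 * different tables for the same slot.
 */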
1054
1055
1056/* return address's pte at specific level */
1057static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1058                                         unsigned long pfn,
1059                                         int level, int *large_page)
1060{
1061        struct dma_pte *parent, *pte = NULL;
1062        int total = agaw_to_level(domain->agaw);
1063        int offset;
1064
1065        parent = domain->pgd;
1066        while (level <= total) {
1067                offset = pfn_level_offset(pfn, total);
1068                pte = &parent[offset];
1069                if (level == total)
1070                        return pte;
1071
1072                if (!dma_pte_present(pte)) {
1073                        *large_page = total;
1074                        break;
1075                }
1076
1077                if (dma_pte_superpage(pte)) {
1078                        *large_page = total;
1079                        return pte;
1080                }
1081
1082                parent = phys_to_virt(dma_pte_addr(pte));
1083                total--;
1084        }
1085        return NULL;
1086}
1087
 1088/* Clear last-level PTEs; a TLB flush should follow. */
1089static void dma_pte_clear_range(struct dmar_domain *domain,
1090                                unsigned long start_pfn,
1091                                unsigned long last_pfn)
1092{
1093        unsigned int large_page = 1;
1094        struct dma_pte *first_pte, *pte;
1095
1096        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1097        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1098        BUG_ON(start_pfn > last_pfn);
1099
1100        /* we don't need lock here; nobody else touches the iova range */
1101        do {
1102                large_page = 1;
1103                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1104                if (!pte) {
1105                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1106                        continue;
1107                }
1108                do {
1109                        dma_clear_pte(pte);
1110                        start_pfn += lvl_to_nr_pages(large_page);
1111                        pte++;
1112                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1113
1114                domain_flush_cache(domain, first_pte,
1115                                   (void *)pte - (void *)first_pte);
1116
1117        } while (start_pfn && start_pfn <= last_pfn);
1118}
1119
1120static void dma_pte_free_level(struct dmar_domain *domain, int level,
1121                               struct dma_pte *pte, unsigned long pfn,
1122                               unsigned long start_pfn, unsigned long last_pfn)
1123{
1124        pfn = max(start_pfn, pfn);
1125        pte = &pte[pfn_level_offset(pfn, level)];
1126
1127        do {
1128                unsigned long level_pfn;
1129                struct dma_pte *level_pte;
1130
1131                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1132                        goto next;
1133
1134                level_pfn = pfn & level_mask(level - 1);
1135                level_pte = phys_to_virt(dma_pte_addr(pte));
1136
1137                if (level > 2)
1138                        dma_pte_free_level(domain, level - 1, level_pte,
1139                                           level_pfn, start_pfn, last_pfn);
1140
1141                /* If range covers entire pagetable, free it */
1142                if (!(start_pfn > level_pfn ||
1143                      last_pfn < level_pfn + level_size(level) - 1)) {
1144                        dma_clear_pte(pte);
1145                        domain_flush_cache(domain, pte, sizeof(*pte));
1146                        free_pgtable_page(level_pte);
1147                }
1148next:
1149                pfn += level_size(level);
1150        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1151}
1152
1153/* clear last level (leaf) ptes and free page table pages. */
1154static void dma_pte_free_pagetable(struct dmar_domain *domain,
1155                                   unsigned long start_pfn,
1156                                   unsigned long last_pfn)
1157{
1158        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1159        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1160        BUG_ON(start_pfn > last_pfn);
1161
1162        dma_pte_clear_range(domain, start_pfn, last_pfn);
1163
1164        /* We don't need lock here; nobody else touches the iova range */
1165        dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1166                           domain->pgd, 0, start_pfn, last_pfn);
1167
1168        /* free pgd */
1169        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1170                free_pgtable_page(domain->pgd);
1171                domain->pgd = NULL;
1172        }
1173}
1174
1175/* When a page at a given level is being unlinked from its parent, we don't
1176   need to *modify* it at all. All we need to do is make a list of all the
1177   pages which can be freed just as soon as we've flushed the IOTLB and we
1178   know the hardware page-walk will no longer touch them.
1179   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1180   be freed. */
1181static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1182                                            int level, struct dma_pte *pte,
1183                                            struct page *freelist)
1184{
1185        struct page *pg;
1186
1187        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1188        pg->freelist = freelist;
1189        freelist = pg;
1190
1191        if (level == 1)
1192                return freelist;
1193
1194        pte = page_address(pg);
1195        do {
1196                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1197                        freelist = dma_pte_list_pagetables(domain, level - 1,
1198                                                           pte, freelist);
1199                pte++;
1200        } while (!first_pte_in_page(pte));
1201
1202        return freelist;
1203}
1204
1205static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1206                                        struct dma_pte *pte, unsigned long pfn,
1207                                        unsigned long start_pfn,
1208                                        unsigned long last_pfn,
1209                                        struct page *freelist)
1210{
1211        struct dma_pte *first_pte = NULL, *last_pte = NULL;
1212
1213        pfn = max(start_pfn, pfn);
1214        pte = &pte[pfn_level_offset(pfn, level)];
1215
1216        do {
1217                unsigned long level_pfn;
1218
1219                if (!dma_pte_present(pte))
1220                        goto next;
1221
1222                level_pfn = pfn & level_mask(level);
1223
1224                /* If range covers entire pagetable, free it */
1225                if (start_pfn <= level_pfn &&
1226                    last_pfn >= level_pfn + level_size(level) - 1) {
 1227                        /* These subordinate page tables are going away entirely. Don't
1228                           bother to clear them; we're just going to *free* them. */
1229                        if (level > 1 && !dma_pte_superpage(pte))
1230                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1231
1232                        dma_clear_pte(pte);
1233                        if (!first_pte)
1234                                first_pte = pte;
1235                        last_pte = pte;
1236                } else if (level > 1) {
1237                        /* Recurse down into a level that isn't *entirely* obsolete */
1238                        freelist = dma_pte_clear_level(domain, level - 1,
1239                                                       phys_to_virt(dma_pte_addr(pte)),
1240                                                       level_pfn, start_pfn, last_pfn,
1241                                                       freelist);
1242                }
1243next:
1244                pfn += level_size(level);
1245        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1246
1247        if (first_pte)
1248                domain_flush_cache(domain, first_pte,
1249                                   (void *)++last_pte - (void *)first_pte);
1250
1251        return freelist;
1252}
1253
1254/* We can't just free the pages because the IOMMU may still be walking
1255   the page tables, and may have cached the intermediate levels. The
1256   pages can only be freed after the IOTLB flush has been done. */
1257static struct page *domain_unmap(struct dmar_domain *domain,
1258                                 unsigned long start_pfn,
1259                                 unsigned long last_pfn)
1260{
1261        struct page *freelist = NULL;
1262
1263        BUG_ON(!domain_pfn_supported(domain, start_pfn));
1264        BUG_ON(!domain_pfn_supported(domain, last_pfn));
1265        BUG_ON(start_pfn > last_pfn);
1266
1267        /* we don't need lock here; nobody else touches the iova range */
1268        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1269                                       domain->pgd, 0, start_pfn, last_pfn, NULL);
1270
1271        /* free pgd */
1272        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1273                struct page *pgd_page = virt_to_page(domain->pgd);
1274                pgd_page->freelist = freelist;
1275                freelist = pgd_page;
1276
1277                domain->pgd = NULL;
1278        }
1279
1280        return freelist;
1281}
1282
1283static void dma_free_pagelist(struct page *freelist)
1284{
1285        struct page *pg;
1286
1287        while ((pg = freelist)) {
1288                freelist = pg->freelist;
1289                free_pgtable_page(page_address(pg));
1290        }
1291}
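
/*
 * Sketch of the expected calling pattern for the helpers above
 * (illustrative; the real callers appear later in this file):
 *
 *        freelist = domain_unmap(domain, start_pfn, last_pfn);
 *        iommu_flush_iotlb_psi(iommu, domain, start_pfn, nrpages, 0, 0);
 *        dma_free_pagelist(freelist);
 *
 * i.e. page-table pages are only returned to the allocator after the
 * IOTLB (and any cached intermediate levels) have been invalidated.
 */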
1292
1293/* iommu handling */
1294static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1295{
1296        struct root_entry *root;
1297        unsigned long flags;
1298
1299        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1300        if (!root) {
1301                pr_err("Allocating root entry for %s failed\n",
1302                        iommu->name);
1303                return -ENOMEM;
1304        }
1305
1306        __iommu_flush_cache(iommu, root, ROOT_SIZE);
1307
1308        spin_lock_irqsave(&iommu->lock, flags);
1309        iommu->root_entry = root;
1310        spin_unlock_irqrestore(&iommu->lock, flags);
1311
1312        return 0;
1313}
1314
1315static void iommu_set_root_entry(struct intel_iommu *iommu)
1316{
1317        u64 addr;
1318        u32 sts;
1319        unsigned long flag;
1320
1321        addr = virt_to_phys(iommu->root_entry);
1322        if (ecs_enabled(iommu))
1323                addr |= DMA_RTADDR_RTT;
1324
1325        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1326        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1327
1328        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1329
 1330        /* Make sure hardware completes it */
1331        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1332                      readl, (sts & DMA_GSTS_RTPS), sts);
1333
1334        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335}
1336
1337static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1338{
1339        u32 val;
1340        unsigned long flag;
1341
1342        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1343                return;
1344
1345        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1346        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1347
 1348        /* Make sure hardware completes it */
1349        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1350                      readl, (!(val & DMA_GSTS_WBFS)), val);
1351
1352        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1353}
1354
 1355/* return value determines if we need a write buffer flush */
1356static void __iommu_flush_context(struct intel_iommu *iommu,
1357                                  u16 did, u16 source_id, u8 function_mask,
1358                                  u64 type)
1359{
1360        u64 val = 0;
1361        unsigned long flag;
1362
1363        switch (type) {
1364        case DMA_CCMD_GLOBAL_INVL:
1365                val = DMA_CCMD_GLOBAL_INVL;
1366                break;
1367        case DMA_CCMD_DOMAIN_INVL:
1368                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1369                break;
1370        case DMA_CCMD_DEVICE_INVL:
1371                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1372                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1373                break;
1374        default:
1375                BUG();
1376        }
1377        val |= DMA_CCMD_ICC;
1378
1379        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1380        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1381
 1382        /* Make sure hardware completes it */
1383        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1384                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1385
1386        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1387}
1388
 1389/* return value determines if we need a write buffer flush */
1390static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1391                                u64 addr, unsigned int size_order, u64 type)
1392{
1393        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1394        u64 val = 0, val_iva = 0;
1395        unsigned long flag;
1396
1397        switch (type) {
1398        case DMA_TLB_GLOBAL_FLUSH:
1399                /* global flush doesn't need set IVA_REG */
1400                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1401                break;
1402        case DMA_TLB_DSI_FLUSH:
1403                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1404                break;
1405        case DMA_TLB_PSI_FLUSH:
1406                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1407                /* IH bit is passed in as part of address */
1408                val_iva = size_order | addr;
1409                break;
1410        default:
1411                BUG();
1412        }
1413        /* Note: set drain read/write */
1414#if 0
1415        /*
 1416         * This is probably just to be extra safe. It looks like we can
1417         * ignore it without any impact.
1418         */
1419        if (cap_read_drain(iommu->cap))
1420                val |= DMA_TLB_READ_DRAIN;
1421#endif
1422        if (cap_write_drain(iommu->cap))
1423                val |= DMA_TLB_WRITE_DRAIN;
1424
1425        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1426        /* Note: Only uses first TLB reg currently */
1427        if (val_iva)
1428                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1429        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1430
 1431        /* Make sure hardware completes it */
1432        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1433                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1434
1435        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1436
1437        /* check IOTLB invalidation granularity */
1438        if (DMA_TLB_IAIG(val) == 0)
1439                pr_err("Flush IOTLB failed\n");
1440        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1441                pr_debug("TLB flush request %Lx, actual %Lx\n",
1442                        (unsigned long long)DMA_TLB_IIRG(type),
1443                        (unsigned long long)DMA_TLB_IAIG(val));
1444}
1445
1446static struct device_domain_info *
1447iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1448                         u8 bus, u8 devfn)
1449{
1450        struct device_domain_info *info;
1451
1452        assert_spin_locked(&device_domain_lock);
1453
1454        if (!iommu->qi)
1455                return NULL;
1456
1457        list_for_each_entry(info, &domain->devices, link)
1458                if (info->iommu == iommu && info->bus == bus &&
1459                    info->devfn == devfn) {
1460                        if (info->ats_supported && info->dev)
1461                                return info;
1462                        break;
1463                }
1464
1465        return NULL;
1466}
1467
1468static void domain_update_iotlb(struct dmar_domain *domain)
1469{
1470        struct device_domain_info *info;
1471        bool has_iotlb_device = false;
1472
1473        assert_spin_locked(&device_domain_lock);
1474
1475        list_for_each_entry(info, &domain->devices, link) {
1476                struct pci_dev *pdev;
1477
1478                if (!info->dev || !dev_is_pci(info->dev))
1479                        continue;
1480
1481                pdev = to_pci_dev(info->dev);
1482                if (pdev->ats_enabled) {
1483                        has_iotlb_device = true;
1484                        break;
1485                }
1486        }
1487
1488        domain->has_iotlb_device = has_iotlb_device;
1489}
1490
1491static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1492{
1493        struct pci_dev *pdev;
1494
1495        assert_spin_locked(&device_domain_lock);
1496
1497        if (!info || !dev_is_pci(info->dev))
1498                return;
1499
1500        pdev = to_pci_dev(info->dev);
1501
1502#ifdef CONFIG_INTEL_IOMMU_SVM
1503        /* The PCIe spec, in its wisdom, declares that the behaviour of
1504           the device if you enable PASID support after ATS support is
1505           undefined. So always enable PASID support on devices which
1506           have it, even if we can't yet know if we're ever going to
1507           use it. */
1508        if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1509                info->pasid_enabled = 1;
1510
1511        if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1512                info->pri_enabled = 1;
1513#endif
1514        if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1515                info->ats_enabled = 1;
1516                domain_update_iotlb(info->domain);
1517                info->ats_qdep = pci_ats_queue_depth(pdev);
1518        }
1519}
1520
1521static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1522{
1523        struct pci_dev *pdev;
1524
1525        assert_spin_locked(&device_domain_lock);
1526
1527        if (!dev_is_pci(info->dev))
1528                return;
1529
1530        pdev = to_pci_dev(info->dev);
1531
1532        if (info->ats_enabled) {
1533                pci_disable_ats(pdev);
1534                info->ats_enabled = 0;
1535                domain_update_iotlb(info->domain);
1536        }
1537#ifdef CONFIG_INTEL_IOMMU_SVM
1538        if (info->pri_enabled) {
1539                pci_disable_pri(pdev);
1540                info->pri_enabled = 0;
1541        }
1542        if (info->pasid_enabled) {
1543                pci_disable_pasid(pdev);
1544                info->pasid_enabled = 0;
1545        }
1546#endif
1547}
1548
1549static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1550                                  u64 addr, unsigned mask)
1551{
1552        u16 sid, qdep;
1553        unsigned long flags;
1554        struct device_domain_info *info;
1555
1556        if (!domain->has_iotlb_device)
1557                return;
1558
1559        spin_lock_irqsave(&device_domain_lock, flags);
1560        list_for_each_entry(info, &domain->devices, link) {
1561                if (!info->ats_enabled)
1562                        continue;
1563
1564                sid = info->bus << 8 | info->devfn;
1565                qdep = info->ats_qdep;
1566                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1567        }
1568        spin_unlock_irqrestore(&device_domain_lock, flags);
1569}
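
/*
 * For example (assuming the usual 5-bit slot / 3-bit function devfn
 * encoding): a device at 00:1f.2 has devfn = (0x1f << 3) | 2 = 0xfa,
 * so the source-id used for its device-IOTLB invalidation is
 * sid = (0x00 << 8) | 0xfa = 0x00fa, and qdep is the invalidation
 * queue depth cached from pci_ats_queue_depth() when ATS was enabled.
 */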
1570
1571static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1572                                  struct dmar_domain *domain,
1573                                  unsigned long pfn, unsigned int pages,
1574                                  int ih, int map)
1575{
1576        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1577        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1578        u16 did = domain->iommu_did[iommu->seq_id];
1579
1580        BUG_ON(pages == 0);
1581
1582        if (ih)
1583                ih = 1 << 6;
1584        /*
1585         * Fall back to a domain-selective flush if there is no PSI support or
1586         * the size is too big.
1587         * PSI requires the page count to be a power of two, and the base address
1588         * to be naturally aligned to that size.
1589         */
1590        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1591                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1592                                                DMA_TLB_DSI_FLUSH);
1593        else
1594                iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1595                                                DMA_TLB_PSI_FLUSH);
1596
1597        /*
1598         * In caching mode, changes of pages from non-present to present require a
1599         * flush. However, the device IOTLB doesn't need to be flushed in this case.
1600         */
1601        if (!cap_caching_mode(iommu->cap) || !map)
1602                iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1603                                      addr, mask);
1604}
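
/*
 * Worked example (assuming the usual 4KiB VT-d page, i.e.
 * VTD_PAGE_SHIFT == 12): flushing 9 pages starting at pfn 0x12340
 * gives
 *
 *     mask = ilog2(__roundup_pow_of_two(9)) = ilog2(16) = 4
 *     addr = 0x12340 << 12                  = 0x12340000
 *
 * i.e. a page-selective invalidation covering 2^4 = 16 pages. If the
 * IOMMU lacked PSI, or mask exceeded cap_max_amask_val(), the code
 * above would fall back to a domain-selective flush instead.
 */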
1605
1606static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1607{
1608        u32 pmen;
1609        unsigned long flags;
1610
1611        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1612        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1613        pmen &= ~DMA_PMEN_EPM;
1614        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1615
1616        /* wait for the protected region status bit to clear */
1617        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1618                readl, !(pmen & DMA_PMEN_PRS), pmen);
1619
1620        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1621}
1622
1623static void iommu_enable_translation(struct intel_iommu *iommu)
1624{
1625        u32 sts;
1626        unsigned long flags;
1627
1628        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1629        iommu->gcmd |= DMA_GCMD_TE;
1630        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1631
1632        /* Make sure hardware completes it */
1633        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1634                      readl, (sts & DMA_GSTS_TES), sts);
1635
1636        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1637}
1638
1639static void iommu_disable_translation(struct intel_iommu *iommu)
1640{
1641        u32 sts;
1642        unsigned long flag;
1643
1644        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1645        iommu->gcmd &= ~DMA_GCMD_TE;
1646        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1647
1648        /* Make sure hardware completes it */
1649        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1650                      readl, (!(sts & DMA_GSTS_TES)), sts);
1651
1652        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1653}
1654
1655
1656static int iommu_init_domains(struct intel_iommu *iommu)
1657{
1658        u32 ndomains, nlongs;
1659        size_t size;
1660
1661        ndomains = cap_ndoms(iommu->cap);
1662        pr_debug("%s: Number of Domains supported <%d>\n",
1663                 iommu->name, ndomains);
1664        nlongs = BITS_TO_LONGS(ndomains);
1665
1666        spin_lock_init(&iommu->lock);
1667
1668        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1669        if (!iommu->domain_ids) {
1670                pr_err("%s: Allocating domain id array failed\n",
1671                       iommu->name);
1672                return -ENOMEM;
1673        }
1674
1675        size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1676        iommu->domains = kzalloc(size, GFP_KERNEL);
1677
1678        if (iommu->domains) {
1679                size = 256 * sizeof(struct dmar_domain *);
1680                iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1681        }
1682
1683        if (!iommu->domains || !iommu->domains[0]) {
1684                pr_err("%s: Allocating domain array failed\n",
1685                       iommu->name);
1686                kfree(iommu->domain_ids);
1687                kfree(iommu->domains);
1688                iommu->domain_ids = NULL;
1689                iommu->domains    = NULL;
1690                return -ENOMEM;
1691        }
1692
1693
1694
1695        /*
1696         * If Caching mode is set, then invalid translations are tagged
1697         * with domain-id 0, hence we need to pre-allocate it. We also
1698         * use domain-id 0 as a marker for non-allocated domain-id, so
1699         * make sure it is not used for a real domain.
1700         */
1701        set_bit(0, iommu->domain_ids);
1702
1703        return 0;
1704}
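
/*
 * Illustrative sizing (the numbers are an example): iommu->domains is
 * a two-level array of 256-entry chunks, so an IOMMU advertising
 * cap_ndoms() == 65536 needs ALIGN(65536, 256) >> 8 == 256 first-level
 * slots, while only chunk 0 (256 struct dmar_domain pointers) is
 * allocated up front here; further chunks can be allocated lazily when
 * a domain-id in their range is first used.
 */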
1705
1706static void disable_dmar_iommu(struct intel_iommu *iommu)
1707{
1708        struct device_domain_info *info, *tmp;
1709        unsigned long flags;
1710
1711        if (!iommu->domains || !iommu->domain_ids)
1712                return;
1713
1714        spin_lock_irqsave(&device_domain_lock, flags);
1715        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1716                struct dmar_domain *domain;
1717
1718                if (info->iommu != iommu)
1719                        continue;
1720
1721                if (!info->dev || !info->domain)
1722                        continue;
1723
1724                domain = info->domain;
1725
1726                dmar_remove_one_dev_info(domain, info->dev);
1727
1728                if (!domain_type_is_vm_or_si(domain))
1729                        domain_exit(domain);
1730        }
1731        spin_unlock_irqrestore(&device_domain_lock, flags);
1732
1733        if (iommu->gcmd & DMA_GCMD_TE)
1734                iommu_disable_translation(iommu);
1735}
1736
1737static void free_dmar_iommu(struct intel_iommu *iommu)
1738{
1739        if ((iommu->domains) && (iommu->domain_ids)) {
1740                int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1741                int i;
1742
1743                for (i = 0; i < elems; i++)
1744                        kfree(iommu->domains[i]);
1745                kfree(iommu->domains);
1746                kfree(iommu->domain_ids);
1747                iommu->domains = NULL;
1748                iommu->domain_ids = NULL;
1749        }
1750
1751        g_iommus[iommu->seq_id] = NULL;
1752
1753        /* free context mapping */
1754        free_context_table(iommu);
1755
1756#ifdef CONFIG_INTEL_IOMMU_SVM
1757        if (pasid_enabled(iommu)) {
1758                if (ecap_prs(iommu->ecap))
1759                        intel_svm_finish_prq(iommu);
1760                intel_svm_free_pasid_tables(iommu);
1761        }
1762#endif
1763}
1764
1765static struct dmar_domain *alloc_domain(int flags)
1766{
1767        struct dmar_domain *domain;
1768
1769        domain = alloc_domain_mem();
1770        if (!domain)
1771                return NULL;
1772
1773        memset(domain, 0, sizeof(*domain));
1774        domain->nid = -1;
1775        domain->flags = flags;
1776        domain->has_iotlb_device = false;
1777        INIT_LIST_HEAD(&domain->devices);
1778
1779        return domain;
1780}
1781
1782/* Must be called with iommu->lock */
1783static int domain_attach_iommu(struct dmar_domain *domain,
1784                               struct intel_iommu *iommu)
1785{
1786        unsigned long ndomains;
1787        int num;
1788
1789        assert_spin_locked(&device_domain_lock);
1790        assert_spin_locked(&iommu->lock);
1791
1792        domain->iommu_refcnt[iommu->seq_id] += 1;
1793        domain->iommu_count += 1;
1794        if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1795                ndomains = cap_ndoms(iommu->cap);
1796                num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1797
1798                if (num >= ndomains) {
1799                        pr_err("%s: No free domain ids\n", iommu->name);
1800                        domain->iommu_refcnt[iommu->seq_id] -= 1;
1801                        domain->iommu_count -= 1;
1802                        return -ENOSPC;
1803                }
1804
1805                set_bit(num, iommu->domain_ids);
1806                set_iommu_domain(iommu, num, domain);
1807
1808                domain->iommu_did[iommu->seq_id] = num;
1809                domain->nid                      = iommu->node;
1810
1811                domain_update_iommu_cap(domain);
1812        }
1813
1814        return 0;
1815}
1816
1817static int domain_detach_iommu(struct dmar_domain *domain,
1818                               struct intel_iommu *iommu)
1819{
1820        int num, count = INT_MAX;
1821
1822        assert_spin_locked(&device_domain_lock);
1823        assert_spin_locked(&iommu->lock);
1824
1825        domain->iommu_refcnt[iommu->seq_id] -= 1;
1826        count = --domain->iommu_count;
1827        if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1828                num = domain->iommu_did[iommu->seq_id];
1829                clear_bit(num, iommu->domain_ids);
1830                set_iommu_domain(iommu, num, NULL);
1831
1832                domain_update_iommu_cap(domain);
1833                domain->iommu_did[iommu->seq_id] = 0;
1834        }
1835
1836        return count;
1837}
1838
1839static struct iova_domain reserved_iova_list;
1840static struct lock_class_key reserved_rbtree_key;
1841
1842static int dmar_init_reserved_ranges(void)
1843{
1844        struct pci_dev *pdev = NULL;
1845        struct iova *iova;
1846        int i;
1847
1848        init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1849                        DMA_32BIT_PFN);
1850
1851        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1852                &reserved_rbtree_key);
1853
1854        /* IOAPIC ranges shouldn't be accessed by DMA */
1855        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1856                IOVA_PFN(IOAPIC_RANGE_END));
1857        if (!iova) {
1858                pr_err("Reserve IOAPIC range failed\n");
1859                return -ENODEV;
1860        }
1861
1862        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1863        for_each_pci_dev(pdev) {
1864                struct resource *r;
1865
1866                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1867                        r = &pdev->resource[i];
1868                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1869                                continue;
1870                        iova = reserve_iova(&reserved_iova_list,
1871                                            IOVA_PFN(r->start),
1872                                            IOVA_PFN(r->end));
1873                        if (!iova) {
1874                                pr_err("Reserve iova failed\n");
1875                                return -ENODEV;
1876                        }
1877                }
1878        }
1879        return 0;
1880}
1881
1882static void domain_reserve_special_ranges(struct dmar_domain *domain)
1883{
1884        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1885}
1886
1887static inline int guestwidth_to_adjustwidth(int gaw)
1888{
1889        int agaw;
1890        int r = (gaw - 12) % 9;
1891
1892        if (r == 0)
1893                agaw = gaw;
1894        else
1895                agaw = gaw + 9 - r;
1896        if (agaw > 64)
1897                agaw = 64;
1898        return agaw;
1899}
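
/*
 * For example: the adjusted width is the guest width rounded up to
 * 12 bits of page offset plus a whole number of 9-bit page-table
 * levels, so
 *
 *     gaw = 39:  r = (39 - 12) % 9 = 0  ->  agaw = 39  (3 levels)
 *     gaw = 40:  r = (40 - 12) % 9 = 1  ->  agaw = 48  (4 levels)
 *
 * and anything that would exceed 64 bits is clamped to 64.
 */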
1900
1901static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1902                       int guest_width)
1903{
1904        int adjust_width, agaw;
1905        unsigned long sagaw;
1906
1907        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1908                        DMA_32BIT_PFN);
1909        domain_reserve_special_ranges(domain);
1910
1911        /* calculate AGAW */
1912        if (guest_width > cap_mgaw(iommu->cap))
1913                guest_width = cap_mgaw(iommu->cap);
1914        domain->gaw = guest_width;
1915        adjust_width = guestwidth_to_adjustwidth(guest_width);
1916        agaw = width_to_agaw(adjust_width);
1917        sagaw = cap_sagaw(iommu->cap);
1918        if (!test_bit(agaw, &sagaw)) {
1919                /* hardware doesn't support it, choose a bigger one */
1920                pr_debug("Hardware doesn't support agaw %d\n", agaw);
1921                agaw = find_next_bit(&sagaw, 5, agaw);
1922                if (agaw >= 5)
1923                        return -ENODEV;
1924        }
1925        domain->agaw = agaw;
1926
1927        if (ecap_coherent(iommu->ecap))
1928                domain->iommu_coherency = 1;
1929        else
1930                domain->iommu_coherency = 0;
1931
1932        if (ecap_sc_support(iommu->ecap))
1933                domain->iommu_snooping = 1;
1934        else
1935                domain->iommu_snooping = 0;
1936
1937        if (intel_iommu_superpage)
1938                domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1939        else
1940                domain->iommu_superpage = 0;
1941
1942        domain->nid = iommu->node;
1943
1944        /* always allocate the top pgd */
1945        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1946        if (!domain->pgd)
1947                return -ENOMEM;
1948        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1949        return 0;
1950}
1951
1952static void domain_exit(struct dmar_domain *domain)
1953{
1954        struct page *freelist = NULL;
1955
1956        /* Domain 0 is reserved, so don't process it */
1957        if (!domain)
1958                return;
1959
1960        /* Flush any lazy unmaps that may reference this domain */
1961        if (!intel_iommu_strict) {
1962                int cpu;
1963
1964                for_each_possible_cpu(cpu)
1965                        flush_unmaps_timeout(cpu);
1966        }
1967
1968        /* Remove associated devices and clear attached or cached domains */
1969        rcu_read_lock();
1970        domain_remove_dev_info(domain);
1971        rcu_read_unlock();
1972
1973        /* destroy iovas */
1974        put_iova_domain(&domain->iovad);
1975
1976        freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1977
1978        dma_free_pagelist(freelist);
1979
1980        free_domain_mem(domain);
1981}
1982
1983static int domain_context_mapping_one(struct dmar_domain *domain,
1984                                      struct intel_iommu *iommu,
1985                                      u8 bus, u8 devfn)
1986{
1987        u16 did = domain->iommu_did[iommu->seq_id];
1988        int translation = CONTEXT_TT_MULTI_LEVEL;
1989        struct device_domain_info *info = NULL;
1990        struct context_entry *context;
1991        unsigned long flags;
1992        struct dma_pte *pgd;
1993        int ret, agaw;
1994
1995        WARN_ON(did == 0);
1996
1997        if (hw_pass_through && domain_type_is_si(domain))
1998                translation = CONTEXT_TT_PASS_THROUGH;
1999
2000        pr_debug("Set context mapping for %02x:%02x.%d\n",
2001                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2002
2003        BUG_ON(!domain->pgd);
2004
2005        spin_lock_irqsave(&device_domain_lock, flags);
2006        spin_lock(&iommu->lock);
2007
2008        ret = -ENOMEM;
2009        context = iommu_context_addr(iommu, bus, devfn, 1);
2010        if (!context)
2011                goto out_unlock;
2012
2013        ret = 0;
2014        if (context_present(context))
2015                goto out_unlock;
2016
2017        pgd = domain->pgd;
2018
2019        context_clear_entry(context);
2020        context_set_domain_id(context, did);
2021
2022        /*
2023         * Skip top levels of page tables for iommu which has less agaw
2024         * than default.  Unnecessary for PT mode.
2025         */
2026        if (translation != CONTEXT_TT_PASS_THROUGH) {
2027                for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2028                        ret = -ENOMEM;
2029                        pgd = phys_to_virt(dma_pte_addr(pgd));
2030                        if (!dma_pte_present(pgd))
2031                                goto out_unlock;
2032                }
2033
2034                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2035                if (info && info->ats_supported)
2036                        translation = CONTEXT_TT_DEV_IOTLB;
2037                else
2038                        translation = CONTEXT_TT_MULTI_LEVEL;
2039
2040                context_set_address_root(context, virt_to_phys(pgd));
2041                context_set_address_width(context, iommu->agaw);
2042        } else {
2043                /*
2044                 * In pass through mode, AW must be programmed to
2045                 * indicate the largest AGAW value supported by
2046                 * hardware. And ASR is ignored by hardware.
2047                 */
2048                context_set_address_width(context, iommu->msagaw);
2049        }
2050
2051        context_set_translation_type(context, translation);
2052        context_set_fault_enable(context);
2053        context_set_present(context);
2054        domain_flush_cache(domain, context, sizeof(*context));
2055
2056        /*
2057         * It's a non-present to present mapping. If hardware doesn't cache
2058         * non-present entries, we only need to flush the write-buffer. If it
2059         * _does_ cache non-present entries, then it does so in the special
2060         * domain #0, which we have to flush:
2061         */
2062        if (cap_caching_mode(iommu->cap)) {
2063                iommu->flush.flush_context(iommu, 0,
2064                                           (((u16)bus) << 8) | devfn,
2065                                           DMA_CCMD_MASK_NOBIT,
2066                                           DMA_CCMD_DEVICE_INVL);
2067                iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2068        } else {
2069                iommu_flush_write_buffer(iommu);
2070        }
2071        iommu_enable_dev_iotlb(info);
2072
2073        ret = 0;
2074
2075out_unlock:
2076        spin_unlock(&iommu->lock);
2077        spin_unlock_irqrestore(&device_domain_lock, flags);
2078
2079        return ret;
2080}
2081
2082struct domain_context_mapping_data {
2083        struct dmar_domain *domain;
2084        struct intel_iommu *iommu;
2085};
2086
2087static int domain_context_mapping_cb(struct pci_dev *pdev,
2088                                     u16 alias, void *opaque)
2089{
2090        struct domain_context_mapping_data *data = opaque;
2091
2092        return domain_context_mapping_one(data->domain, data->iommu,
2093                                          PCI_BUS_NUM(alias), alias & 0xff);
2094}
2095
2096static int
2097domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2098{
2099        struct intel_iommu *iommu;
2100        u8 bus, devfn;
2101        struct domain_context_mapping_data data;
2102
2103        iommu = device_to_iommu(dev, &bus, &devfn);
2104        if (!iommu)
2105                return -ENODEV;
2106
2107        if (!dev_is_pci(dev))
2108                return domain_context_mapping_one(domain, iommu, bus, devfn);
2109
2110        data.domain = domain;
2111        data.iommu = iommu;
2112
2113        return pci_for_each_dma_alias(to_pci_dev(dev),
2114                                      &domain_context_mapping_cb, &data);
2115}
2116
2117static int domain_context_mapped_cb(struct pci_dev *pdev,
2118                                    u16 alias, void *opaque)
2119{
2120        struct intel_iommu *iommu = opaque;
2121
2122        return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2123}
2124
2125static int domain_context_mapped(struct device *dev)
2126{
2127        struct intel_iommu *iommu;
2128        u8 bus, devfn;
2129
2130        iommu = device_to_iommu(dev, &bus, &devfn);
2131        if (!iommu)
2132                return -ENODEV;
2133
2134        if (!dev_is_pci(dev))
2135                return device_context_mapped(iommu, bus, devfn);
2136
2137        return !pci_for_each_dma_alias(to_pci_dev(dev),
2138                                       domain_context_mapped_cb, iommu);
2139}
2140
2141/* Return the number of VTD pages spanned, rounded up to the MM page size */
2142static inline unsigned long aligned_nrpages(unsigned long host_addr,
2143                                            size_t size)
2144{
2145        host_addr &= ~PAGE_MASK;
2146        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2147}
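
/*
 * For example (assuming 4KiB pages on both sides, i.e. PAGE_SHIFT ==
 * VTD_PAGE_SHIFT == 12): a 0x1000-byte buffer starting at offset 0xa00
 * within its page gives PAGE_ALIGN(0xa00 + 0x1000) >> 12 == 2, so the
 * mapping spans two VT-d pages even though the buffer itself is only
 * one page long.
 */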
2148
2149/* Return largest possible superpage level for a given mapping */
2150static inline int hardware_largepage_caps(struct dmar_domain *domain,
2151                                          unsigned long iov_pfn,
2152                                          unsigned long phy_pfn,
2153                                          unsigned long pages)
2154{
2155        int support, level = 1;
2156        unsigned long pfnmerge;
2157
2158        support = domain->iommu_superpage;
2159
2160        /* To use a large page, the virtual *and* physical addresses
2161           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2162           of them will mean we have to use smaller pages. So just
2163           merge them and check both at once. */
2164        pfnmerge = iov_pfn | phy_pfn;
2165
2166        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2167                pages >>= VTD_STRIDE_SHIFT;
2168                if (!pages)
2169                        break;
2170                pfnmerge >>= VTD_STRIDE_SHIFT;
2171                level++;
2172                support--;
2173        }
2174        return level;
2175}
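
/*
 * Worked example: with domain->iommu_superpage == 1 (2MiB pages
 * available), iov_pfn == 0x200, phy_pfn == 0x400 and pages == 1024,
 * the merged pfn 0x600 has its low 9 bits clear and 1024 >> 9 is still
 * non-zero, so level 2 is returned and the caller may use 2MiB
 * superpage PTEs. Any misalignment in either pfn, or fewer than 512
 * pages, would keep it at level 1 (4KiB pages).
 */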
2176
2177static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2178                            struct scatterlist *sg, unsigned long phys_pfn,
2179                            unsigned long nr_pages, int prot)
2180{
2181        struct dma_pte *first_pte = NULL, *pte = NULL;
2182        phys_addr_t uninitialized_var(pteval);
2183        unsigned long sg_res = 0;
2184        unsigned int largepage_lvl = 0;
2185        unsigned long lvl_pages = 0;
2186
2187        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2188
2189        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2190                return -EINVAL;
2191
2192        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2193
2194        if (!sg) {
2195                sg_res = nr_pages;
2196                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2197        }
2198
2199        while (nr_pages > 0) {
2200                uint64_t tmp;
2201
2202                if (!sg_res) {
2203                        sg_res = aligned_nrpages(sg->offset, sg->length);
2204                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2205                        sg->dma_length = sg->length;
2206                        pteval = page_to_phys(sg_page(sg)) | prot;
2207                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
2208                }
2209
2210                if (!pte) {
2211                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2212
2213                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2214                        if (!pte)
2215                                return -ENOMEM;
2216                        /* It is a large page */
2217                        if (largepage_lvl > 1) {
2218                                unsigned long nr_superpages, end_pfn;
2219
2220                                pteval |= DMA_PTE_LARGE_PAGE;
2221                                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2222
2223                                nr_superpages = sg_res / lvl_pages;
2224                                end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2225
2226                                /*
2227                                 * Ensure that old small page tables are
2228                                 * removed to make room for superpage(s).
2229                                 */
2230                                dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
2231                        } else {
2232                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2233                        }
2234
2235                }
2236                /* We don't need a lock here; nobody else
2237                 * touches the iova range.
2238                 */
2239                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2240                if (tmp) {
2241                        static int dumps = 5;
2242                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2243                                iov_pfn, tmp, (unsigned long long)pteval);
2244                        if (dumps) {
2245                                dumps--;
2246                                debug_dma_dump_mappings(NULL);
2247                        }
2248                        WARN_ON(1);
2249                }
2250
2251                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2252
2253                BUG_ON(nr_pages < lvl_pages);
2254                BUG_ON(sg_res < lvl_pages);
2255
2256                nr_pages -= lvl_pages;
2257                iov_pfn += lvl_pages;
2258                phys_pfn += lvl_pages;
2259                pteval += lvl_pages * VTD_PAGE_SIZE;
2260                sg_res -= lvl_pages;
2261
2262                /* If the next PTE would be the first in a new page, then we
2263                   need to flush the cache on the entries we've just written.
2264                   And then we'll need to recalculate 'pte', so clear it and
2265                   let it get set again in the if (!pte) block above.
2266
2267                   If we're done (!nr_pages) we need to flush the cache too.
2268
2269                   Also if we've been setting superpages, we may need to
2270                   recalculate 'pte' and switch back to smaller pages for the
2271                   end of the mapping, if the trailing size is not enough to
2272                   use another superpage (i.e. sg_res < lvl_pages). */
2273                pte++;
2274                if (!nr_pages || first_pte_in_page(pte) ||
2275                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2276                        domain_flush_cache(domain, first_pte,
2277                                           (void *)pte - (void *)first_pte);
2278                        pte = NULL;
2279                }
2280
2281                if (!sg_res && nr_pages)
2282                        sg = sg_next(sg);
2283        }
2284        return 0;
2285}
2286
2287static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2288                                    struct scatterlist *sg, unsigned long nr_pages,
2289                                    int prot)
2290{
2291        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2292}
2293
2294static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2295                                     unsigned long phys_pfn, unsigned long nr_pages,
2296                                     int prot)
2297{
2298        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2299}
2300
2301static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2302{
2303        if (!iommu)
2304                return;
2305
2306        clear_context_table(iommu, bus, devfn);
2307        iommu->flush.flush_context(iommu, 0, 0, 0,
2308                                           DMA_CCMD_GLOBAL_INVL);
2309        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2310}
2311
2312static inline void unlink_domain_info(struct device_domain_info *info)
2313{
2314        assert_spin_locked(&device_domain_lock);
2315        list_del(&info->link);
2316        list_del(&info->global);
2317        if (info->dev)
2318                info->dev->archdata.iommu = NULL;
2319}
2320
2321static void domain_remove_dev_info(struct dmar_domain *domain)
2322{
2323        struct device_domain_info *info, *tmp;
2324        unsigned long flags;
2325
2326        spin_lock_irqsave(&device_domain_lock, flags);
2327        list_for_each_entry_safe(info, tmp, &domain->devices, link)
2328                __dmar_remove_one_dev_info(info);
2329        spin_unlock_irqrestore(&device_domain_lock, flags);
2330}
2331
2332/*
2333 * find_domain
2334 * Note: we use struct device->archdata.iommu to store the info
2335 */
2336static struct dmar_domain *find_domain(struct device *dev)
2337{
2338        struct device_domain_info *info;
2339
2340        /* No lock here, assumes no domain exit in normal case */
2341        info = dev->archdata.iommu;
2342        if (info)
2343                return info->domain;
2344        return NULL;
2345}
2346
2347static inline struct device_domain_info *
2348dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2349{
2350        struct device_domain_info *info;
2351
2352        list_for_each_entry(info, &device_domain_list, global)
2353                if (info->iommu->segment == segment && info->bus == bus &&
2354                    info->devfn == devfn)
2355                        return info;
2356
2357        return NULL;
2358}
2359
2360static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2361                                                    int bus, int devfn,
2362                                                    struct device *dev,
2363                                                    struct dmar_domain *domain)
2364{
2365        struct dmar_domain *found = NULL;
2366        struct device_domain_info *info;
2367        unsigned long flags;
2368        int ret;
2369
2370        info = alloc_devinfo_mem();
2371        if (!info)
2372                return NULL;
2373
2374        info->bus = bus;
2375        info->devfn = devfn;
2376        info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2377        info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2378        info->ats_qdep = 0;
2379        info->dev = dev;
2380        info->domain = domain;
2381        info->iommu = iommu;
2382
2383        if (dev && dev_is_pci(dev)) {
2384                struct pci_dev *pdev = to_pci_dev(info->dev);
2385
2386                if (ecap_dev_iotlb_support(iommu->ecap) &&
2387                    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2388                    dmar_find_matched_atsr_unit(pdev))
2389                        info->ats_supported = 1;
2390
2391                if (ecs_enabled(iommu)) {
2392                        if (pasid_enabled(iommu)) {
2393                                int features = pci_pasid_features(pdev);
2394                                if (features >= 0)
2395                                        info->pasid_supported = features | 1;
2396                        }
2397
2398                        if (info->ats_supported && ecap_prs(iommu->ecap) &&
2399                            pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2400                                info->pri_supported = 1;
2401                }
2402        }
2403
2404        spin_lock_irqsave(&device_domain_lock, flags);
2405        if (dev)
2406                found = find_domain(dev);
2407
2408        if (!found) {
2409                struct device_domain_info *info2;
2410                info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2411                if (info2) {
2412                        found      = info2->domain;
2413                        info2->dev = dev;
2414                }
2415        }
2416
2417        if (found) {
2418                spin_unlock_irqrestore(&device_domain_lock, flags);
2419                free_devinfo_mem(info);
2420                /* Caller must free the original domain */
2421                return found;
2422        }
2423
2424        spin_lock(&iommu->lock);
2425        ret = domain_attach_iommu(domain, iommu);
2426        spin_unlock(&iommu->lock);
2427
2428        if (ret) {
2429                spin_unlock_irqrestore(&device_domain_lock, flags);
2430                free_devinfo_mem(info);
2431                return NULL;
2432        }
2433
2434        list_add(&info->link, &domain->devices);
2435        list_add(&info->global, &device_domain_list);
2436        if (dev)
2437                dev->archdata.iommu = info;
2438        spin_unlock_irqrestore(&device_domain_lock, flags);
2439
2440        if (dev && domain_context_mapping(domain, dev)) {
2441                pr_err("Domain context map for %s failed\n", dev_name(dev));
2442                dmar_remove_one_dev_info(domain, dev);
2443                return NULL;
2444        }
2445
2446        return domain;
2447}
2448
2449static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2450{
2451        *(u16 *)opaque = alias;
2452        return 0;
2453}
2454
2455/* domain is initialized */
2456static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2457{
2458        struct device_domain_info *info = NULL;
2459        struct dmar_domain *domain, *tmp;
2460        struct intel_iommu *iommu;
2461        u16 req_id, dma_alias;
2462        unsigned long flags;
2463        u8 bus, devfn;
2464
2465        domain = find_domain(dev);
2466        if (domain)
2467                return domain;
2468
2469        iommu = device_to_iommu(dev, &bus, &devfn);
2470        if (!iommu)
2471                return NULL;
2472
2473        req_id = ((u16)bus << 8) | devfn;
2474
2475        if (dev_is_pci(dev)) {
2476                struct pci_dev *pdev = to_pci_dev(dev);
2477
2478                pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2479
2480                spin_lock_irqsave(&device_domain_lock, flags);
2481                info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2482                                                      PCI_BUS_NUM(dma_alias),
2483                                                      dma_alias & 0xff);
2484                if (info) {
2485                        iommu = info->iommu;
2486                        domain = info->domain;
2487                }
2488                spin_unlock_irqrestore(&device_domain_lock, flags);
2489
2490                /* DMA alias already has a domain, use it */
2491                if (info)
2492                        goto found_domain;
2493        }
2494
2495        /* Allocate and initialize new domain for the device */
2496        domain = alloc_domain(0);
2497        if (!domain)
2498                return NULL;
2499        if (domain_init(domain, iommu, gaw)) {
2500                domain_exit(domain);
2501                return NULL;
2502        }
2503
2504        /* register PCI DMA alias device */
2505        if (dev_is_pci(dev) && req_id != dma_alias) {
2506                tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2507                                               dma_alias & 0xff, NULL, domain);
2508
2509                if (!tmp || tmp != domain) {
2510                        domain_exit(domain);
2511                        domain = tmp;
2512                }
2513
2514                if (!domain)
2515                        return NULL;
2516        }
2517
2518found_domain:
2519        tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2520
2521        if (!tmp || tmp != domain) {
2522                domain_exit(domain);
2523                domain = tmp;
2524        }
2525
2526        return domain;
2527}
2528
2529static int iommu_domain_identity_map(struct dmar_domain *domain,
2530                                     unsigned long long start,
2531                                     unsigned long long end)
2532{
2533        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2534        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2535
2536        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2537                          dma_to_mm_pfn(last_vpfn))) {
2538                pr_err("Reserving iova failed\n");
2539                return -ENOMEM;
2540        }
2541
2542        pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2543        /*
2544         * The RMRR range might overlap the physical memory range,
2545         * so clear it first
2546         */
2547        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2548
2549        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2550                                  last_vpfn - first_vpfn + 1,
2551                                  DMA_PTE_READ|DMA_PTE_WRITE);
2552}
2553
2554static int domain_prepare_identity_map(struct device *dev,
2555                                       struct dmar_domain *domain,
2556                                       unsigned long long start,
2557                                       unsigned long long end)
2558{
2559        /* For _hardware_ passthrough, don't bother. But for software
2560           passthrough, we do it anyway -- it may indicate a memory
2561           range which is reserved in E820, and so didn't get set
2562           up to start with in the si_domain */
2563        if (domain == si_domain && hw_pass_through) {
2564                pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2565                        dev_name(dev), start, end);
2566                return 0;
2567        }
2568
2569        pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2570                dev_name(dev), start, end);
2571
2572        if (end < start) {
2573                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2574                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2575                        dmi_get_system_info(DMI_BIOS_VENDOR),
2576                        dmi_get_system_info(DMI_BIOS_VERSION),
2577                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2578                return -EIO;
2579        }
2580
2581        if (end >> agaw_to_width(domain->agaw)) {
2582                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2583                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2584                     agaw_to_width(domain->agaw),
2585                     dmi_get_system_info(DMI_BIOS_VENDOR),
2586                     dmi_get_system_info(DMI_BIOS_VERSION),
2587                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2588                return -EIO;
2589        }
2590
2591        return iommu_domain_identity_map(domain, start, end);
2592}
2593
2594static int iommu_prepare_identity_map(struct device *dev,
2595                                      unsigned long long start,
2596                                      unsigned long long end)
2597{
2598        struct dmar_domain *domain;
2599        int ret;
2600
2601        domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2602        if (!domain)
2603                return -ENOMEM;
2604
2605        ret = domain_prepare_identity_map(dev, domain, start, end);
2606        if (ret)
2607                domain_exit(domain);
2608
2609        return ret;
2610}
2611
2612static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2613                                         struct device *dev)
2614{
2615        if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2616                return 0;
2617        return iommu_prepare_identity_map(dev, rmrr->base_address,
2618                                          rmrr->end_address);
2619}
2620
2621#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2622static inline void iommu_prepare_isa(void)
2623{
2624        struct pci_dev *pdev;
2625        int ret;
2626
2627        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2628        if (!pdev)
2629                return;
2630
2631        pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2632        ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2633
2634        if (ret)
2635                pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2636
2637        pci_dev_put(pdev);
2638}
2639#else
2640static inline void iommu_prepare_isa(void)
2641{
2642        return;
2643}
2644#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2645
2646static int md_domain_init(struct dmar_domain *domain, int guest_width);
2647
2648static int __init si_domain_init(int hw)
2649{
2650        int nid, ret = 0;
2651
2652        si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2653        if (!si_domain)
2654                return -EFAULT;
2655
2656        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2657                domain_exit(si_domain);
2658                return -EFAULT;
2659        }
2660
2661        pr_debug("Identity mapping domain allocated\n");
2662
2663        if (hw)
2664                return 0;
2665
2666        for_each_online_node(nid) {
2667                unsigned long start_pfn, end_pfn;
2668                int i;
2669
2670                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2671                        ret = iommu_domain_identity_map(si_domain,
2672                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2673                        if (ret)
2674                                return ret;
2675                }
2676        }
2677
2678        return 0;
2679}
2680
2681static int identity_mapping(struct device *dev)
2682{
2683        struct device_domain_info *info;
2684
2685        if (likely(!iommu_identity_mapping))
2686                return 0;
2687
2688        info = dev->archdata.iommu;
2689        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2690                return (info->domain == si_domain);
2691
2692        return 0;
2693}
2694
2695static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2696{
2697        struct dmar_domain *ndomain;
2698        struct intel_iommu *iommu;
2699        u8 bus, devfn;
2700
2701        iommu = device_to_iommu(dev, &bus, &devfn);
2702        if (!iommu)
2703                return -ENODEV;
2704
2705        ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2706        if (ndomain != domain)
2707                return -EBUSY;
2708
2709        return 0;
2710}
2711
2712static bool device_has_rmrr(struct device *dev)
2713{
2714        struct dmar_rmrr_unit *rmrr;
2715        struct device *tmp;
2716        int i;
2717
2718        rcu_read_lock();
2719        for_each_rmrr_units(rmrr) {
2720                /*
2721                 * Return TRUE if this RMRR contains the device that
2722                 * is passed in.
2723                 */
2724                for_each_active_dev_scope(rmrr->devices,
2725                                          rmrr->devices_cnt, i, tmp)
2726                        if (tmp == dev) {
2727                                rcu_read_unlock();
2728                                return true;
2729                        }
2730        }
2731        rcu_read_unlock();
2732        return false;
2733}
2734
2735/*
2736 * There are a couple cases where we need to restrict the functionality of
2737 * devices associated with RMRRs.  The first is when evaluating a device for
2738 * identity mapping because problems exist when devices are moved in and out
2739 * of domains and their respective RMRR information is lost.  This means that
2740 * a device with associated RMRRs will never be in a "passthrough" domain.
2741 * The second is use of the device through the IOMMU API.  This interface
2742 * expects to have full control of the IOVA space for the device.  We cannot
2743 * satisfy both the requirement that RMRR access is maintained and have an
2744 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2745 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2746 * We therefore prevent devices associated with an RMRR from participating in
2747 * the IOMMU API, which eliminates them from device assignment.
2748 *
2749 * In both cases we assume that PCI USB devices with RMRRs have them largely
2750 * for historical reasons and that the RMRR space is not actively used post
2751 * boot.  This exclusion may change if vendors begin to abuse it.
2752 *
2753 * The same exception is made for graphics devices, with the requirement that
2754 * any use of the RMRR regions will be torn down before assigning the device
2755 * to a guest.
2756 */
2757static bool device_is_rmrr_locked(struct device *dev)
2758{
2759        if (!device_has_rmrr(dev))
2760                return false;
2761
2762        if (dev_is_pci(dev)) {
2763                struct pci_dev *pdev = to_pci_dev(dev);
2764
2765                if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2766                        return false;
2767        }
2768
2769        return true;
2770}
2771
2772static int iommu_should_identity_map(struct device *dev, int startup)
2773{
2774
2775        if (dev_is_pci(dev)) {
2776                struct pci_dev *pdev = to_pci_dev(dev);
2777
2778                if (device_is_rmrr_locked(dev))
2779                        return 0;
2780
2781                if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2782                        return 1;
2783
2784                if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2785                        return 1;
2786
2787                if (!(iommu_identity_mapping & IDENTMAP_ALL))
2788                        return 0;
2789
2790                /*
2791                 * We want to start off with all devices in the 1:1 domain, and
2792                 * take them out later if we find they can't access all of memory.
2793                 *
2794                 * However, we can't do this for PCI devices behind bridges,
2795                 * because all PCI devices behind the same bridge will end up
2796                 * with the same source-id on their transactions.
2797                 *
2798                 * Practically speaking, we can't change things around for these
2799                 * devices at run-time, because we can't be sure there'll be no
2800                 * DMA transactions in flight for any of their siblings.
2801                 *
2802                 * So PCI devices (unless they're on the root bus) as well as
2803                 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2804                 * the 1:1 domain, just in _case_ one of their siblings turns out
2805                 * not to be able to map all of memory.
2806                 */
2807                if (!pci_is_pcie(pdev)) {
2808                        if (!pci_is_root_bus(pdev->bus))
2809                                return 0;
2810                        if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2811                                return 0;
2812                } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2813                        return 0;
2814        } else {
2815                if (device_has_rmrr(dev))
2816                        return 0;
2817        }
2818
2819        /*
2820         * At boot time, we don't yet know if devices will be 64-bit capable.
2821         * Assume that they will — if they turn out not to be, then we can
2822         * take them out of the 1:1 domain later.
2823         */
2824        if (!startup) {
2825                /*
2826                 * If the device's dma_mask is less than the system's memory
2827                 * size then this is not a candidate for identity mapping.
2828                 */
2829                u64 dma_mask = *dev->dma_mask;
2830
2831                if (dev->coherent_dma_mask &&
2832                    dev->coherent_dma_mask < dma_mask)
2833                        dma_mask = dev->coherent_dma_mask;
2834
2835                return dma_mask >= dma_get_required_mask(dev);
2836        }
2837
2838        return 1;
2839}
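
/*
 * Illustrative case for the run-time check above: a device whose
 * dma_mask is DMA_BIT_MASK(32) on a machine where
 * dma_get_required_mask() reports a 36-bit requirement fails the
 * comparison, since identity-mapped DMA above 4GiB would be beyond
 * what the device can address, so it is kept out of the 1:1 domain
 * and falls back to a normal DMA-remapping domain.
 */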
2840
2841static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2842{
2843        int ret;
2844
2845        if (!iommu_should_identity_map(dev, 1))
2846                return 0;
2847
2848        ret = domain_add_dev_info(si_domain, dev);
2849        if (!ret)
2850                pr_info("%s identity mapping for device %s\n",
2851                        hw ? "Hardware" : "Software", dev_name(dev));
2852        else if (ret == -ENODEV)
2853                /* device not associated with an iommu */
2854                ret = 0;
2855
2856        return ret;
2857}
2858
2859
2860static int __init iommu_prepare_static_identity_mapping(int hw)
2861{
2862        struct pci_dev *pdev = NULL;
2863        struct dmar_drhd_unit *drhd;
2864        struct intel_iommu *iommu;
2865        struct device *dev;
2866        int i;
2867        int ret = 0;
2868
2869        for_each_pci_dev(pdev) {
2870                ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2871                if (ret)
2872                        return ret;
2873        }
2874
2875        for_each_active_iommu(iommu, drhd)
2876                for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2877                        struct acpi_device_physical_node *pn;
2878                        struct acpi_device *adev;
2879
2880                        if (dev->bus != &acpi_bus_type)
2881                                continue;
2882
2883                        adev = to_acpi_device(dev);
2884                        mutex_lock(&adev->physical_node_lock);
2885                        list_for_each_entry(pn, &adev->physical_node_list, node) {
2886                                ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2887                                if (ret)
2888                                        break;
2889                        }
2890                        mutex_unlock(&adev->physical_node_lock);
2891                        if (ret)
2892                                return ret;
2893                }
2894
2895        return 0;
2896}
2897
2898static void intel_iommu_init_qi(struct intel_iommu *iommu)
2899{
2900        /*
2901         * Start from a sane IOMMU hardware state.
2902         * If queued invalidation was already initialized by us
2903         * (for example, while enabling interrupt remapping), then
2904         * things are already rolling from a sane state.
2905         */
2906        if (!iommu->qi) {
2907                /*
2908                 * Clear any previous faults.
2909                 */
2910                dmar_fault(-1, iommu);
2911                /*
2912                 * Disable queued invalidation if supported and already enabled
2913                 * before OS handover.
2914                 */
2915                dmar_disable_qi(iommu);
2916        }
2917
2918        if (dmar_enable_qi(iommu)) {
2919                /*
2920                 * Queued invalidation is not enabled; use register-based invalidation
2921                 */
2922                iommu->flush.flush_context = __iommu_flush_context;
2923                iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2924                pr_info("%s: Using Register based invalidation\n",
2925                        iommu->name);
2926        } else {
2927                iommu->flush.flush_context = qi_flush_context;
2928                iommu->flush.flush_iotlb = qi_flush_iotlb;
2929                pr_info("%s: Using Queued invalidation\n", iommu->name);
2930        }
2931}
2932
2933static int copy_context_table(struct intel_iommu *iommu,
2934                              struct root_entry *old_re,
2935                              struct context_entry **tbl,
2936                              int bus, bool ext)
2937{
2938        int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2939        struct context_entry *new_ce = NULL, ce;
2940        struct context_entry *old_ce = NULL;
2941        struct root_entry re;
2942        phys_addr_t old_ce_phys;
2943
2944        tbl_idx = ext ? bus * 2 : bus;
2945        memcpy(&re, old_re, sizeof(re));
2946
2947        for (devfn = 0; devfn < 256; devfn++) {
2948                /* First calculate the correct index */
2949                idx = (ext ? devfn * 2 : devfn) % 256;
2950
2951                if (idx == 0) {
2952                        /* First save what we may have and clean up */
2953                        if (new_ce) {
2954                                tbl[tbl_idx] = new_ce;
2955                                __iommu_flush_cache(iommu, new_ce,
2956                                                    VTD_PAGE_SIZE);
2957                                pos = 1;
2958                        }
2959
2960                        if (old_ce)
2961                                iounmap(old_ce);
2962
2963                        ret = 0;
2964                        if (devfn < 0x80)
2965                                old_ce_phys = root_entry_lctp(&re);
2966                        else
2967                                old_ce_phys = root_entry_uctp(&re);
2968
2969                        if (!old_ce_phys) {
2970                                if (ext && devfn == 0) {
2971                                        /* No LCTP, try UCTP */
2972                                        devfn = 0x7f;
2973                                        continue;
2974                                } else {
2975                                        goto out;
2976                                }
2977                        }
2978
2979                        ret = -ENOMEM;
2980                        old_ce = memremap(old_ce_phys, PAGE_SIZE,
2981                                        MEMREMAP_WB);
2982                        if (!old_ce)
2983                                goto out;
2984
2985                        new_ce = alloc_pgtable_page(iommu->node);
2986                        if (!new_ce)
2987                                goto out_unmap;
2988
2989                        ret = 0;
2990                }
2991
2992                /* Now copy the context entry */
2993                memcpy(&ce, old_ce + idx, sizeof(ce));
2994
2995                if (!__context_present(&ce))
2996                        continue;
2997
2998                did = context_domain_id(&ce);
2999                if (did >= 0 && did < cap_ndoms(iommu->cap))
3000                        set_bit(did, iommu->domain_ids);
3001
3002                /*
3003                 * We need a marker for copied context entries. This
3004                 * marker needs to work for the old format as well as
3005                 * for extended context entries.
3006                 *
3007                 * Bit 67 of the context entry is used. In the old
3008                 * format this bit is available to software, in the
3009                 * extended format it is the PGE bit, but PGE is ignored
3010                 * by HW if PASIDs are disabled (and thus still
3011                 * available).
3012                 *
3013                 * So disable PASIDs first and then mark the entry
3014                 * copied. This means that we don't copy PASID
3015                 * translations from the old kernel, but this is fine as
3016                 * faults there are not fatal.
3017                 */
3018                context_clear_pasid_enable(&ce);
3019                context_set_copied(&ce);
3020
3021                new_ce[idx] = ce;
3022        }
3023
3024        tbl[tbl_idx + pos] = new_ce;
3025
3026        __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3027
3028out_unmap:
3029        memunmap(old_ce);
3030
3031out:
3032        return ret;
3033}
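
/*
 * Layout example for the extended-context case (ext == true): each
 * extended entry is twice the legacy size, so the 256 devfns of one
 * bus span two pages. Entries for devfn 0x00-0x7f come from the lower
 * context-table pointer and are stored at tbl[bus * 2]; devfn
 * 0x80-0xff come from the upper pointer and land at tbl[bus * 2 + 1]
 * (via 'pos'), with idx = (devfn * 2) % 256 selecting the slot within
 * the page -- e.g. bus 3, devfn 0x90 is copied into tbl[7], index 0x20.
 */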
3034
3035static int copy_translation_tables(struct intel_iommu *iommu)
3036{
3037        struct context_entry **ctxt_tbls;
3038        struct root_entry *old_rt;
3039        phys_addr_t old_rt_phys;
3040        int ctxt_table_entries;
3041        unsigned long flags;
3042        u64 rtaddr_reg;
3043        int bus, ret;
3044        bool new_ext, ext;
3045
3046        rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3047        ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3048        new_ext    = !!ecap_ecs(iommu->ecap);
3049
3050        /*
3051         * The RTT bit can only be changed when translation is disabled,
3052         * but disabling translation would open a window for data
3053         * corruption. So bail out and don't copy anything if we would
3054         * have to change the bit.
3055         */
3056        if (new_ext != ext)
3057                return -EINVAL;
3058
3059        old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3060        if (!old_rt_phys)
3061                return -EINVAL;
3062
3063        old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3064        if (!old_rt)
3065                return -ENOMEM;
3066
3067        /* This is too big for the stack - allocate it from slab */
3068        ctxt_table_entries = ext ? 512 : 256;
3069        ret = -ENOMEM;
3070        ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3071        if (!ctxt_tbls)
3072                goto out_unmap;
3073
3074        for (bus = 0; bus < 256; bus++) {
3075                ret = copy_context_table(iommu, &old_rt[bus],
3076                                         ctxt_tbls, bus, ext);
3077                if (ret) {
3078                        pr_err("%s: Failed to copy context table for bus %d\n",
3079                                iommu->name, bus);
3080                        continue;
3081                }
3082        }
3083
3084        spin_lock_irqsave(&iommu->lock, flags);
3085
3086        /* Context tables are copied, now write them to the root_entry table */
3087        for (bus = 0; bus < 256; bus++) {
3088                int idx = ext ? bus * 2 : bus;
3089                u64 val;
3090
3091                if (ctxt_tbls[idx]) {
3092                        val = virt_to_phys(ctxt_tbls[idx]) | 1;
3093                        iommu->root_entry[bus].lo = val;
3094                }
3095
3096                if (!ext || !ctxt_tbls[idx + 1])
3097                        continue;
3098
3099                val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3100                iommu->root_entry[bus].hi = val;
3101        }
3102
3103        spin_unlock_irqrestore(&iommu->lock, flags);
3104
3105        kfree(ctxt_tbls);
3106
3107        __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3108
3109        ret = 0;
3110
3111out_unmap:
3112        memunmap(old_rt);
3113
3114        return ret;
3115}
3116
3117static int __init init_dmars(void)
3118{
3119        struct dmar_drhd_unit *drhd;
3120        struct dmar_rmrr_unit *rmrr;
3121        bool copied_tables = false;
3122        struct device *dev;
3123        struct intel_iommu *iommu;
3124        int i, ret, cpu;
3125
3126        /*
3127         * for each drhd
3128         *    allocate root
3129         *    initialize and program root entry to not present
3130         * endfor
3131         */
3132        for_each_drhd_unit(drhd) {
3133                /*
3134                 * lock not needed as this is only incremented in the
3135                 * single-threaded kernel __init code path; all other
3136                 * accesses are read-only
3137                 */
3138                if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3139                        g_num_of_iommus++;
3140                        continue;
3141                }
3142                pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3143        }
3144
3145        /* Preallocate enough resources for IOMMU hot-addition */
3146        if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3147                g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3148
3149        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3150                        GFP_KERNEL);
3151        if (!g_iommus) {
3152                pr_err("Allocating global iommu array failed\n");
3153                ret = -ENOMEM;
3154                goto error;
3155        }
3156
3157        for_each_possible_cpu(cpu) {
3158                struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush,
3159                                                              cpu);
3160
3161                dfd->tables = kzalloc(g_num_of_iommus *
3162                                      sizeof(struct deferred_flush_table),
3163                                      GFP_KERNEL);
3164                if (!dfd->tables) {
3165                        ret = -ENOMEM;
3166                        goto free_g_iommus;
3167                }
3168
3169                spin_lock_init(&dfd->lock);
3170                setup_timer(&dfd->timer, flush_unmaps_timeout, cpu);
3171        }
3172
3173        for_each_active_iommu(iommu, drhd) {
3174                g_iommus[iommu->seq_id] = iommu;
3175
3176                intel_iommu_init_qi(iommu);
3177
3178                ret = iommu_init_domains(iommu);
3179                if (ret)
3180                        goto free_iommu;
3181
3182                init_translation_status(iommu);
3183
3184                if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3185                        iommu_disable_translation(iommu);
3186                        clear_translation_pre_enabled(iommu);
3187                        pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3188                                iommu->name);
3189                }
3190
3191                /*
3192                 * TBD:
3193                 * we could share the same root & context tables
3194                 * among all IOMMUs. Need to split it later.
3195                 */
3196                ret = iommu_alloc_root_entry(iommu);
3197                if (ret)
3198                        goto free_iommu;
3199
3200                if (translation_pre_enabled(iommu)) {
3201                        pr_info("Translation already enabled - trying to copy translation structures\n");
3202
3203                        ret = copy_translation_tables(iommu);
3204                        if (ret) {
3205                                /*
3206                                 * We found the IOMMU with translation
3207                                 * enabled - but failed to copy over the
3208                                 * old root-entry table. Try to proceed
3209                                 * by disabling translation now and
3210                                 * allocating a clean root-entry table.
3211                                 * This might cause DMAR faults, but
3212                                 * probably the dump will still succeed.
3213                                 */
3214                                pr_err("Failed to copy translation tables from previous kernel for %s\n",
3215                                       iommu->name);
3216                                iommu_disable_translation(iommu);
3217                                clear_translation_pre_enabled(iommu);
3218                        } else {
3219                                pr_info("Copied translation tables from previous kernel for %s\n",
3220                                        iommu->name);
3221                                copied_tables = true;
3222                        }
3223                }
3224
3225                if (!ecap_pass_through(iommu->ecap))
3226                        hw_pass_through = 0;
3227#ifdef CONFIG_INTEL_IOMMU_SVM
3228                if (pasid_enabled(iommu))
3229                        intel_svm_alloc_pasid_tables(iommu);
3230#endif
3231        }
3232
3233        /*
3234         * Now that qi is enabled on all iommus, set the root entry and flush
3235         * caches. This is required on some Intel X58 chipsets, otherwise the
3236         * flush_context function will loop forever and the boot hangs.
3237         */
3238        for_each_active_iommu(iommu, drhd) {
3239                iommu_flush_write_buffer(iommu);
3240                iommu_set_root_entry(iommu);
3241                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3242                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3243        }
3244
3245        if (iommu_pass_through)
3246                iommu_identity_mapping |= IDENTMAP_ALL;
3247
3248#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3249        iommu_identity_mapping |= IDENTMAP_GFX;
3250#endif
3251
3252        if (iommu_identity_mapping) {
3253                ret = si_domain_init(hw_pass_through);
3254                if (ret)
3255                        goto free_iommu;
3256        }
3257
3258        check_tylersburg_isoch();
3259
3260        /*
3261         * If we copied translations from a previous kernel in the kdump
3262         * case, we cannot assign the devices to domains now, as that
3263         * would eliminate the old mappings. So skip this part and defer
3264         * the assignment to device driver initialization time.
3265         */
3266        if (copied_tables)
3267                goto domains_done;
3268
3269        /*
3270         * If pass through is not set or not enabled, set up context entries
3271         * for identity mappings for rmrr, gfx and isa, and possibly fall back
3272         * to static identity mapping if iommu_identity_mapping is set.
3273         */
3274        if (iommu_identity_mapping) {
3275                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3276                if (ret) {
3277                        pr_crit("Failed to setup IOMMU pass-through\n");
3278                        goto free_iommu;
3279                }
3280        }
3281        /*
3282         * For each rmrr
3283         *   for each dev attached to rmrr
3284         *   do
3285         *     locate drhd for dev, alloc domain for dev
3286         *     allocate free domain
3287         *     allocate page table entries for rmrr
3288         *     if context not allocated for bus
3289         *           allocate and init context
3290         *           set present in root table for this bus
3291         *     init context with domain, translation etc
3292         *    endfor
3293         * endfor
3294         */
3295        pr_info("Setting RMRR:\n");
3296        for_each_rmrr_units(rmrr) {
3297                /* some BIOSes list non-existent devices in the DMAR table. */
3298                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3299                                          i, dev) {
3300                        ret = iommu_prepare_rmrr_dev(rmrr, dev);
3301                        if (ret)
3302                                pr_err("Mapping reserved region failed\n");
3303                }
3304        }
3305
3306        iommu_prepare_isa();
3307
3308domains_done:
3309
3310        /*
3311         * for each drhd
3312         *   enable fault log
3313         *   global invalidate context cache
3314         *   global invalidate iotlb
3315         *   enable translation
3316         */
3317        for_each_iommu(iommu, drhd) {
3318                if (drhd->ignored) {
3319                        /*
3320                         * we always have to disable PMRs or DMA may fail on
3321                         * this device
3322                         */
3323                        if (force_on)
3324                                iommu_disable_protect_mem_regions(iommu);
3325                        continue;
3326                }
3327
3328                iommu_flush_write_buffer(iommu);
3329
3330#ifdef CONFIG_INTEL_IOMMU_SVM
3331                if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3332                        ret = intel_svm_enable_prq(iommu);
3333                        if (ret)
3334                                goto free_iommu;
3335                }
3336#endif
3337                ret = dmar_set_interrupt(iommu);
3338                if (ret)
3339                        goto free_iommu;
3340
3341                if (!translation_pre_enabled(iommu))
3342                        iommu_enable_translation(iommu);
3343
3344                iommu_disable_protect_mem_regions(iommu);
3345        }
3346
3347        return 0;
3348
3349free_iommu:
3350        for_each_active_iommu(iommu, drhd) {
3351                disable_dmar_iommu(iommu);
3352                free_dmar_iommu(iommu);
3353        }
3354free_g_iommus:
3355        for_each_possible_cpu(cpu)
3356                kfree(per_cpu_ptr(&deferred_flush, cpu)->tables);
3357        kfree(g_iommus);
3358error:
3359        return ret;
3360}
3361
3362/* This takes a number of _MM_ pages, not VTD pages */
3363static unsigned long intel_alloc_iova(struct device *dev,
3364                                     struct dmar_domain *domain,
3365                                     unsigned long nrpages, uint64_t dma_mask)
3366{
3367        unsigned long iova_pfn = 0;
3368
3369        /* Restrict dma_mask to the width that the iommu can handle */
3370        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3371        /* Ensure we reserve the whole size-aligned region */
3372        nrpages = __roundup_pow_of_two(nrpages);
3373
3374        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3375                /*
3376                 * First try to allocate an io virtual address in
3377                 * DMA_BIT_MASK(32) and if that fails then try allocating
3378                 * from higher range
3379                 */
3380                iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3381                                           IOVA_PFN(DMA_BIT_MASK(32)));
3382                if (iova_pfn)
3383                        return iova_pfn;
3384        }
3385        iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
3386        if (unlikely(!iova_pfn)) {
3387                pr_err("Allocating %ld-page iova for %s failed\n",
3388                       nrpages, dev_name(dev));
3389                return 0;
3390        }
3391
3392        return iova_pfn;
3393}
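/*
 * Worked example: a request for 3 MM pages is rounded up to 4 by
 * __roundup_pow_of_two() above, so the allocator always reserves the whole
 * size-aligned region.  For devices with a mask wider than 32 bits (and
 * unless dmar_forcedac is set), the allocation is first attempted below
 * DMA_BIT_MASK(32) and only falls back to the device's full dma_mask when
 * that range is exhausted.
 */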
3394
3395static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3396{
3397        struct dmar_rmrr_unit *rmrr;
3398        struct dmar_domain *domain;
3399        struct device *i_dev;
3400        int i, ret;
3401
3402        domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3403        if (!domain) {
3404                pr_err("Allocating domain for %s failed\n",
3405                       dev_name(dev));
3406                return NULL;
3407        }
3408
3409        /* We have a new domain - setup possible RMRRs for the device */
3410        rcu_read_lock();
3411        for_each_rmrr_units(rmrr) {
3412                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3413                                          i, i_dev) {
3414                        if (i_dev != dev)
3415                                continue;
3416
3417                        ret = domain_prepare_identity_map(dev, domain,
3418                                                          rmrr->base_address,
3419                                                          rmrr->end_address);
3420                        if (ret)
3421                                dev_err(dev, "Mapping reserved region failed\n");
3422                }
3423        }
3424        rcu_read_unlock();
3425
3426        return domain;
3427}
3428
3429static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3430{
3431        struct device_domain_info *info;
3432
3433        /* No lock here, assumes no domain exit in normal case */
3434        info = dev->archdata.iommu;
3435        if (likely(info))
3436                return info->domain;
3437
3438        return __get_valid_domain_for_dev(dev);
3439}
3440
3441/* Check if the dev needs to go through the non-identity map and unmap process. */
3442static int iommu_no_mapping(struct device *dev)
3443{
3444        int found;
3445
3446        if (iommu_dummy(dev))
3447                return 1;
3448
3449        if (!iommu_identity_mapping)
3450                return 0;
3451
3452        found = identity_mapping(dev);
3453        if (found) {
3454                if (iommu_should_identity_map(dev, 0))
3455                        return 1;
3456                else {
3457                        /*
3458                         * The 32 bit DMA device is removed from si_domain
3459                         * and falls back to non-identity mapping.
3460                         */
3461                        dmar_remove_one_dev_info(si_domain, dev);
3462                        pr_info("32bit %s uses non-identity mapping\n",
3463                                dev_name(dev));
3464                        return 0;
3465                }
3466        } else {
3467                /*
3468                 * In case a 64 bit DMA device is detached from a VM, the
3469                 * device is put back into si_domain for identity mapping.
3470                 */
3471                if (iommu_should_identity_map(dev, 0)) {
3472                        int ret;
3473                        ret = domain_add_dev_info(si_domain, dev);
3474                        if (!ret) {
3475                                pr_info("64bit %s uses identity mapping\n",
3476                                        dev_name(dev));
3477                                return 1;
3478                        }
3479                }
3480        }
3481
3482        return 0;
3483}
3484
3485static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3486                                     size_t size, int dir, u64 dma_mask)
3487{
3488        struct dmar_domain *domain;
3489        phys_addr_t start_paddr;
3490        unsigned long iova_pfn;
3491        int prot = 0;
3492        int ret;
3493        struct intel_iommu *iommu;
3494        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3495
3496        BUG_ON(dir == DMA_NONE);
3497
3498        if (iommu_no_mapping(dev))
3499                return paddr;
3500
3501        domain = get_valid_domain_for_dev(dev);
3502        if (!domain)
3503                return 0;
3504
3505        iommu = domain_get_iommu(domain);
3506        size = aligned_nrpages(paddr, size);
3507
3508        iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3509        if (!iova_pfn)
3510                goto error;
3511
3512        /*
3513         * Check if DMAR supports zero-length reads on write only
3514         * mappings.
3515         */
3516        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3517                        !cap_zlr(iommu->cap))
3518                prot |= DMA_PTE_READ;
3519        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3520                prot |= DMA_PTE_WRITE;
3521        /*
3522         * The range paddr .. paddr + size might cover partial pages; map
3523         * the whole pages.  Note: if two parts of one page are mapped
3524         * separately, we might have two guest_addr mappings to the same
3525         * host paddr, but this is not a big problem
3526         */
3527        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3528                                 mm_to_dma_pfn(paddr_pfn), size, prot);
3529        if (ret)
3530                goto error;
3531
3532        /* it's a non-present to present mapping. Only flush if caching mode */
3533        if (cap_caching_mode(iommu->cap))
3534                iommu_flush_iotlb_psi(iommu, domain,
3535                                      mm_to_dma_pfn(iova_pfn),
3536                                      size, 0, 1);
3537        else
3538                iommu_flush_write_buffer(iommu);
3539
3540        start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3541        start_paddr += paddr & ~PAGE_MASK;
3542        return start_paddr;
3543
3544error:
3545        if (iova_pfn)
3546                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3547        pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3548                dev_name(dev), size, (unsigned long long)paddr, dir);
3549        return 0;
3550}
3551
3552static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3553                                 unsigned long offset, size_t size,
3554                                 enum dma_data_direction dir,
3555                                 unsigned long attrs)
3556{
3557        return __intel_map_single(dev, page_to_phys(page) + offset, size,
3558                                  dir, *dev->dma_mask);
3559}
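/*
 * Drivers do not call __intel_map_single() directly; a plain DMA-API call
 * reaches it through the .map_page callback above.  A hypothetical fragment
 * (buf and len are placeholders) would look like:
 *
 *	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(dev, handle))
 *		return -ENOMEM;
 *
 * dma_mapping_error() ends up in intel_mapping_error() below, which treats a
 * zero dma_addr_t as failure - matching __intel_map_single() returning 0 on
 * error.
 */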
3560
3561static void flush_unmaps(struct deferred_flush_data *flush_data)
3562{
3563        int i, j;
3564
3565        flush_data->timer_on = 0;
3566
3567        /* just flush them all */
3568        for (i = 0; i < g_num_of_iommus; i++) {
3569                struct intel_iommu *iommu = g_iommus[i];
3570                struct deferred_flush_table *flush_table =
3571                                &flush_data->tables[i];
3572                if (!iommu)
3573                        continue;
3574
3575                if (!flush_table->next)
3576                        continue;
3577
3578                /* In caching mode, global flushes make emulation expensive */
3579                if (!cap_caching_mode(iommu->cap))
3580                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3581                                         DMA_TLB_GLOBAL_FLUSH);
3582                for (j = 0; j < flush_table->next; j++) {
3583                        unsigned long mask;
3584                        struct deferred_flush_entry *entry =
3585                                                &flush_table->entries[j];
3586                        unsigned long iova_pfn = entry->iova_pfn;
3587                        unsigned long nrpages = entry->nrpages;
3588                        struct dmar_domain *domain = entry->domain;
3589                        struct page *freelist = entry->freelist;
3590
3591                        /* On real hardware multiple invalidations are expensive */
3592                        if (cap_caching_mode(iommu->cap))
3593                                iommu_flush_iotlb_psi(iommu, domain,
3594                                        mm_to_dma_pfn(iova_pfn),
3595                                        nrpages, !freelist, 0);
3596                        else {
3597                                mask = ilog2(nrpages);
3598                                iommu_flush_dev_iotlb(domain,
3599                                                (uint64_t)iova_pfn << PAGE_SHIFT, mask);
3600                        }
3601                        free_iova_fast(&domain->iovad, iova_pfn, nrpages);
3602                        if (freelist)
3603                                dma_free_pagelist(freelist);
3604                }
3605                flush_table->next = 0;
3606        }
3607
3608        flush_data->size = 0;
3609}
3610
3611static void flush_unmaps_timeout(unsigned long cpuid)
3612{
3613        struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3614        unsigned long flags;
3615
3616        spin_lock_irqsave(&flush_data->lock, flags);
3617        flush_unmaps(flush_data);
3618        spin_unlock_irqrestore(&flush_data->lock, flags);
3619}
3620
3621static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
3622                      unsigned long nrpages, struct page *freelist)
3623{
3624        unsigned long flags;
3625        int entry_id, iommu_id;
3626        struct intel_iommu *iommu;
3627        struct deferred_flush_entry *entry;
3628        struct deferred_flush_data *flush_data;
3629        unsigned int cpuid;
3630
3631        cpuid = get_cpu();
3632        flush_data = per_cpu_ptr(&deferred_flush, cpuid);
3633
3634        /* Flush all CPUs' entries to avoid deferring too much.  If
3635         * this becomes a bottleneck, we could flush only this CPU and
3636         * rely on the flush timer for the rest.
3637         */
3638        if (flush_data->size == HIGH_WATER_MARK) {
3639                int cpu;
3640
3641                for_each_online_cpu(cpu)
3642                        flush_unmaps_timeout(cpu);
3643        }
3644
3645        spin_lock_irqsave(&flush_data->lock, flags);
3646
3647        iommu = domain_get_iommu(dom);
3648        iommu_id = iommu->seq_id;
3649
3650        entry_id = flush_data->tables[iommu_id].next;
3651        ++(flush_data->tables[iommu_id].next);
3652
3653        entry = &flush_data->tables[iommu_id].entries[entry_id];
3654        entry->domain = dom;
3655        entry->iova_pfn = iova_pfn;
3656        entry->nrpages = nrpages;
3657        entry->freelist = freelist;
3658
3659        if (!flush_data->timer_on) {
3660                mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10));
3661                flush_data->timer_on = 1;
3662        }
3663        flush_data->size++;
3664        spin_unlock_irqrestore(&flush_data->lock, flags);
3665
3666        put_cpu();
3667}
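/*
 * Sketch of the deferred-flush scheme: unmaps are queued per CPU and per
 * IOMMU in deferred_flush_data.  A queue is drained when the 10ms timer
 * armed above fires, when this CPU has accumulated HIGH_WATER_MARK pending
 * entries (in which case every online CPU is drained), or - see the CPU
 * notifier near the end of this file - when the owning CPU goes offline.
 */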
3668
3669static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3670{
3671        struct dmar_domain *domain;
3672        unsigned long start_pfn, last_pfn;
3673        unsigned long nrpages;
3674        unsigned long iova_pfn;
3675        struct intel_iommu *iommu;
3676        struct page *freelist;
3677
3678        if (iommu_no_mapping(dev))
3679                return;
3680
3681        domain = find_domain(dev);
3682        BUG_ON(!domain);
3683
3684        iommu = domain_get_iommu(domain);
3685
3686        iova_pfn = IOVA_PFN(dev_addr);
3687
3688        nrpages = aligned_nrpages(dev_addr, size);
3689        start_pfn = mm_to_dma_pfn(iova_pfn);
3690        last_pfn = start_pfn + nrpages - 1;
3691
3692        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3693                 dev_name(dev), start_pfn, last_pfn);
3694
3695        freelist = domain_unmap(domain, start_pfn, last_pfn);
3696
3697        if (intel_iommu_strict) {
3698                iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3699                                      nrpages, !freelist, 0);
3700                /* free iova */
3701                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3702                dma_free_pagelist(freelist);
3703        } else {
3704                add_unmap(domain, iova_pfn, nrpages, freelist);
3705                /*
3706                 * queue up the release of the unmap to save the ~1/6th of
3707                 * the cpu used up by the iotlb flush operation...
3708                 */
3709        }
3710}
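/*
 * intel_iommu_strict trades performance for safety here: in strict mode the
 * IOTLB is invalidated and the IOVA freed before the unmap returns, while in
 * the default deferred mode the entry is only queued via add_unmap(), so the
 * device could in principle still hit the stale mapping until the batched
 * flush runs.
 */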
3711
3712static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3713                             size_t size, enum dma_data_direction dir,
3714                             unsigned long attrs)
3715{
3716        intel_unmap(dev, dev_addr, size);
3717}
3718
3719static void *intel_alloc_coherent(struct device *dev, size_t size,
3720                                  dma_addr_t *dma_handle, gfp_t flags,
3721                                  unsigned long attrs)
3722{
3723        struct page *page = NULL;
3724        int order;
3725
3726        size = PAGE_ALIGN(size);
3727        order = get_order(size);
3728
3729        if (!iommu_no_mapping(dev))
3730                flags &= ~(GFP_DMA | GFP_DMA32);
3731        else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3732                if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3733                        flags |= GFP_DMA;
3734                else
3735                        flags |= GFP_DMA32;
3736        }
3737
3738        if (gfpflags_allow_blocking(flags)) {
3739                unsigned int count = size >> PAGE_SHIFT;
3740
3741                page = dma_alloc_from_contiguous(dev, count, order);
3742                if (page && iommu_no_mapping(dev) &&
3743                    page_to_phys(page) + size > dev->coherent_dma_mask) {
3744                        dma_release_from_contiguous(dev, page, count);
3745                        page = NULL;
3746                }
3747        }
3748
3749        if (!page)
3750                page = alloc_pages(flags, order);
3751        if (!page)
3752                return NULL;
3753        memset(page_address(page), 0, size);
3754
3755        *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3756                                         DMA_BIDIRECTIONAL,
3757                                         dev->coherent_dma_mask);
3758        if (*dma_handle)
3759                return page_address(page);
3760        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3761                __free_pages(page, order);
3762
3763        return NULL;
3764}
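/*
 * The coherent allocation above prefers CMA (dma_alloc_from_contiguous())
 * when the caller may block, falls back to alloc_pages(), and then maps the
 * buffer with __intel_map_single() as DMA_BIDIRECTIONAL.  GFP_DMA/GFP_DMA32
 * only matter in the iommu_no_mapping() case, where the physical address
 * itself must fit the coherent_dma_mask.
 */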
3765
3766static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3767                                dma_addr_t dma_handle, unsigned long attrs)
3768{
3769        int order;
3770        struct page *page = virt_to_page(vaddr);
3771
3772        size = PAGE_ALIGN(size);
3773        order = get_order(size);
3774
3775        intel_unmap(dev, dma_handle, size);
3776        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3777                __free_pages(page, order);
3778}
3779
3780static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3781                           int nelems, enum dma_data_direction dir,
3782                           unsigned long attrs)
3783{
3784        dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3785        unsigned long nrpages = 0;
3786        struct scatterlist *sg;
3787        int i;
3788
3789        for_each_sg(sglist, sg, nelems, i) {
3790                nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3791        }
3792
3793        intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3794}
3795
3796static int intel_nontranslate_map_sg(struct device *hddev,
3797        struct scatterlist *sglist, int nelems, int dir)
3798{
3799        int i;
3800        struct scatterlist *sg;
3801
3802        for_each_sg(sglist, sg, nelems, i) {
3803                BUG_ON(!sg_page(sg));
3804                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3805                sg->dma_length = sg->length;
3806        }
3807        return nelems;
3808}
3809
3810static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3811                        enum dma_data_direction dir, unsigned long attrs)
3812{
3813        int i;
3814        struct dmar_domain *domain;
3815        size_t size = 0;
3816        int prot = 0;
3817        unsigned long iova_pfn;
3818        int ret;
3819        struct scatterlist *sg;
3820        unsigned long start_vpfn;
3821        struct intel_iommu *iommu;
3822
3823        BUG_ON(dir == DMA_NONE);
3824        if (iommu_no_mapping(dev))
3825                return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3826
3827        domain = get_valid_domain_for_dev(dev);
3828        if (!domain)
3829                return 0;
3830
3831        iommu = domain_get_iommu(domain);
3832
3833        for_each_sg(sglist, sg, nelems, i)
3834                size += aligned_nrpages(sg->offset, sg->length);
3835
3836        iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3837                                *dev->dma_mask);
3838        if (!iova_pfn) {
3839                sglist->dma_length = 0;
3840                return 0;
3841        }
3842
3843        /*
3844         * Check if DMAR supports zero-length reads on write only
3845         * mappings.
3846         */
3847        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3848                        !cap_zlr(iommu->cap))
3849                prot |= DMA_PTE_READ;
3850        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3851                prot |= DMA_PTE_WRITE;
3852
3853        start_vpfn = mm_to_dma_pfn(iova_pfn);
3854
3855        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3856        if (unlikely(ret)) {
3857                dma_pte_free_pagetable(domain, start_vpfn,
3858                                       start_vpfn + size - 1);
3859                free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3860                return 0;
3861        }
3862
3863        /* it's a non-present to present mapping. Only flush if caching mode */
3864        if (cap_caching_mode(iommu->cap))
3865                iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3866        else
3867                iommu_flush_write_buffer(iommu);
3868
3869        return nelems;
3870}
3871
3872static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3873{
3874        return !dma_addr;
3875}
3876
3877struct dma_map_ops intel_dma_ops = {
3878        .alloc = intel_alloc_coherent,
3879        .free = intel_free_coherent,
3880        .map_sg = intel_map_sg,
3881        .unmap_sg = intel_unmap_sg,
3882        .map_page = intel_map_page,
3883        .unmap_page = intel_unmap_page,
3884        .mapping_error = intel_mapping_error,
3885};
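/*
 * Illustrative usage (assuming these ops have been installed as the platform
 * dma_map_ops once the IOMMU is enabled): ordinary driver code is routed
 * through the functions above, e.g.
 *
 *	void *cpu_addr;
 *	dma_addr_t dma;
 *
 *	cpu_addr = dma_alloc_coherent(dev, SZ_4K, &dma, GFP_KERNEL);
 *		-> ends up in intel_alloc_coherent()
 *	...
 *	dma_free_coherent(dev, SZ_4K, cpu_addr, dma);
 *		-> ends up in intel_free_coherent()
 */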
3886
3887static inline int iommu_domain_cache_init(void)
3888{
3889        int ret = 0;
3890
3891        iommu_domain_cache = kmem_cache_create("iommu_domain",
3892                                         sizeof(struct dmar_domain),
3893                                         0,
3894                                         SLAB_HWCACHE_ALIGN,
3895
3896                                         NULL);
3897        if (!iommu_domain_cache) {
3898                pr_err("Couldn't create iommu_domain cache\n");
3899                ret = -ENOMEM;
3900        }
3901
3902        return ret;
3903}
3904
3905static inline int iommu_devinfo_cache_init(void)
3906{
3907        int ret = 0;
3908
3909        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3910                                         sizeof(struct device_domain_info),
3911                                         0,
3912                                         SLAB_HWCACHE_ALIGN,
3913                                         NULL);
3914        if (!iommu_devinfo_cache) {
3915                pr_err("Couldn't create devinfo cache\n");
3916                ret = -ENOMEM;
3917        }
3918
3919        return ret;
3920}
3921
3922static int __init iommu_init_mempool(void)
3923{
3924        int ret;
3925        ret = iova_cache_get();
3926        if (ret)
3927                return ret;
3928
3929        ret = iommu_domain_cache_init();
3930        if (ret)
3931                goto domain_error;
3932
3933        ret = iommu_devinfo_cache_init();
3934        if (!ret)
3935                return ret;
3936
3937        kmem_cache_destroy(iommu_domain_cache);
3938domain_error:
3939        iova_cache_put();
3940
3941        return -ENOMEM;
3942}
3943
3944static void __init iommu_exit_mempool(void)
3945{
3946        kmem_cache_destroy(iommu_devinfo_cache);
3947        kmem_cache_destroy(iommu_domain_cache);
3948        iova_cache_put();
3949}
3950
3951static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3952{
3953        struct dmar_drhd_unit *drhd;
3954        u32 vtbar;
3955        int rc;
3956
3957        /* We know that this device on this chipset has its own IOMMU.
3958         * If we find it under a different IOMMU, then the BIOS is lying
3959         * to us. Hope that the IOMMU for this device is actually
3960         * disabled, and it needs no translation...
3961         */
3962        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3963        if (rc) {
3964                /* "can't" happen */
3965                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3966                return;
3967        }
3968        vtbar &= 0xffff0000;
3969
3970        /* we know that this iommu should be at offset 0xa000 from vtbar */
3971        drhd = dmar_find_matched_drhd_unit(pdev);
3972        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3973                            TAINT_FIRMWARE_WORKAROUND,
3974                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3975                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3976}
3977DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3978
3979static void __init init_no_remapping_devices(void)
3980{
3981        struct dmar_drhd_unit *drhd;
3982        struct device *dev;
3983        int i;
3984
3985        for_each_drhd_unit(drhd) {
3986                if (!drhd->include_all) {
3987                        for_each_active_dev_scope(drhd->devices,
3988                                                  drhd->devices_cnt, i, dev)
3989                                break;
3990                        /* ignore DMAR unit if no devices exist */
3991                        if (i == drhd->devices_cnt)
3992                                drhd->ignored = 1;
3993                }
3994        }
3995
3996        for_each_active_drhd_unit(drhd) {
3997                if (drhd->include_all)
3998                        continue;
3999
4000                for_each_active_dev_scope(drhd->devices,
4001                                          drhd->devices_cnt, i, dev)
4002                        if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4003                                break;
4004                if (i < drhd->devices_cnt)
4005                        continue;
4006
4007                /* This IOMMU has *only* gfx devices. Either bypass it or
4008                   set the gfx_mapped flag, as appropriate */
4009                if (dmar_map_gfx) {
4010                        intel_iommu_gfx_mapped = 1;
4011                } else {
4012                        drhd->ignored = 1;
4013                        for_each_active_dev_scope(drhd->devices,
4014                                                  drhd->devices_cnt, i, dev)
4015                                dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4016                }
4017        }
4018}
4019
4020#ifdef CONFIG_SUSPEND
4021static int init_iommu_hw(void)
4022{
4023        struct dmar_drhd_unit *drhd;
4024        struct intel_iommu *iommu = NULL;
4025
4026        for_each_active_iommu(iommu, drhd)
4027                if (iommu->qi)
4028                        dmar_reenable_qi(iommu);
4029
4030        for_each_iommu(iommu, drhd) {
4031                if (drhd->ignored) {
4032                        /*
4033                         * we always have to disable PMRs or DMA may fail on
4034                         * this device
4035                         */
4036                        if (force_on)
4037                                iommu_disable_protect_mem_regions(iommu);
4038                        continue;
4039                }
4040
4041                iommu_flush_write_buffer(iommu);
4042
4043                iommu_set_root_entry(iommu);
4044
4045                iommu->flush.flush_context(iommu, 0, 0, 0,
4046                                           DMA_CCMD_GLOBAL_INVL);
4047                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4048                iommu_enable_translation(iommu);
4049                iommu_disable_protect_mem_regions(iommu);
4050        }
4051
4052        return 0;
4053}
4054
4055static void iommu_flush_all(void)
4056{
4057        struct dmar_drhd_unit *drhd;
4058        struct intel_iommu *iommu;
4059
4060        for_each_active_iommu(iommu, drhd) {
4061                iommu->flush.flush_context(iommu, 0, 0, 0,
4062                                           DMA_CCMD_GLOBAL_INVL);
4063                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4064                                         DMA_TLB_GLOBAL_FLUSH);
4065        }
4066}
4067
4068static int iommu_suspend(void)
4069{
4070        struct dmar_drhd_unit *drhd;
4071        struct intel_iommu *iommu = NULL;
4072        unsigned long flag;
4073
4074        for_each_active_iommu(iommu, drhd) {
4075                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4076                                                 GFP_ATOMIC);
4077                if (!iommu->iommu_state)
4078                        goto nomem;
4079        }
4080
4081        iommu_flush_all();
4082
4083        for_each_active_iommu(iommu, drhd) {
4084                iommu_disable_translation(iommu);
4085
4086                raw_spin_lock_irqsave(&iommu->register_lock, flag);
4087
4088                iommu->iommu_state[SR_DMAR_FECTL_REG] =
4089                        readl(iommu->reg + DMAR_FECTL_REG);
4090                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4091                        readl(iommu->reg + DMAR_FEDATA_REG);
4092                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4093                        readl(iommu->reg + DMAR_FEADDR_REG);
4094                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4095                        readl(iommu->reg + DMAR_FEUADDR_REG);
4096
4097                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4098        }
4099        return 0;
4100
4101nomem:
4102        for_each_active_iommu(iommu, drhd)
4103                kfree(iommu->iommu_state);
4104
4105        return -ENOMEM;
4106}
4107
4108static void iommu_resume(void)
4109{
4110        struct dmar_drhd_unit *drhd;
4111        struct intel_iommu *iommu = NULL;
4112        unsigned long flag;
4113
4114        if (init_iommu_hw()) {
4115                if (force_on)
4116                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4117                else
4118                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4119                return;
4120        }
4121
4122        for_each_active_iommu(iommu, drhd) {
4123
4124                raw_spin_lock_irqsave(&iommu->register_lock, flag);
4125
4126                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4127                        iommu->reg + DMAR_FECTL_REG);
4128                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4129                        iommu->reg + DMAR_FEDATA_REG);
4130                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4131                        iommu->reg + DMAR_FEADDR_REG);
4132                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4133                        iommu->reg + DMAR_FEUADDR_REG);
4134
4135                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4136        }
4137
4138        for_each_active_iommu(iommu, drhd)
4139                kfree(iommu->iommu_state);
4140}
4141
4142static struct syscore_ops iommu_syscore_ops = {
4143        .resume         = iommu_resume,
4144        .suspend        = iommu_suspend,
4145};
4146
4147static void __init init_iommu_pm_ops(void)
4148{
4149        register_syscore_ops(&iommu_syscore_ops);
4150}
4151
4152#else
4153static inline void init_iommu_pm_ops(void) {}
4154#endif  /* CONFIG_SUSPEND */
4155
4156
4157int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4158{
4159        struct acpi_dmar_reserved_memory *rmrr;
4160        struct dmar_rmrr_unit *rmrru;
4161
4162        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4163        if (!rmrru)
4164                return -ENOMEM;
4165
4166        rmrru->hdr = header;
4167        rmrr = (struct acpi_dmar_reserved_memory *)header;
4168        rmrru->base_address = rmrr->base_address;
4169        rmrru->end_address = rmrr->end_address;
4170        rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4171                                ((void *)rmrr) + rmrr->header.length,
4172                                &rmrru->devices_cnt);
4173        if (rmrru->devices_cnt && rmrru->devices == NULL) {
4174                kfree(rmrru);
4175                return -ENOMEM;
4176        }
4177
4178        list_add(&rmrru->list, &dmar_rmrr_units);
4179
4180        return 0;
4181}
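/*
 * RMRRs describe memory that the firmware expects devices to keep DMAing
 * into regardless of OS control (typical examples are USB legacy keyboard
 * emulation buffers and integrated-graphics stolen memory).  That is why
 * init_dmars() sets up identity mappings for every device in an RMRR's
 * scope before translation is enabled.
 */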
4182
4183static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4184{
4185        struct dmar_atsr_unit *atsru;
4186        struct acpi_dmar_atsr *tmp;
4187
4188        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4189                tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4190                if (atsr->segment != tmp->segment)
4191                        continue;
4192                if (atsr->header.length != tmp->header.length)
4193                        continue;
4194                if (memcmp(atsr, tmp, atsr->header.length) == 0)
4195                        return atsru;
4196        }
4197
4198        return NULL;
4199}
4200
4201int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4202{
4203        struct acpi_dmar_atsr *atsr;
4204        struct dmar_atsr_unit *atsru;
4205
4206        if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4207                return 0;
4208
4209        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4210        atsru = dmar_find_atsr(atsr);
4211        if (atsru)
4212                return 0;
4213
4214        atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4215        if (!atsru)
4216                return -ENOMEM;
4217
4218        /*
4219         * If memory is allocated from slab by ACPI _DSM method, we need to
4220         * copy the memory content because the memory buffer will be freed
4221         * on return.
4222         */
4223        atsru->hdr = (void *)(atsru + 1);
4224        memcpy(atsru->hdr, hdr, hdr->length);
4225        atsru->include_all = atsr->flags & 0x1;
4226        if (!atsru->include_all) {
4227                atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4228                                (void *)atsr + atsr->header.length,
4229                                &atsru->devices_cnt);
4230                if (atsru->devices_cnt && atsru->devices == NULL) {
4231                        kfree(atsru);
4232                        return -ENOMEM;
4233                }
4234        }
4235
4236        list_add_rcu(&atsru->list, &dmar_atsr_units);
4237
4238        return 0;
4239}
4240
4241static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4242{
4243        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4244        kfree(atsru);
4245}
4246
4247int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4248{
4249        struct acpi_dmar_atsr *atsr;
4250        struct dmar_atsr_unit *atsru;
4251
4252        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4253        atsru = dmar_find_atsr(atsr);
4254        if (atsru) {
4255                list_del_rcu(&atsru->list);
4256                synchronize_rcu();
4257                intel_iommu_free_atsr(atsru);
4258        }
4259
4260        return 0;
4261}
4262
4263int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4264{
4265        int i;
4266        struct device *dev;
4267        struct acpi_dmar_atsr *atsr;
4268        struct dmar_atsr_unit *atsru;
4269
4270        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4271        atsru = dmar_find_atsr(atsr);
4272        if (!atsru)
4273                return 0;
4274
4275        if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4276                for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4277                                          i, dev)
4278                        return -EBUSY;
4279        }
4280
4281        return 0;
4282}
4283
4284static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4285{
4286        int sp, ret = 0;
4287        struct intel_iommu *iommu = dmaru->iommu;
4288
4289        if (g_iommus[iommu->seq_id])
4290                return 0;
4291
4292        if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4293                pr_warn("%s: Doesn't support hardware pass through.\n",
4294                        iommu->name);
4295                return -ENXIO;
4296        }
4297        if (!ecap_sc_support(iommu->ecap) &&
4298            domain_update_iommu_snooping(iommu)) {
4299                pr_warn("%s: Doesn't support snooping.\n",
4300                        iommu->name);
4301                return -ENXIO;
4302        }
4303        sp = domain_update_iommu_superpage(iommu) - 1;
4304        if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4305                pr_warn("%s: Doesn't support large page.\n",
4306                        iommu->name);
4307                return -ENXIO;
4308        }
4309
4310        /*
4311         * Disable translation if already enabled prior to OS handover.
4312         */
4313        if (iommu->gcmd & DMA_GCMD_TE)
4314                iommu_disable_translation(iommu);
4315
4316        g_iommus[iommu->seq_id] = iommu;
4317        ret = iommu_init_domains(iommu);
4318        if (ret == 0)
4319                ret = iommu_alloc_root_entry(iommu);
4320        if (ret)
4321                goto out;
4322
4323#ifdef CONFIG_INTEL_IOMMU_SVM
4324        if (pasid_enabled(iommu))
4325                intel_svm_alloc_pasid_tables(iommu);
4326#endif
4327
4328        if (dmaru->ignored) {
4329                /*
4330                 * we always have to disable PMRs or DMA may fail on this device
4331                 */
4332                if (force_on)
4333                        iommu_disable_protect_mem_regions(iommu);
4334                return 0;
4335        }
4336
4337        intel_iommu_init_qi(iommu);
4338        iommu_flush_write_buffer(iommu);
4339
4340#ifdef CONFIG_INTEL_IOMMU_SVM
4341        if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4342                ret = intel_svm_enable_prq(iommu);
4343                if (ret)
4344                        goto disable_iommu;
4345        }
4346#endif
4347        ret = dmar_set_interrupt(iommu);
4348        if (ret)
4349                goto disable_iommu;
4350
4351        iommu_set_root_entry(iommu);
4352        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4353        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4354        iommu_enable_translation(iommu);
4355
4356        iommu_disable_protect_mem_regions(iommu);
4357        return 0;
4358
4359disable_iommu:
4360        disable_dmar_iommu(iommu);
4361out:
4362        free_dmar_iommu(iommu);
4363        return ret;
4364}
4365
4366int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4367{
4368        int ret = 0;
4369        struct intel_iommu *iommu = dmaru->iommu;
4370
4371        if (!intel_iommu_enabled)
4372                return 0;
4373        if (iommu == NULL)
4374                return -EINVAL;
4375
4376        if (insert) {
4377                ret = intel_iommu_add(dmaru);
4378        } else {
4379                disable_dmar_iommu(iommu);
4380                free_dmar_iommu(iommu);
4381        }
4382
4383        return ret;
4384}
4385
4386static void intel_iommu_free_dmars(void)
4387{
4388        struct dmar_rmrr_unit *rmrru, *rmrr_n;
4389        struct dmar_atsr_unit *atsru, *atsr_n;
4390
4391        list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4392                list_del(&rmrru->list);
4393                dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4394                kfree(rmrru);
4395        }
4396
4397        list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4398                list_del(&atsru->list);
4399                intel_iommu_free_atsr(atsru);
4400        }
4401}
4402
4403int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4404{
4405        int i, ret = 1;
4406        struct pci_bus *bus;
4407        struct pci_dev *bridge = NULL;
4408        struct device *tmp;
4409        struct acpi_dmar_atsr *atsr;
4410        struct dmar_atsr_unit *atsru;
4411
4412        dev = pci_physfn(dev);
4413        for (bus = dev->bus; bus; bus = bus->parent) {
4414                bridge = bus->self;
4415                /* If it's an integrated device, allow ATS */
4416                if (!bridge)
4417                        return 1;
4418                /* Connected via non-PCIe: no ATS */
4419                if (!pci_is_pcie(bridge) ||
4420                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4421                        return 0;
4422                /* If we found the root port, look it up in the ATSR */
4423                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4424                        break;
4425        }
4426
4427        rcu_read_lock();
4428        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4429                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4430                if (atsr->segment != pci_domain_nr(dev->bus))
4431                        continue;
4432
4433                for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4434                        if (tmp == &bridge->dev)
4435                                goto out;
4436
4437                if (atsru->include_all)
4438                        goto out;
4439        }
4440        ret = 0;
4441out:
4442        rcu_read_unlock();
4443
4444        return ret;
4445}
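/*
 * The lookup above decides whether a PCIe endpoint may use ATS (Address
 * Translation Services).  Devices integrated into the root complex (no
 * upstream bridge) are always allowed; otherwise the walk stops at the root
 * port, and ATS is permitted only if an ATSR structure for the same PCI
 * segment either lists that root port or is marked include_all.
 */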
4446
4447int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4448{
4449        int ret = 0;
4450        struct dmar_rmrr_unit *rmrru;
4451        struct dmar_atsr_unit *atsru;
4452        struct acpi_dmar_atsr *atsr;
4453        struct acpi_dmar_reserved_memory *rmrr;
4454
4455        if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4456                return 0;
4457
4458        list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4459                rmrr = container_of(rmrru->hdr,
4460                                    struct acpi_dmar_reserved_memory, header);
4461                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4462                        ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4463                                ((void *)rmrr) + rmrr->header.length,
4464                                rmrr->segment, rmrru->devices,
4465                                rmrru->devices_cnt);
4466                        if (ret < 0)
4467                                return ret;
4468                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4469                        dmar_remove_dev_scope(info, rmrr->segment,
4470                                rmrru->devices, rmrru->devices_cnt);
4471                }
4472        }
4473
4474        list_for_each_entry(atsru, &dmar_atsr_units, list) {
4475                if (atsru->include_all)
4476                        continue;
4477
4478                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4479                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4480                        ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4481                                        (void *)atsr + atsr->header.length,
4482                                        atsr->segment, atsru->devices,
4483                                        atsru->devices_cnt);
4484                        if (ret > 0)
4485                                break;
4486                        else if (ret < 0)
4487                                return ret;
4488                } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4489                        if (dmar_remove_dev_scope(info, atsr->segment,
4490                                        atsru->devices, atsru->devices_cnt))
4491                                break;
4492                }
4493        }
4494
4495        return 0;
4496}
4497
4498/*
4499 * Here we only respond to the action of a device being unbound from its driver.
4500 *
4501 * A newly added device is not attached to its DMAR domain here yet. That will
4502 * happen when the device is mapped to an iova.
4503 */
4504static int device_notifier(struct notifier_block *nb,
4505                                  unsigned long action, void *data)
4506{
4507        struct device *dev = data;
4508        struct dmar_domain *domain;
4509
4510        if (iommu_dummy(dev))
4511                return 0;
4512
4513        if (action != BUS_NOTIFY_REMOVED_DEVICE)
4514                return 0;
4515
4516        domain = find_domain(dev);
4517        if (!domain)
4518                return 0;
4519
4520        dmar_remove_one_dev_info(domain, dev);
4521        if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4522                domain_exit(domain);
4523
4524        return 0;
4525}
4526
4527static struct notifier_block device_nb = {
4528        .notifier_call = device_notifier,
4529};
4530
4531static int intel_iommu_memory_notifier(struct notifier_block *nb,
4532                                       unsigned long val, void *v)
4533{
4534        struct memory_notify *mhp = v;
4535        unsigned long long start, end;
4536        unsigned long start_vpfn, last_vpfn;
4537
4538        switch (val) {
4539        case MEM_GOING_ONLINE:
4540                start = mhp->start_pfn << PAGE_SHIFT;
4541                end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4542                if (iommu_domain_identity_map(si_domain, start, end)) {
4543                        pr_warn("Failed to build identity map for [%llx-%llx]\n",
4544                                start, end);
4545                        return NOTIFY_BAD;
4546                }
4547                break;
4548
4549        case MEM_OFFLINE:
4550        case MEM_CANCEL_ONLINE:
4551                start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4552                last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4553                while (start_vpfn <= last_vpfn) {
4554                        struct iova *iova;
4555                        struct dmar_drhd_unit *drhd;
4556                        struct intel_iommu *iommu;
4557                        struct page *freelist;
4558
4559                        iova = find_iova(&si_domain->iovad, start_vpfn);
4560                        if (iova == NULL) {
4561                                pr_debug("Failed to get IOVA for PFN %lx\n",
4562                                         start_vpfn);
4563                                break;
4564                        }
4565
4566                        iova = split_and_remove_iova(&si_domain->iovad, iova,
4567                                                     start_vpfn, last_vpfn);
4568                        if (iova == NULL) {
4569                                pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4570                                        start_vpfn, last_vpfn);
4571                                return NOTIFY_BAD;
4572                        }
4573
4574                        freelist = domain_unmap(si_domain, iova->pfn_lo,
4575                                               iova->pfn_hi);
4576
4577                        rcu_read_lock();
4578                        for_each_active_iommu(iommu, drhd)
4579                                iommu_flush_iotlb_psi(iommu, si_domain,
4580                                        iova->pfn_lo, iova_size(iova),
4581                                        !freelist, 0);
4582                        rcu_read_unlock();
4583                        dma_free_pagelist(freelist);
4584
4585                        start_vpfn = iova->pfn_hi + 1;
4586                        free_iova_mem(iova);
4587                }
4588                break;
4589        }
4590
4591        return NOTIFY_OK;
4592}
4593
4594static struct notifier_block intel_iommu_memory_nb = {
4595        .notifier_call = intel_iommu_memory_notifier,
4596        .priority = 0
4597};
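/*
 * Illustrative sketch (not part of the driver): the range arithmetic in
 * intel_iommu_memory_notifier() above for a hypothetical hotplug of 1GiB
 * starting at host PFN 0x100000, assuming 4KiB pages (PAGE_SHIFT == 12) and
 * VTD_PAGE_SHIFT == 12, so mm_to_dma_pfn() degenerates to a shift by zero.
 */
#if 0
	{
		unsigned long long start_pfn = 0x100000;	/* hypothetical */
		unsigned long long nr_pages  = 0x40000;		/* 1GiB / 4KiB */
		unsigned long long start = start_pfn << PAGE_SHIFT;
		unsigned long long end   = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;

		/*
		 * start == 0x100000000, end == 0x13fffffff: the range passed to
		 * iommu_domain_identity_map() on MEM_GOING_ONLINE, and (as DMA
		 * PFNs 0x100000..0x13ffff) torn back down on MEM_OFFLINE.
		 */
	}
#endif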
4598
4599static void free_all_cpu_cached_iovas(unsigned int cpu)
4600{
4601        int i;
4602
4603        for (i = 0; i < g_num_of_iommus; i++) {
4604                struct intel_iommu *iommu = g_iommus[i];
4605                struct dmar_domain *domain;
4606                int did;
4607
4608                if (!iommu)
4609                        continue;
4610
4611                for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4612                        domain = get_iommu_domain(iommu, (u16)did);
4613
4614                        if (!domain)
4615                                continue;
4616                        free_cpu_cached_iovas(cpu, &domain->iovad);
4617                }
4618        }
4619}
4620
4621static int intel_iommu_cpu_notifier(struct notifier_block *nfb,
4622                                    unsigned long action, void *v)
4623{
4624        unsigned int cpu = (unsigned long)v;
4625
4626        switch (action) {
4627        case CPU_DEAD:
4628        case CPU_DEAD_FROZEN:
4629                free_all_cpu_cached_iovas(cpu);
4630                flush_unmaps_timeout(cpu);
4631                break;
4632        }
4633        return NOTIFY_OK;
4634}
4635
4636static struct notifier_block intel_iommu_cpu_nb = {
4637        .notifier_call = intel_iommu_cpu_notifier,
4638};
4639
4640static ssize_t intel_iommu_show_version(struct device *dev,
4641                                        struct device_attribute *attr,
4642                                        char *buf)
4643{
4644        struct intel_iommu *iommu = dev_get_drvdata(dev);
4645        u32 ver = readl(iommu->reg + DMAR_VER_REG);
4646        return sprintf(buf, "%d:%d\n",
4647                       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4648}
4649static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4650
4651static ssize_t intel_iommu_show_address(struct device *dev,
4652                                        struct device_attribute *attr,
4653                                        char *buf)
4654{
4655        struct intel_iommu *iommu = dev_get_drvdata(dev);
4656        return sprintf(buf, "%llx\n", iommu->reg_phys);
4657}
4658static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4659
4660static ssize_t intel_iommu_show_cap(struct device *dev,
4661                                    struct device_attribute *attr,
4662                                    char *buf)
4663{
4664        struct intel_iommu *iommu = dev_get_drvdata(dev);
4665        return sprintf(buf, "%llx\n", iommu->cap);
4666}
4667static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4668
4669static ssize_t intel_iommu_show_ecap(struct device *dev,
4670                                    struct device_attribute *attr,
4671                                    char *buf)
4672{
4673        struct intel_iommu *iommu = dev_get_drvdata(dev);
4674        return sprintf(buf, "%llx\n", iommu->ecap);
4675}
4676static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4677
4678static ssize_t intel_iommu_show_ndoms(struct device *dev,
4679                                      struct device_attribute *attr,
4680                                      char *buf)
4681{
4682        struct intel_iommu *iommu = dev_get_drvdata(dev);
4683        return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4684}
4685static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4686
4687static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4688                                           struct device_attribute *attr,
4689                                           char *buf)
4690{
4691        struct intel_iommu *iommu = dev_get_drvdata(dev);
4692        return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4693                                                  cap_ndoms(iommu->cap)));
4694}
4695static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4696
4697static struct attribute *intel_iommu_attrs[] = {
4698        &dev_attr_version.attr,
4699        &dev_attr_address.attr,
4700        &dev_attr_cap.attr,
4701        &dev_attr_ecap.attr,
4702        &dev_attr_domains_supported.attr,
4703        &dev_attr_domains_used.attr,
4704        NULL,
4705};
4706
4707static struct attribute_group intel_iommu_group = {
4708        .name = "intel-iommu",
4709        .attrs = intel_iommu_attrs,
4710};
4711
4712const struct attribute_group *intel_iommu_groups[] = {
4713        &intel_iommu_group,
4714        NULL,
4715};
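/*
 * Illustrative sketch (not part of the driver): a userspace reader for the
 * attributes above. iommu_device_create() in intel_iommu_init() registers
 * them with the iommu class, so they typically surface under
 * /sys/class/iommu/<name>/intel-iommu/; the "dmar0" name used here is only
 * a hypothetical example.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/class/iommu/dmar0/intel-iommu/cap", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("DMAR capability register: %s", buf);
	if (f)
		fclose(f);
	return 0;
}
#endif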
4716
4717int __init intel_iommu_init(void)
4718{
4719        int ret = -ENODEV;
4720        struct dmar_drhd_unit *drhd;
4721        struct intel_iommu *iommu;
4722
4723        /* VT-d is required for a TXT/tboot launch, so enforce that */
4724        force_on = tboot_force_iommu();
4725
4726        if (iommu_init_mempool()) {
4727                if (force_on)
4728                        panic("tboot: Failed to initialize iommu memory\n");
4729                return -ENOMEM;
4730        }
4731
4732        down_write(&dmar_global_lock);
4733        if (dmar_table_init()) {
4734                if (force_on)
4735                        panic("tboot: Failed to initialize DMAR table\n");
4736                goto out_free_dmar;
4737        }
4738
4739        if (dmar_dev_scope_init() < 0) {
4740                if (force_on)
4741                        panic("tboot: Failed to initialize DMAR device scope\n");
4742                goto out_free_dmar;
4743        }
4744
4745        if (no_iommu || dmar_disabled)
4746                goto out_free_dmar;
4747
4748        if (list_empty(&dmar_rmrr_units))
4749                pr_info("No RMRR found\n");
4750
4751        if (list_empty(&dmar_atsr_units))
4752                pr_info("No ATSR found\n");
4753
4754        if (dmar_init_reserved_ranges()) {
4755                if (force_on)
4756                        panic("tboot: Failed to reserve iommu ranges\n");
4757                goto out_free_reserved_range;
4758        }
4759
4760        init_no_remapping_devices();
4761
4762        ret = init_dmars();
4763        if (ret) {
4764                if (force_on)
4765                        panic("tboot: Failed to initialize DMARs\n");
4766                pr_err("Initialization failed\n");
4767                goto out_free_reserved_range;
4768        }
4769        up_write(&dmar_global_lock);
4770        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4771
4772#ifdef CONFIG_SWIOTLB
4773        swiotlb = 0;
4774#endif
4775        dma_ops = &intel_dma_ops;
4776
4777        init_iommu_pm_ops();
4778
4779        for_each_active_iommu(iommu, drhd)
4780                iommu->iommu_dev = iommu_device_create(NULL, iommu,
4781                                                       intel_iommu_groups,
4782                                                       "%s", iommu->name);
4783
4784        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4785        bus_register_notifier(&pci_bus_type, &device_nb);
4786        if (si_domain && !hw_pass_through)
4787                register_memory_notifier(&intel_iommu_memory_nb);
4788        register_hotcpu_notifier(&intel_iommu_cpu_nb);
4789
4790        intel_iommu_enabled = 1;
4791
4792        return 0;
4793
4794out_free_reserved_range:
4795        put_iova_domain(&reserved_iova_list);
4796out_free_dmar:
4797        intel_iommu_free_dmars();
4798        up_write(&dmar_global_lock);
4799        iommu_exit_mempool();
4800        return ret;
4801}
4802
4803static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4804{
4805        struct intel_iommu *iommu = opaque;
4806
4807        domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4808        return 0;
4809}
4810
4811/*
4812 * NB - intel-iommu lacks any sort of reference counting for the users of
4813 * dependent devices.  If multiple endpoints have intersecting dependent
4814 * devices, unbinding the driver from any one of them will possibly leave
4815 * the others unable to operate.
4816 */
4817static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4818{
4819        if (!iommu || !dev || !dev_is_pci(dev))
4820                return;
4821
4822        pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4823}
4824
4825static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4826{
4827        struct intel_iommu *iommu;
4828        unsigned long flags;
4829
4830        assert_spin_locked(&device_domain_lock);
4831
4832        if (WARN_ON(!info))
4833                return;
4834
4835        iommu = info->iommu;
4836
4837        if (info->dev) {
4838                iommu_disable_dev_iotlb(info);
4839                domain_context_clear(iommu, info->dev);
4840        }
4841
4842        unlink_domain_info(info);
4843
4844        spin_lock_irqsave(&iommu->lock, flags);
4845        domain_detach_iommu(info->domain, iommu);
4846        spin_unlock_irqrestore(&iommu->lock, flags);
4847
4848        free_devinfo_mem(info);
4849}
4850
4851static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4852                                     struct device *dev)
4853{
4854        struct device_domain_info *info;
4855        unsigned long flags;
4856
4857        spin_lock_irqsave(&device_domain_lock, flags);
4858        info = dev->archdata.iommu;
4859        __dmar_remove_one_dev_info(info);
4860        spin_unlock_irqrestore(&device_domain_lock, flags);
4861}
4862
4863static int md_domain_init(struct dmar_domain *domain, int guest_width)
4864{
4865        int adjust_width;
4866
4867        init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4868                        DMA_32BIT_PFN);
4869        domain_reserve_special_ranges(domain);
4870
4871        /* calculate AGAW */
4872        domain->gaw = guest_width;
4873        adjust_width = guestwidth_to_adjustwidth(guest_width);
4874        domain->agaw = width_to_agaw(adjust_width);
4875
4876        domain->iommu_coherency = 0;
4877        domain->iommu_snooping = 0;
4878        domain->iommu_superpage = 0;
4879        domain->max_addr = 0;
4880
4881        /* always allocate the top pgd */
4882        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4883        if (!domain->pgd)
4884                return -ENOMEM;
4885        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4886        return 0;
4887}
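/*
 * Illustrative sketch (not part of the driver): the width arithmetic in
 * md_domain_init() above for the default 48-bit guest width, assuming the
 * usual VT-d encoding (defined earlier in this file) where an adjusted
 * width of 30 + 9 * agaw bits corresponds to (agaw + 2)-level page tables.
 *
 *   guest_width  = 48                     (DEFAULT_DOMAIN_ADDRESS_WIDTH)
 *   adjust_width = 48                     (48 - 12 is already a multiple of 9)
 *   agaw         = (48 - 30) / 9 = 2  ->  a 4-level page table
 */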
4888
4889static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4890{
4891        struct dmar_domain *dmar_domain;
4892        struct iommu_domain *domain;
4893
4894        if (type != IOMMU_DOMAIN_UNMANAGED)
4895                return NULL;
4896
4897        dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4898        if (!dmar_domain) {
4899                pr_err("Can't allocate dmar_domain\n");
4900                return NULL;
4901        }
4902        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4903                pr_err("Domain initialization failed\n");
4904                domain_exit(dmar_domain);
4905                return NULL;
4906        }
4907        domain_update_iommu_cap(dmar_domain);
4908
4909        domain = &dmar_domain->domain;
4910        domain->geometry.aperture_start = 0;
4911        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4912        domain->geometry.force_aperture = true;
4913
4914        return domain;
4915}
4916
4917static void intel_iommu_domain_free(struct iommu_domain *domain)
4918{
4919        domain_exit(to_dmar_domain(domain));
4920}
4921
4922static int intel_iommu_attach_device(struct iommu_domain *domain,
4923                                     struct device *dev)
4924{
4925        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4926        struct intel_iommu *iommu;
4927        int addr_width;
4928        u8 bus, devfn;
4929
4930        if (device_is_rmrr_locked(dev)) {
4931                dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4932                return -EPERM;
4933        }
4934
4935        /* normally dev is not mapped */
4936        if (unlikely(domain_context_mapped(dev))) {
4937                struct dmar_domain *old_domain;
4938
4939                old_domain = find_domain(dev);
4940                if (old_domain) {
4941                        rcu_read_lock();
4942                        dmar_remove_one_dev_info(old_domain, dev);
4943                        rcu_read_unlock();
4944
4945                        if (!domain_type_is_vm_or_si(old_domain) &&
4946                             list_empty(&old_domain->devices))
4947                                domain_exit(old_domain);
4948                }
4949        }
4950
4951        iommu = device_to_iommu(dev, &bus, &devfn);
4952        if (!iommu)
4953                return -ENODEV;
4954
4955        /* check if this iommu agaw is sufficient for max mapped address */
4956        addr_width = agaw_to_width(iommu->agaw);
4957        if (addr_width > cap_mgaw(iommu->cap))
4958                addr_width = cap_mgaw(iommu->cap);
4959
4960        if (dmar_domain->max_addr > (1LL << addr_width)) {
4961                pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4962                       __func__, addr_width,
4963                       dmar_domain->max_addr);
4964                return -EFAULT;
4965        }
4966        dmar_domain->gaw = addr_width;
4967
4968        /*
4969         * Knock out extra levels of page tables if necessary
4970         */
4971        while (iommu->agaw < dmar_domain->agaw) {
4972                struct dma_pte *pte;
4973
4974                pte = dmar_domain->pgd;
4975                if (dma_pte_present(pte)) {
4976                        dmar_domain->pgd = (struct dma_pte *)
4977                                phys_to_virt(dma_pte_addr(pte));
4978                        free_pgtable_page(pte);
4979                }
4980                dmar_domain->agaw--;
4981        }
4982
4983        return domain_add_dev_info(dmar_domain, dev);
4984}
4985
4986static void intel_iommu_detach_device(struct iommu_domain *domain,
4987                                      struct device *dev)
4988{
4989        dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
4990}
4991
4992static int intel_iommu_map(struct iommu_domain *domain,
4993                           unsigned long iova, phys_addr_t hpa,
4994                           size_t size, int iommu_prot)
4995{
4996        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4997        u64 max_addr;
4998        int prot = 0;
4999        int ret;
5000
5001        if (iommu_prot & IOMMU_READ)
5002                prot |= DMA_PTE_READ;
5003        if (iommu_prot & IOMMU_WRITE)
5004                prot |= DMA_PTE_WRITE;
5005        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5006                prot |= DMA_PTE_SNP;
5007
5008        max_addr = iova + size;
5009        if (dmar_domain->max_addr < max_addr) {
5010                u64 end;
5011
5012                /* check if minimum agaw is sufficient for mapped address */
5013                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5014                if (end < max_addr) {
5015                        pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5016                               __func__, dmar_domain->gaw,
5017                               max_addr);
5018                        return -EFAULT;
5019                }
5020                dmar_domain->max_addr = max_addr;
5021        }
5022        /* Round up size to next multiple of PAGE_SIZE, if it and
5023           the low bits of hpa would take us onto the next page */
5024        size = aligned_nrpages(hpa, size);
5025        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5026                                 hpa >> VTD_PAGE_SHIFT, size, prot);
5027        return ret;
5028}
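/*
 * Illustrative sketch (not part of the driver): the rounding done by
 * aligned_nrpages() in intel_iommu_map() above, assuming 4KiB pages and
 * that the helper (defined earlier in this file) aligns the page offset of
 * hpa plus size up to the next page boundary.
 *
 *   hpa = 0x1800, size = 0x1000
 *   page offset of hpa         = 0x800
 *   PAGE_ALIGN(0x800 + 0x1000) = 0x2000
 *   pages mapped               = 0x2000 >> VTD_PAGE_SHIFT = 2
 *
 * i.e. a request that straddles a page boundary is widened to cover both
 * pages before domain_pfn_mapping() is called.
 */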
5029
5030static size_t intel_iommu_unmap(struct iommu_domain *domain,
5031                                unsigned long iova, size_t size)
5032{
5033        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5034        struct page *freelist = NULL;
5035        struct intel_iommu *iommu;
5036        unsigned long start_pfn, last_pfn;
5037        unsigned int npages;
5038        int iommu_id, level = 0;
5039
5040        /* Cope with horrid API which requires us to unmap more than the
5041           size argument if it happens to be a large-page mapping. */
5042        BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5043
5044        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5045                size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5046
5047        start_pfn = iova >> VTD_PAGE_SHIFT;
5048        last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5049
5050        freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5051
5052        npages = last_pfn - start_pfn + 1;
5053
5054        for_each_domain_iommu(iommu_id, dmar_domain) {
5055                iommu = g_iommus[iommu_id];
5056
5057                iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5058                                      start_pfn, npages, !freelist, 0);
5059        }
5060
5061        dma_free_pagelist(freelist);
5062
5063        if (dmar_domain->max_addr == iova + size)
5064                dmar_domain->max_addr = iova;
5065
5066        return size;
5067}
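/*
 * Illustrative sketch (not part of the driver): the size rounding in
 * intel_iommu_unmap() above for a hypothetical 4KiB unmap that lands inside
 * a 2MiB superpage, assuming level_to_offset_bits(level) is
 * (level - 1) * LEVEL_STRIDE as defined earlier in this file.
 *
 *   pfn_to_dma_pte() reports level = 2
 *   VTD_PAGE_SIZE << level_to_offset_bits(2) = 4KiB << 9 = 2MiB
 *
 * size is therefore widened from 4KiB to 2MiB, the whole superpage is
 * unmapped, and 512 pages worth of IOTLB entries are flushed.
 */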
5068
5069static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5070                                            dma_addr_t iova)
5071{
5072        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5073        struct dma_pte *pte;
5074        int level = 0;
5075        u64 phys = 0;
5076
5077        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5078        if (pte)
5079                phys = dma_pte_addr(pte);
5080
5081        return phys;
5082}
5083
5084static bool intel_iommu_capable(enum iommu_cap cap)
5085{
5086        if (cap == IOMMU_CAP_CACHE_COHERENCY)
5087                return domain_update_iommu_snooping(NULL) == 1;
5088        if (cap == IOMMU_CAP_INTR_REMAP)
5089                return irq_remapping_enabled == 1;
5090
5091        return false;
5092}
5093
5094static int intel_iommu_add_device(struct device *dev)
5095{
5096        struct intel_iommu *iommu;
5097        struct iommu_group *group;
5098        u8 bus, devfn;
5099
5100        iommu = device_to_iommu(dev, &bus, &devfn);
5101        if (!iommu)
5102                return -ENODEV;
5103
5104        iommu_device_link(iommu->iommu_dev, dev);
5105
5106        group = iommu_group_get_for_dev(dev);
5107
5108        if (IS_ERR(group))
5109                return PTR_ERR(group);
5110
5111        iommu_group_put(group);
5112        return 0;
5113}
5114
5115static void intel_iommu_remove_device(struct device *dev)
5116{
5117        struct intel_iommu *iommu;
5118        u8 bus, devfn;
5119
5120        iommu = device_to_iommu(dev, &bus, &devfn);
5121        if (!iommu)
5122                return;
5123
5124        iommu_group_remove_device(dev);
5125
5126        iommu_device_unlink(iommu->iommu_dev, dev);
5127}
5128
5129#ifdef CONFIG_INTEL_IOMMU_SVM
5130int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5131{
5132        struct device_domain_info *info;
5133        struct context_entry *context;
5134        struct dmar_domain *domain;
5135        unsigned long flags;
5136        u64 ctx_lo;
5137        int ret;
5138
5139        domain = get_valid_domain_for_dev(sdev->dev);
5140        if (!domain)
5141                return -EINVAL;
5142
5143        spin_lock_irqsave(&device_domain_lock, flags);
5144        spin_lock(&iommu->lock);
5145
5146        ret = -EINVAL;
5147        info = sdev->dev->archdata.iommu;
5148        if (!info || !info->pasid_supported)
5149                goto out;
5150
5151        context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5152        if (WARN_ON(!context))
5153                goto out;
5154
5155        ctx_lo = context[0].lo;
5156
5157        sdev->did = domain->iommu_did[iommu->seq_id];
5158        sdev->sid = PCI_DEVID(info->bus, info->devfn);
5159
5160        if (!(ctx_lo & CONTEXT_PASIDE)) {
5161                context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5162                context[1].lo = (u64)virt_to_phys(iommu->pasid_table) | ecap_pss(iommu->ecap);
5163                wmb();
5164                /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5165                 * extended to permit requests-with-PASID if the PASIDE bit
5166                 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
5167                 * however, the PASIDE bit is ignored and requests-with-PASID
5168                 * are unconditionally blocked, which makes less sense.
5169                 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5170                 * "guest mode" translation types depending on whether ATS
5171                 * is available or not. Annoyingly, we can't use the new
5172                 * modes *unless* PASIDE is set. */
5173                if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5174                        ctx_lo &= ~CONTEXT_TT_MASK;
5175                        if (info->ats_supported)
5176                                ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5177                        else
5178                                ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5179                }
5180                ctx_lo |= CONTEXT_PASIDE;
5181                if (iommu->pasid_state_table)
5182                        ctx_lo |= CONTEXT_DINVE;
5183                if (info->pri_supported)
5184                        ctx_lo |= CONTEXT_PRS;
5185                context[0].lo = ctx_lo;
5186                wmb();
5187                iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5188                                           DMA_CCMD_MASK_NOBIT,
5189                                           DMA_CCMD_DEVICE_INVL);
5190        }
5191
5192        /* Enable PASID support in the device, if it wasn't already */
5193        if (!info->pasid_enabled)
5194                iommu_enable_dev_iotlb(info);
5195
5196        if (info->ats_enabled) {
5197                sdev->dev_iotlb = 1;
5198                sdev->qdep = info->ats_qdep;
5199                if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5200                        sdev->qdep = 0;
5201        }
5202        ret = 0;
5203
5204 out:
5205        spin_unlock(&iommu->lock);
5206        spin_unlock_irqrestore(&device_domain_lock, flags);
5207
5208        return ret;
5209}
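/*
 * Illustrative sketch (not part of the driver): the translation-type
 * rewrite in intel_iommu_enable_pasid() above for a hypothetical
 * pass-through context entry, assuming the TT field sits at bits 3:2 as the
 * (CONTEXT_TT_* << 2) usage above implies.
 *
 *   ctx_lo TT field == CONTEXT_TT_PASS_THROUGH << 2
 *   device advertises ATS  ->  TT becomes CONTEXT_TT_PT_PASID_DEV_IOTLB << 2
 *   device lacks ATS       ->  TT becomes CONTEXT_TT_PT_PASID << 2
 *
 * In both cases CONTEXT_PASIDE is then set, so requests-with-PASID are
 * translated rather than unconditionally blocked, and the context cache is
 * invalidated for the device.
 */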
5210
5211struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5212{
5213        struct intel_iommu *iommu;
5214        u8 bus, devfn;
5215
5216        if (iommu_dummy(dev)) {
5217                dev_warn(dev,
5218                         "No IOMMU translation for device; cannot enable SVM\n");
5219                return NULL;
5220        }
5221
5222        iommu = device_to_iommu(dev, &bus, &devfn);
5223        if (!iommu) {
5224                dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5225                return NULL;
5226        }
5227
5228        if (!iommu->pasid_table) {
5229                dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5230                return NULL;
5231        }
5232
5233        return iommu;
5234}
5235#endif /* CONFIG_INTEL_IOMMU_SVM */
5236
5237static const struct iommu_ops intel_iommu_ops = {
5238        .capable        = intel_iommu_capable,
5239        .domain_alloc   = intel_iommu_domain_alloc,
5240        .domain_free    = intel_iommu_domain_free,
5241        .attach_dev     = intel_iommu_attach_device,
5242        .detach_dev     = intel_iommu_detach_device,
5243        .map            = intel_iommu_map,
5244        .unmap          = intel_iommu_unmap,
5245        .map_sg         = default_iommu_map_sg,
5246        .iova_to_phys   = intel_iommu_iova_to_phys,
5247        .add_device     = intel_iommu_add_device,
5248        .remove_device  = intel_iommu_remove_device,
5249        .device_group   = pci_device_group,
5250        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
5251};
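/*
 * Illustrative sketch (not part of the driver): how a kernel consumer might
 * exercise the callbacks above through the generic IOMMU API. The device
 * and page arguments are placeholders and error handling is minimal.
 */
#if 0
static int example_map_one_page(struct device *dev, struct page *page)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc(&pci_bus_type);	/* -> intel_iommu_domain_alloc() */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);		/* -> intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	/* Map one page at IOVA 0x100000; dispatches to intel_iommu_map(). */
	ret = iommu_map(domain, 0x100000, page_to_phys(page), PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;

	/* ... device DMA to IOVA 0x100000 would happen here ... */

	iommu_unmap(domain, 0x100000, PAGE_SIZE);	/* -> intel_iommu_unmap() */
out_detach:
	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);			/* -> intel_iommu_domain_free() */
	return ret;
}
#endif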
5252
5253static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5254{
5255        /* G4x/GM45 integrated gfx dmar support is totally busted. */
5256        pr_info("Disabling IOMMU for graphics on this chipset\n");
5257        dmar_map_gfx = 0;
5258}
5259
5260DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5261DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5262DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5263DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5264DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5265DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5266DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5267
5268static void quirk_iommu_rwbf(struct pci_dev *dev)
5269{
5270        /*
5271         * Mobile 4 Series Chipset neglects to set RWBF capability,
5272         * but needs it. Same seems to hold for the desktop versions.
5273         */
5274        pr_info("Forcing write-buffer flush capability\n");
5275        rwbf_quirk = 1;
5276}
5277
5278DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5279DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5280DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5281DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5282DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5283DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5284DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5285
5286#define GGC 0x52
5287#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5288#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5289#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5290#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5291#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5292#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5293#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5294#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5295
5296static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5297{
5298        unsigned short ggc;
5299
5300        if (pci_read_config_word(dev, GGC, &ggc))
5301                return;
5302
5303        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5304                pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5305                dmar_map_gfx = 0;
5306        } else if (dmar_map_gfx) {
5307                /* we have to ensure the gfx device is idle before we flush */
5308                pr_info("Disabling batched IOTLB flush on Ironlake\n");
5309                intel_iommu_strict = 1;
5310        }
5311}
5312DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5313DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5314DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5315DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
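/*
 * Illustrative sketch (not part of the driver): the GGC decoding in
 * quirk_calpella_no_shadow_gtt() above for two hypothetical register
 * values.
 *
 *   ggc == GGC_MEMORY_SIZE_2M    (0x3 << 8): GGC_MEMORY_VT_ENABLED is clear,
 *          so no VT-enabled shadow GTT was allocated and dmar_map_gfx is
 *          turned off.
 *   ggc == GGC_MEMORY_SIZE_2M_VT (0x9 << 8): the VT bit is set, graphics
 *          translation stays on, but batched IOTLB flushing is disabled by
 *          forcing intel_iommu_strict.
 */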
5316
5317/* On Tylersburg chipsets, some BIOSes have been known to enable the
5318   ISOCH DMAR unit for the Azalia sound device, but not give it any
5319   TLB entries, which causes it to deadlock. Check for that.  We do
5320   this in a function called from init_dmars(), instead of in a PCI
5321   quirk, because we don't want to print the obnoxious "BIOS broken"
5322   message if VT-d is actually disabled.
5323*/
5324static void __init check_tylersburg_isoch(void)
5325{
5326        struct pci_dev *pdev;
5327        uint32_t vtisochctrl;
5328
5329        /* If there's no Azalia in the system anyway, forget it. */
5330        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5331        if (!pdev)
5332                return;
5333        pci_dev_put(pdev);
5334
5335        /* System Management Registers. Might be hidden, in which case
5336           we can't do the sanity check. But that's OK, because the
5337           known-broken BIOSes _don't_ actually hide it, so far. */
5338        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5339        if (!pdev)
5340                return;
5341
5342        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5343                pci_dev_put(pdev);
5344                return;
5345        }
5346
5347        pci_dev_put(pdev);
5348
5349        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5350        if (vtisochctrl & 1)
5351                return;
5352
5353        /* Drop all bits other than the number of TLB entries */
5354        vtisochctrl &= 0x1c;
5355
5356        /* If we have the recommended number of TLB entries (16), fine. */
5357        if (vtisochctrl == 0x10)
5358                return;
5359
5360        /* Zero TLB entries? You get to ride the short bus to school. */
5361        if (!vtisochctrl) {
5362                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5363                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5364                     dmi_get_system_info(DMI_BIOS_VENDOR),
5365                     dmi_get_system_info(DMI_BIOS_VERSION),
5366                     dmi_get_system_info(DMI_PRODUCT_VERSION));
5367                iommu_identity_mapping |= IDENTMAP_AZALIA;
5368                return;
5369        }
5370
5371        pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5372                vtisochctrl);
5373}
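/*
 * Illustrative sketch (not part of the driver): the VTISOCHCTRL decoding in
 * check_tylersburg_isoch() above for two hypothetical register values.
 *
 *   vtisochctrl == 0x11: bit 0 is set, Azalia DMA is routed to the
 *                        non-isoch DMAR unit, and the check bails out early.
 *   vtisochctrl == 0x00: bit 0 is clear and (val & 0x1c) == 0, i.e. no TLB
 *                        entries for the isoch unit; the WARN fires and
 *                        IDENTMAP_AZALIA is forced on so Azalia gets an
 *                        identity mapping.
 */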
5374