linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/export.h>
  28#include <linux/slab.h>
  29#include <linux/irq.h>
  30#include <linux/interrupt.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/syscore_ops.h>
  41#include <linux/tboot.h>
  42#include <linux/dmi.h>
  43#include <linux/pci-ats.h>
  44#include <linux/memblock.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#define ROOT_SIZE               VTD_PAGE_SIZE
  50#define CONTEXT_SIZE            VTD_PAGE_SIZE
  51
  52#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  53#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  54#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  55
  56#define IOAPIC_RANGE_START      (0xfee00000)
  57#define IOAPIC_RANGE_END        (0xfeefffff)
  58#define IOVA_START_ADDR         (0x1000)
  59
  60#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  61
  62#define MAX_AGAW_WIDTH 64
  63
  64#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  65#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  66
  67/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  68   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  69#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  70                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  71#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
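/*
 * Worked example (illustrative, assuming VTD_PAGE_SHIFT == 12): for the
 * default 48-bit guest address width, __DOMAIN_MAX_PFN(48) is 2^36 - 1 and
 * DOMAIN_MAX_ADDR(48) is (2^36 - 1) << 12, i.e. just under 256TiB.  On a
 * 32-bit kernel the min_t() above clamps DOMAIN_MAX_PFN to ULONG_MAX so the
 * PFN still fits in an unsigned long.
 */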
  72
  73#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  74#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  75#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  76
  77/* page table handling */
  78#define LEVEL_STRIDE            (9)
  79#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  80
  81/*
   82 * This bitmap is used to advertise the page sizes our hardware supports
  83 * to the IOMMU core, which will then use this information to split
  84 * physically contiguous memory regions it is mapping into page sizes
  85 * that we support.
  86 *
  87 * Traditionally the IOMMU core just handed us the mappings directly,
  88 * after making sure the size is an order of a 4KiB page and that the
  89 * mapping has natural alignment.
  90 *
  91 * To retain this behavior, we currently advertise that we support
  92 * all page sizes that are an order of 4KiB.
  93 *
  94 * If at some point we'd like to utilize the IOMMU core's new behavior,
  95 * we could change this to advertise the real page sizes we support.
  96 */
  97#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
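/*
 * Illustration: ~0xFFFUL clears bits 0..11 and sets every bit from bit 12
 * upwards, so the bitmap advertises 4KiB, 8KiB, 16KiB and every larger
 * power-of-two size, matching the "all orders of 4KiB" behaviour described
 * in the comment above.
 */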
  98
  99static inline int agaw_to_level(int agaw)
 100{
 101        return agaw + 2;
 102}
 103
 104static inline int agaw_to_width(int agaw)
 105{
 106        return 30 + agaw * LEVEL_STRIDE;
 107}
 108
 109static inline int width_to_agaw(int width)
 110{
 111        return (width - 30) / LEVEL_STRIDE;
 112}
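/*
 * Example of the agaw/width/level relationship encoded above (each level
 * resolves LEVEL_STRIDE == 9 bits on top of the 12-bit page offset):
 * agaw 1 -> 39-bit width -> 3-level table, agaw 2 -> 48-bit width ->
 * 4-level table, agaw 3 -> 57-bit width -> 5-level table.
 */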
 113
 114static inline unsigned int level_to_offset_bits(int level)
 115{
 116        return (level - 1) * LEVEL_STRIDE;
 117}
 118
 119static inline int pfn_level_offset(unsigned long pfn, int level)
 120{
 121        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 122}
 123
 124static inline unsigned long level_mask(int level)
 125{
 126        return -1UL << level_to_offset_bits(level);
 127}
 128
 129static inline unsigned long level_size(int level)
 130{
 131        return 1UL << level_to_offset_bits(level);
 132}
 133
 134static inline unsigned long align_to_level(unsigned long pfn, int level)
 135{
 136        return (pfn + level_size(level) - 1) & level_mask(level);
 137}
 138
 139static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 140{
 141        return  1 << ((lvl - 1) * LEVEL_STRIDE);
 142}
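/*
 * Example: a level-1 leaf covers a single 4KiB VT-d page, a level-2 entry
 * (2MiB superpage) covers 512 of them, and a level-3 entry (1GiB) covers
 * 512 * 512 = 262144 pages; in general lvl_to_nr_pages(lvl) == 512^(lvl-1).
 */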
 143
  144/* VT-d pages must never be _larger_ than MM pages. Otherwise things
 145   are never going to work. */
 146static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 147{
 148        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 149}
 150
 151static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 152{
 153        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 154}
 155static inline unsigned long page_to_dma_pfn(struct page *pg)
 156{
 157        return mm_to_dma_pfn(page_to_pfn(pg));
 158}
 159static inline unsigned long virt_to_dma_pfn(void *p)
 160{
 161        return page_to_dma_pfn(virt_to_page(p));
 162}
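/*
 * Note: with the usual 4KiB kernel pages PAGE_SHIFT == VTD_PAGE_SHIFT, so
 * the conversions above are shifts by zero; on a kernel built with larger
 * MM pages (say a hypothetical 64KiB PAGE_SIZE) one mm pfn corresponds to
 * 16 consecutive dma pfns, which is why VT-d pages must never be larger
 * than MM pages.
 */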
 163
 164/* global iommu list, set NULL for ignored DMAR units */
 165static struct intel_iommu **g_iommus;
 166
 167static void __init check_tylersburg_isoch(void);
 168static int rwbf_quirk;
 169
 170/*
 171 * set to 1 to panic kernel if can't successfully enable VT-d
 172 * (used when kernel is launched w/ TXT)
 173 */
 174static int force_on = 0;
 175
 176/*
 177 * 0: Present
 178 * 1-11: Reserved
 179 * 12-63: Context Ptr (12 - (haw-1))
 180 * 64-127: Reserved
 181 */
 182struct root_entry {
 183        u64     val;
 184        u64     rsvd1;
 185};
 186#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 187static inline bool root_present(struct root_entry *root)
 188{
 189        return (root->val & 1);
 190}
 191static inline void set_root_present(struct root_entry *root)
 192{
 193        root->val |= 1;
 194}
 195static inline void set_root_value(struct root_entry *root, unsigned long value)
 196{
 197        root->val |= value & VTD_PAGE_MASK;
 198}
 199
 200static inline struct context_entry *
 201get_context_addr_from_root(struct root_entry *root)
 202{
 203        return (struct context_entry *)
 204                (root_present(root)?phys_to_virt(
 205                root->val & VTD_PAGE_MASK) :
 206                NULL);
 207}
 208
 209/*
 210 * low 64 bits:
 211 * 0: present
 212 * 1: fault processing disable
 213 * 2-3: translation type
 214 * 12-63: address space root
 215 * high 64 bits:
 216 * 0-2: address width
 217 * 3-6: aval
 218 * 8-23: domain id
 219 */
 220struct context_entry {
 221        u64 lo;
 222        u64 hi;
 223};
 224
 225static inline bool context_present(struct context_entry *context)
 226{
 227        return (context->lo & 1);
 228}
 229static inline void context_set_present(struct context_entry *context)
 230{
 231        context->lo |= 1;
 232}
 233
 234static inline void context_set_fault_enable(struct context_entry *context)
 235{
 236        context->lo &= (((u64)-1) << 2) | 1;
 237}
 238
 239static inline void context_set_translation_type(struct context_entry *context,
 240                                                unsigned long value)
 241{
 242        context->lo &= (((u64)-1) << 4) | 3;
 243        context->lo |= (value & 3) << 2;
 244}
 245
 246static inline void context_set_address_root(struct context_entry *context,
 247                                            unsigned long value)
 248{
 249        context->lo |= value & VTD_PAGE_MASK;
 250}
 251
 252static inline void context_set_address_width(struct context_entry *context,
 253                                             unsigned long value)
 254{
 255        context->hi |= value & 7;
 256}
 257
 258static inline void context_set_domain_id(struct context_entry *context,
 259                                         unsigned long value)
 260{
 261        context->hi |= (value & ((1 << 16) - 1)) << 8;
 262}
 263
 264static inline void context_clear_entry(struct context_entry *context)
 265{
 266        context->lo = 0;
 267        context->hi = 0;
 268}
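/*
 * Illustrative programming order for a context entry; this mirrors what
 * domain_context_mapping_one() later in this file does and is shown here
 * only to tie the accessors above together:
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 *
 * followed by a cache flush of the entry so non-coherent hardware sees the
 * fully formed descriptor.
 */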
 269
 270/*
 271 * 0: readable
 272 * 1: writable
 273 * 2-6: reserved
 274 * 7: super page
 275 * 8-10: available
 276 * 11: snoop behavior
  277 * 12-63: Host physical address
 278 */
 279struct dma_pte {
 280        u64 val;
 281};
 282
 283static inline void dma_clear_pte(struct dma_pte *pte)
 284{
 285        pte->val = 0;
 286}
 287
 288static inline void dma_set_pte_readable(struct dma_pte *pte)
 289{
 290        pte->val |= DMA_PTE_READ;
 291}
 292
 293static inline void dma_set_pte_writable(struct dma_pte *pte)
 294{
 295        pte->val |= DMA_PTE_WRITE;
 296}
 297
 298static inline void dma_set_pte_snp(struct dma_pte *pte)
 299{
 300        pte->val |= DMA_PTE_SNP;
 301}
 302
 303static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 304{
 305        pte->val = (pte->val & ~3) | (prot & 3);
 306}
 307
 308static inline u64 dma_pte_addr(struct dma_pte *pte)
 309{
 310#ifdef CONFIG_64BIT
 311        return pte->val & VTD_PAGE_MASK;
 312#else
 313        /* Must have a full atomic 64-bit read */
 314        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 315#endif
 316}
 317
 318static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 319{
 320        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 321}
 322
 323static inline bool dma_pte_present(struct dma_pte *pte)
 324{
 325        return (pte->val & 3) != 0;
 326}
 327
 328static inline bool dma_pte_superpage(struct dma_pte *pte)
 329{
 330        return (pte->val & (1 << 7));
 331}
 332
 333static inline int first_pte_in_page(struct dma_pte *pte)
 334{
 335        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 336}
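/*
 * Example: a present, read/write leaf for a dma pfn can be built with the
 * helpers above, e.g. dma_set_pte_pfn() plus dma_set_pte_readable() and
 * dma_set_pte_writable(); dma_pte_present() then reports true because at
 * least one of the two low (read/write) bits is set.
 */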
 337
 338/*
 339 * This domain is a statically identity mapping domain.
  340 *      1. This domain creates a static 1:1 mapping to all usable memory.
  341 *      2. It maps to each iommu if successful.
  342 *      3. Each iommu maps to this domain if successful.
 343 */
 344static struct dmar_domain *si_domain;
 345static int hw_pass_through = 1;
 346
 347/* devices under the same p2p bridge are owned in one domain */
 348#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 349
  350/* domain represents a virtual machine; more than one device
  351 * across iommus may be owned by one domain, e.g. a kvm guest.
 352 */
 353#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 354
  355/* si_domain contains multiple devices */
 356#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 357
 358/* define the limit of IOMMUs supported in each domain */
 359#ifdef  CONFIG_X86
 360# define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
 361#else
 362# define        IOMMU_UNITS_SUPPORTED   64
 363#endif
 364
 365struct dmar_domain {
 366        int     id;                     /* domain id */
 367        int     nid;                    /* node id */
 368        DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
 369                                        /* bitmap of iommus this domain uses*/
 370
 371        struct list_head devices;       /* all devices' list */
 372        struct iova_domain iovad;       /* iova's that belong to this domain */
 373
 374        struct dma_pte  *pgd;           /* virtual address */
 375        int             gaw;            /* max guest address width */
 376
 377        /* adjusted guest address width, 0 is level 2 30-bit */
 378        int             agaw;
 379
 380        int             flags;          /* flags to find out type of domain */
 381
 382        int             iommu_coherency;/* indicate coherency of iommu access */
 383        int             iommu_snooping; /* indicate snooping control feature*/
 384        int             iommu_count;    /* reference count of iommu */
 385        int             iommu_superpage;/* Level of superpages supported:
 386                                           0 == 4KiB (no superpages), 1 == 2MiB,
  387                                           2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
 388        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 389        u64             max_addr;       /* maximum mapped address */
 390};
 391
 392/* PCI domain-device relationship */
 393struct device_domain_info {
 394        struct list_head link;  /* link to domain siblings */
 395        struct list_head global; /* link to global list */
 396        int segment;            /* PCI domain */
 397        u8 bus;                 /* PCI bus number */
 398        u8 devfn;               /* PCI devfn number */
 399        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 400        struct intel_iommu *iommu; /* IOMMU used by this device */
 401        struct dmar_domain *domain; /* pointer to domain */
 402};
 403
 404static void flush_unmaps_timeout(unsigned long data);
 405
 406DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 407
 408#define HIGH_WATER_MARK 250
 409struct deferred_flush_tables {
 410        int next;
 411        struct iova *iova[HIGH_WATER_MARK];
 412        struct dmar_domain *domain[HIGH_WATER_MARK];
 413};
 414
 415static struct deferred_flush_tables *deferred_flush;
 416
  417/* number of iommus; bounds g_iommus and the per-domain iommu bitmaps */
 418static int g_num_of_iommus;
 419
 420static DEFINE_SPINLOCK(async_umap_flush_lock);
 421static LIST_HEAD(unmaps_to_do);
 422
 423static int timer_on;
 424static long list_size;
 425
 426static void domain_remove_dev_info(struct dmar_domain *domain);
 427
 428#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 429int dmar_disabled = 0;
 430#else
 431int dmar_disabled = 1;
 432#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 433
 434int intel_iommu_enabled = 0;
 435EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 436
 437static int dmar_map_gfx = 1;
 438static int dmar_forcedac;
 439static int intel_iommu_strict;
 440static int intel_iommu_superpage = 1;
 441
 442int intel_iommu_gfx_mapped;
 443EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 444
 445#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 446static DEFINE_SPINLOCK(device_domain_lock);
 447static LIST_HEAD(device_domain_list);
 448
 449static struct iommu_ops intel_iommu_ops;
 450
 451static int __init intel_iommu_setup(char *str)
 452{
 453        if (!str)
 454                return -EINVAL;
 455        while (*str) {
 456                if (!strncmp(str, "on", 2)) {
 457                        dmar_disabled = 0;
 458                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 459                } else if (!strncmp(str, "off", 3)) {
 460                        dmar_disabled = 1;
 461                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 462                } else if (!strncmp(str, "igfx_off", 8)) {
 463                        dmar_map_gfx = 0;
 464                        printk(KERN_INFO
 465                                "Intel-IOMMU: disable GFX device mapping\n");
 466                } else if (!strncmp(str, "forcedac", 8)) {
 467                        printk(KERN_INFO
 468                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 469                        dmar_forcedac = 1;
 470                } else if (!strncmp(str, "strict", 6)) {
 471                        printk(KERN_INFO
 472                                "Intel-IOMMU: disable batched IOTLB flush\n");
 473                        intel_iommu_strict = 1;
 474                } else if (!strncmp(str, "sp_off", 6)) {
 475                        printk(KERN_INFO
 476                                "Intel-IOMMU: disable supported super page\n");
 477                        intel_iommu_superpage = 0;
 478                }
 479
 480                str += strcspn(str, ",");
 481                while (*str == ',')
 482                        str++;
 483        }
 484        return 0;
 485}
 486__setup("intel_iommu=", intel_iommu_setup);
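/*
 * Example command lines accepted by the parser above (illustrative only):
 *
 *	intel_iommu=on
 *	intel_iommu=on,strict,sp_off
 *	intel_iommu=igfx_off,forcedac
 *
 * Tokens are matched with strncmp() and separated by commas; unrecognized
 * tokens are skipped without warning.
 */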
 487
 488static struct kmem_cache *iommu_domain_cache;
 489static struct kmem_cache *iommu_devinfo_cache;
 490static struct kmem_cache *iommu_iova_cache;
 491
 492static inline void *alloc_pgtable_page(int node)
 493{
 494        struct page *page;
 495        void *vaddr = NULL;
 496
 497        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 498        if (page)
 499                vaddr = page_address(page);
 500        return vaddr;
 501}
 502
 503static inline void free_pgtable_page(void *vaddr)
 504{
 505        free_page((unsigned long)vaddr);
 506}
 507
 508static inline void *alloc_domain_mem(void)
 509{
 510        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 511}
 512
 513static void free_domain_mem(void *vaddr)
 514{
 515        kmem_cache_free(iommu_domain_cache, vaddr);
 516}
 517
 518static inline void * alloc_devinfo_mem(void)
 519{
 520        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 521}
 522
 523static inline void free_devinfo_mem(void *vaddr)
 524{
 525        kmem_cache_free(iommu_devinfo_cache, vaddr);
 526}
 527
 528struct iova *alloc_iova_mem(void)
 529{
 530        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 531}
 532
 533void free_iova_mem(struct iova *iova)
 534{
 535        kmem_cache_free(iommu_iova_cache, iova);
 536}
 537
 538
 539static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 540{
 541        unsigned long sagaw;
 542        int agaw = -1;
 543
 544        sagaw = cap_sagaw(iommu->cap);
 545        for (agaw = width_to_agaw(max_gaw);
 546             agaw >= 0; agaw--) {
 547                if (test_bit(agaw, &sagaw))
 548                        break;
 549        }
 550
 551        return agaw;
 552}
 553
 554/*
 555 * Calculate max SAGAW for each iommu.
 556 */
 557int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 558{
 559        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 560}
 561
 562/*
 563 * calculate agaw for each iommu.
  564 * "SAGAW" may be different across iommus: start from a default agaw and
  565 * fall back to a smaller supported agaw for iommus that don't support the default.
 566 */
 567int iommu_calculate_agaw(struct intel_iommu *iommu)
 568{
 569        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 570}
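/*
 * Example: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48, width_to_agaw(48) is 2,
 * so __iommu_calculate_agaw() first tests SAGAW bit 2 (4-level tables); if
 * the unit only reports bit 1 it falls back to agaw 1, i.e. a 39-bit,
 * 3-level page table.
 */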
 571
  572/* This function only returns a single iommu in a domain */
 573static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 574{
 575        int iommu_id;
 576
 577        /* si_domain and vm domain should not get here. */
 578        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 579        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 580
 581        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 582        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 583                return NULL;
 584
 585        return g_iommus[iommu_id];
 586}
 587
 588static void domain_update_iommu_coherency(struct dmar_domain *domain)
 589{
 590        int i;
 591
 592        i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 593
 594        domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
 595
 596        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 597                if (!ecap_coherent(g_iommus[i]->ecap)) {
 598                        domain->iommu_coherency = 0;
 599                        break;
 600                }
 601        }
 602}
 603
 604static void domain_update_iommu_snooping(struct dmar_domain *domain)
 605{
 606        int i;
 607
 608        domain->iommu_snooping = 1;
 609
 610        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 611                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 612                        domain->iommu_snooping = 0;
 613                        break;
 614                }
 615        }
 616}
 617
 618static void domain_update_iommu_superpage(struct dmar_domain *domain)
 619{
 620        struct dmar_drhd_unit *drhd;
 621        struct intel_iommu *iommu = NULL;
 622        int mask = 0xf;
 623
 624        if (!intel_iommu_superpage) {
 625                domain->iommu_superpage = 0;
 626                return;
 627        }
 628
 629        /* set iommu_superpage to the smallest common denominator */
 630        for_each_active_iommu(iommu, drhd) {
 631                mask &= cap_super_page_val(iommu->cap);
 632                if (!mask) {
 633                        break;
 634                }
 635        }
 636        domain->iommu_superpage = fls(mask);
 637}
 638
 639/* Some capabilities may be different across iommus */
 640static void domain_update_iommu_cap(struct dmar_domain *domain)
 641{
 642        domain_update_iommu_coherency(domain);
 643        domain_update_iommu_snooping(domain);
 644        domain_update_iommu_superpage(domain);
 645}
 646
 647static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 648{
 649        struct dmar_drhd_unit *drhd = NULL;
 650        int i;
 651
 652        for_each_drhd_unit(drhd) {
 653                if (drhd->ignored)
 654                        continue;
 655                if (segment != drhd->segment)
 656                        continue;
 657
 658                for (i = 0; i < drhd->devices_cnt; i++) {
 659                        if (drhd->devices[i] &&
 660                            drhd->devices[i]->bus->number == bus &&
 661                            drhd->devices[i]->devfn == devfn)
 662                                return drhd->iommu;
 663                        if (drhd->devices[i] &&
 664                            drhd->devices[i]->subordinate &&
 665                            drhd->devices[i]->subordinate->number <= bus &&
 666                            drhd->devices[i]->subordinate->busn_res.end >= bus)
 667                                return drhd->iommu;
 668                }
 669
 670                if (drhd->include_all)
 671                        return drhd->iommu;
 672        }
 673
 674        return NULL;
 675}
 676
 677static void domain_flush_cache(struct dmar_domain *domain,
 678                               void *addr, int size)
 679{
 680        if (!domain->iommu_coherency)
 681                clflush_cache_range(addr, size);
 682}
 683
 684/* Gets context entry for a given bus and devfn */
 685static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 686                u8 bus, u8 devfn)
 687{
 688        struct root_entry *root;
 689        struct context_entry *context;
 690        unsigned long phy_addr;
 691        unsigned long flags;
 692
 693        spin_lock_irqsave(&iommu->lock, flags);
 694        root = &iommu->root_entry[bus];
 695        context = get_context_addr_from_root(root);
 696        if (!context) {
 697                context = (struct context_entry *)
 698                                alloc_pgtable_page(iommu->node);
 699                if (!context) {
 700                        spin_unlock_irqrestore(&iommu->lock, flags);
 701                        return NULL;
 702                }
 703                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 704                phy_addr = virt_to_phys((void *)context);
 705                set_root_value(root, phy_addr);
 706                set_root_present(root);
 707                __iommu_flush_cache(iommu, root, sizeof(*root));
 708        }
 709        spin_unlock_irqrestore(&iommu->lock, flags);
 710        return &context[devfn];
 711}
 712
 713static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 714{
 715        struct root_entry *root;
 716        struct context_entry *context;
 717        int ret;
 718        unsigned long flags;
 719
 720        spin_lock_irqsave(&iommu->lock, flags);
 721        root = &iommu->root_entry[bus];
 722        context = get_context_addr_from_root(root);
 723        if (!context) {
 724                ret = 0;
 725                goto out;
 726        }
 727        ret = context_present(&context[devfn]);
 728out:
 729        spin_unlock_irqrestore(&iommu->lock, flags);
 730        return ret;
 731}
 732
 733static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 734{
 735        struct root_entry *root;
 736        struct context_entry *context;
 737        unsigned long flags;
 738
 739        spin_lock_irqsave(&iommu->lock, flags);
 740        root = &iommu->root_entry[bus];
 741        context = get_context_addr_from_root(root);
 742        if (context) {
 743                context_clear_entry(&context[devfn]);
 744                __iommu_flush_cache(iommu, &context[devfn], \
 745                        sizeof(*context));
 746        }
 747        spin_unlock_irqrestore(&iommu->lock, flags);
 748}
 749
 750static void free_context_table(struct intel_iommu *iommu)
 751{
 752        struct root_entry *root;
 753        int i;
 754        unsigned long flags;
 755        struct context_entry *context;
 756
 757        spin_lock_irqsave(&iommu->lock, flags);
 758        if (!iommu->root_entry) {
 759                goto out;
 760        }
 761        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 762                root = &iommu->root_entry[i];
 763                context = get_context_addr_from_root(root);
 764                if (context)
 765                        free_pgtable_page(context);
 766        }
 767        free_pgtable_page(iommu->root_entry);
 768        iommu->root_entry = NULL;
 769out:
 770        spin_unlock_irqrestore(&iommu->lock, flags);
 771}
 772
 773static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 774                                      unsigned long pfn, int target_level)
 775{
 776        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 777        struct dma_pte *parent, *pte = NULL;
 778        int level = agaw_to_level(domain->agaw);
 779        int offset;
 780
 781        BUG_ON(!domain->pgd);
 782        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 783        parent = domain->pgd;
 784
 785        while (level > 0) {
 786                void *tmp_page;
 787
 788                offset = pfn_level_offset(pfn, level);
 789                pte = &parent[offset];
 790                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 791                        break;
 792                if (level == target_level)
 793                        break;
 794
 795                if (!dma_pte_present(pte)) {
 796                        uint64_t pteval;
 797
 798                        tmp_page = alloc_pgtable_page(domain->nid);
 799
 800                        if (!tmp_page)
 801                                return NULL;
 802
 803                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 804                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 805                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 806                                /* Someone else set it while we were thinking; use theirs. */
 807                                free_pgtable_page(tmp_page);
 808                        } else {
 809                                dma_pte_addr(pte);
 810                                domain_flush_cache(domain, pte, sizeof(*pte));
 811                        }
 812                }
 813                parent = phys_to_virt(dma_pte_addr(pte));
 814                level--;
 815        }
 816
 817        return pte;
 818}
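/*
 * Walk example: for a 4-level table (agaw 2) and target_level 1, the pfn is
 * decoded as bits [35:27], [26:18], [17:9] and [8:0], one 9-bit index per
 * level from the pgd down to the leaf.  Passing target_level == 0 stops at
 * whatever level the walk bottoms out on (a superpage or a non-present
 * entry) without ever allocating intermediate tables, which suits callers
 * that only want to inspect an existing mapping.
 */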
 819
 820
 821/* return address's pte at specific level */
 822static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 823                                         unsigned long pfn,
 824                                         int level, int *large_page)
 825{
 826        struct dma_pte *parent, *pte = NULL;
 827        int total = agaw_to_level(domain->agaw);
 828        int offset;
 829
 830        parent = domain->pgd;
 831        while (level <= total) {
 832                offset = pfn_level_offset(pfn, total);
 833                pte = &parent[offset];
 834                if (level == total)
 835                        return pte;
 836
 837                if (!dma_pte_present(pte)) {
 838                        *large_page = total;
 839                        break;
 840                }
 841
 842                if (pte->val & DMA_PTE_LARGE_PAGE) {
 843                        *large_page = total;
 844                        return pte;
 845                }
 846
 847                parent = phys_to_virt(dma_pte_addr(pte));
 848                total--;
 849        }
 850        return NULL;
 851}
 852
  853/* clear last level pte; a tlb flush should follow */
 854static int dma_pte_clear_range(struct dmar_domain *domain,
 855                                unsigned long start_pfn,
 856                                unsigned long last_pfn)
 857{
 858        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 859        unsigned int large_page = 1;
 860        struct dma_pte *first_pte, *pte;
 861        int order;
 862
 863        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 864        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 865        BUG_ON(start_pfn > last_pfn);
 866
 867        /* we don't need lock here; nobody else touches the iova range */
 868        do {
 869                large_page = 1;
 870                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 871                if (!pte) {
 872                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 873                        continue;
 874                }
 875                do {
 876                        dma_clear_pte(pte);
 877                        start_pfn += lvl_to_nr_pages(large_page);
 878                        pte++;
 879                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 880
 881                domain_flush_cache(domain, first_pte,
 882                                   (void *)pte - (void *)first_pte);
 883
 884        } while (start_pfn && start_pfn <= last_pfn);
 885
 886        order = (large_page - 1) * 9;
 887        return order;
 888}
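/*
 * The order returned above tells the caller how large the last PTE cleared
 * was: large_page 1 gives order 0 (a 4KiB mapping), large_page 2 gives
 * order 9 (a 2MiB superpage), large_page 3 gives order 18 (1GiB), so the
 * caller can account for clearing a whole superpage even when a smaller
 * range was requested.
 */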
 889
 890/* free page table pages. last level pte should already be cleared */
 891static void dma_pte_free_pagetable(struct dmar_domain *domain,
 892                                   unsigned long start_pfn,
 893                                   unsigned long last_pfn)
 894{
 895        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 896        struct dma_pte *first_pte, *pte;
 897        int total = agaw_to_level(domain->agaw);
 898        int level;
 899        unsigned long tmp;
 900        int large_page = 2;
 901
 902        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 903        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 904        BUG_ON(start_pfn > last_pfn);
 905
 906        /* We don't need lock here; nobody else touches the iova range */
 907        level = 2;
 908        while (level <= total) {
 909                tmp = align_to_level(start_pfn, level);
 910
 911                /* If we can't even clear one PTE at this level, we're done */
 912                if (tmp + level_size(level) - 1 > last_pfn)
 913                        return;
 914
 915                do {
 916                        large_page = level;
 917                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
 918                        if (large_page > level)
 919                                level = large_page + 1;
 920                        if (!pte) {
 921                                tmp = align_to_level(tmp + 1, level + 1);
 922                                continue;
 923                        }
 924                        do {
 925                                if (dma_pte_present(pte)) {
 926                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 927                                        dma_clear_pte(pte);
 928                                }
 929                                pte++;
 930                                tmp += level_size(level);
 931                        } while (!first_pte_in_page(pte) &&
 932                                 tmp + level_size(level) - 1 <= last_pfn);
 933
 934                        domain_flush_cache(domain, first_pte,
 935                                           (void *)pte - (void *)first_pte);
  936
 937                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 938                level++;
 939        }
 940        /* free pgd */
 941        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 942                free_pgtable_page(domain->pgd);
 943                domain->pgd = NULL;
 944        }
 945}
 946
 947/* iommu handling */
 948static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 949{
 950        struct root_entry *root;
 951        unsigned long flags;
 952
 953        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 954        if (!root)
 955                return -ENOMEM;
 956
 957        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 958
 959        spin_lock_irqsave(&iommu->lock, flags);
 960        iommu->root_entry = root;
 961        spin_unlock_irqrestore(&iommu->lock, flags);
 962
 963        return 0;
 964}
 965
 966static void iommu_set_root_entry(struct intel_iommu *iommu)
 967{
 968        void *addr;
 969        u32 sts;
 970        unsigned long flag;
 971
 972        addr = iommu->root_entry;
 973
 974        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 975        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 976
 977        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 978
  979        /* Make sure the hardware completes it */
 980        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 981                      readl, (sts & DMA_GSTS_RTPS), sts);
 982
 983        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 984}
 985
 986static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 987{
 988        u32 val;
 989        unsigned long flag;
 990
 991        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 992                return;
 993
 994        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 995        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 996
  997        /* Make sure the hardware completes it */
 998        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 999                      readl, (!(val & DMA_GSTS_WBFS)), val);
1000
1001        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1002}
1003
 1004/* return value determines if we need a write buffer flush */
1005static void __iommu_flush_context(struct intel_iommu *iommu,
1006                                  u16 did, u16 source_id, u8 function_mask,
1007                                  u64 type)
1008{
1009        u64 val = 0;
1010        unsigned long flag;
1011
1012        switch (type) {
1013        case DMA_CCMD_GLOBAL_INVL:
1014                val = DMA_CCMD_GLOBAL_INVL;
1015                break;
1016        case DMA_CCMD_DOMAIN_INVL:
1017                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1018                break;
1019        case DMA_CCMD_DEVICE_INVL:
1020                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1021                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1022                break;
1023        default:
1024                BUG();
1025        }
1026        val |= DMA_CCMD_ICC;
1027
1028        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1029        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1030
 1031        /* Make sure the hardware completes it */
1032        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1033                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1034
1035        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1036}
1037
 1038/* return value determines if we need a write buffer flush */
1039static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1040                                u64 addr, unsigned int size_order, u64 type)
1041{
1042        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1043        u64 val = 0, val_iva = 0;
1044        unsigned long flag;
1045
1046        switch (type) {
1047        case DMA_TLB_GLOBAL_FLUSH:
 1048                /* global flush doesn't need to set IVA_REG */
1049                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1050                break;
1051        case DMA_TLB_DSI_FLUSH:
1052                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053                break;
1054        case DMA_TLB_PSI_FLUSH:
1055                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1056                /* Note: always flush non-leaf currently */
1057                val_iva = size_order | addr;
1058                break;
1059        default:
1060                BUG();
1061        }
1062        /* Note: set drain read/write */
1063#if 0
1064        /*
 1065         * This is probably only needed to be extra safe; it looks like we
 1066         * can ignore it without any impact.
1067         */
1068        if (cap_read_drain(iommu->cap))
1069                val |= DMA_TLB_READ_DRAIN;
1070#endif
1071        if (cap_write_drain(iommu->cap))
1072                val |= DMA_TLB_WRITE_DRAIN;
1073
1074        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1075        /* Note: Only uses first TLB reg currently */
1076        if (val_iva)
1077                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1078        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1079
 1080        /* Make sure the hardware completes it */
1081        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1082                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1083
1084        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1085
1086        /* check IOTLB invalidation granularity */
1087        if (DMA_TLB_IAIG(val) == 0)
1088                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1089        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1090                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1091                        (unsigned long long)DMA_TLB_IIRG(type),
1092                        (unsigned long long)DMA_TLB_IAIG(val));
1093}
1094
1095static struct device_domain_info *iommu_support_dev_iotlb(
1096        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1097{
1098        int found = 0;
1099        unsigned long flags;
1100        struct device_domain_info *info;
1101        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1102
1103        if (!ecap_dev_iotlb_support(iommu->ecap))
1104                return NULL;
1105
1106        if (!iommu->qi)
1107                return NULL;
1108
1109        spin_lock_irqsave(&device_domain_lock, flags);
1110        list_for_each_entry(info, &domain->devices, link)
1111                if (info->bus == bus && info->devfn == devfn) {
1112                        found = 1;
1113                        break;
1114                }
1115        spin_unlock_irqrestore(&device_domain_lock, flags);
1116
1117        if (!found || !info->dev)
1118                return NULL;
1119
1120        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1121                return NULL;
1122
1123        if (!dmar_find_matched_atsr_unit(info->dev))
1124                return NULL;
1125
1126        info->iommu = iommu;
1127
1128        return info;
1129}
1130
1131static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1132{
1133        if (!info)
1134                return;
1135
1136        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1137}
1138
1139static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1140{
1141        if (!info->dev || !pci_ats_enabled(info->dev))
1142                return;
1143
1144        pci_disable_ats(info->dev);
1145}
1146
1147static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1148                                  u64 addr, unsigned mask)
1149{
1150        u16 sid, qdep;
1151        unsigned long flags;
1152        struct device_domain_info *info;
1153
1154        spin_lock_irqsave(&device_domain_lock, flags);
1155        list_for_each_entry(info, &domain->devices, link) {
1156                if (!info->dev || !pci_ats_enabled(info->dev))
1157                        continue;
1158
1159                sid = info->bus << 8 | info->devfn;
1160                qdep = pci_ats_queue_depth(info->dev);
1161                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1162        }
1163        spin_unlock_irqrestore(&device_domain_lock, flags);
1164}
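/*
 * Example of the source-id encoding used above: a device at bus 0x1a,
 * devfn 0x10 (slot 2, function 0) yields sid 0x1a10, the same 16-bit
 * requester-id format carried by the PCIe ATS invalidate request.
 */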
1165
1166static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1167                                  unsigned long pfn, unsigned int pages, int map)
1168{
1169        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1170        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1171
1172        BUG_ON(pages == 0);
1173
1174        /*
 1175         * Fall back to a domain-selective flush if there is no PSI support or
 1176         * the size is too big.
 1177         * PSI requires the page size to be 2^x and the base address to be
 1178         * naturally aligned to that size.
1179         */
1180        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1181                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1182                                                DMA_TLB_DSI_FLUSH);
1183        else
1184                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1185                                                DMA_TLB_PSI_FLUSH);
1186
1187        /*
1188         * In caching mode, changes of pages from non-present to present require
1189         * flush. However, device IOTLB doesn't need to be flushed in this case.
1190         */
1191        if (!cap_caching_mode(iommu->cap) || !map)
1192                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1193}
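/*
 * Example: a request to flush 3 pages is rounded up to 4, so mask becomes
 * ilog2(4) == 2 and the hardware invalidates a naturally aligned 4-page
 * (16KiB) window around addr; if mask exceeds cap_max_amask_val() the code
 * above gives up on PSI and issues a domain-selective flush instead.
 */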
1194
1195static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1196{
1197        u32 pmen;
1198        unsigned long flags;
1199
1200        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1201        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1202        pmen &= ~DMA_PMEN_EPM;
1203        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1204
1205        /* wait for the protected region status bit to clear */
1206        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1207                readl, !(pmen & DMA_PMEN_PRS), pmen);
1208
1209        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1210}
1211
1212static int iommu_enable_translation(struct intel_iommu *iommu)
1213{
1214        u32 sts;
1215        unsigned long flags;
1216
1217        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218        iommu->gcmd |= DMA_GCMD_TE;
1219        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1220
 1221        /* Make sure the hardware completes it */
1222        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                      readl, (sts & DMA_GSTS_TES), sts);
1224
1225        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226        return 0;
1227}
1228
1229static int iommu_disable_translation(struct intel_iommu *iommu)
1230{
1231        u32 sts;
1232        unsigned long flag;
1233
1234        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1235        iommu->gcmd &= ~DMA_GCMD_TE;
1236        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237
 1238        /* Make sure the hardware completes it */
1239        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240                      readl, (!(sts & DMA_GSTS_TES)), sts);
1241
1242        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243        return 0;
1244}
1245
1246
1247static int iommu_init_domains(struct intel_iommu *iommu)
1248{
1249        unsigned long ndomains;
1250        unsigned long nlongs;
1251
1252        ndomains = cap_ndoms(iommu->cap);
1253        pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1254                        ndomains);
1255        nlongs = BITS_TO_LONGS(ndomains);
1256
1257        spin_lock_init(&iommu->lock);
1258
1259        /* TBD: there might be 64K domains,
 1260         * consider a different allocation scheme for future chips
1261         */
1262        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1263        if (!iommu->domain_ids) {
1264                printk(KERN_ERR "Allocating domain id array failed\n");
1265                return -ENOMEM;
1266        }
1267        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1268                        GFP_KERNEL);
1269        if (!iommu->domains) {
1270                printk(KERN_ERR "Allocating domain array failed\n");
1271                return -ENOMEM;
1272        }
1273
1274        /*
1275         * if Caching mode is set, then invalid translations are tagged
 1276         * with domain id 0. Hence we need to pre-allocate it.
1277         */
1278        if (cap_caching_mode(iommu->cap))
1279                set_bit(0, iommu->domain_ids);
1280        return 0;
1281}
1282
1283
1284static void domain_exit(struct dmar_domain *domain);
1285static void vm_domain_exit(struct dmar_domain *domain);
1286
1287void free_dmar_iommu(struct intel_iommu *iommu)
1288{
1289        struct dmar_domain *domain;
1290        int i;
1291        unsigned long flags;
1292
1293        if ((iommu->domains) && (iommu->domain_ids)) {
1294                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1295                        domain = iommu->domains[i];
1296                        clear_bit(i, iommu->domain_ids);
1297
1298                        spin_lock_irqsave(&domain->iommu_lock, flags);
1299                        if (--domain->iommu_count == 0) {
1300                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1301                                        vm_domain_exit(domain);
1302                                else
1303                                        domain_exit(domain);
1304                        }
1305                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1306                }
1307        }
1308
1309        if (iommu->gcmd & DMA_GCMD_TE)
1310                iommu_disable_translation(iommu);
1311
1312        if (iommu->irq) {
1313                irq_set_handler_data(iommu->irq, NULL);
1314                /* This will mask the irq */
1315                free_irq(iommu->irq, iommu);
1316                destroy_irq(iommu->irq);
1317        }
1318
1319        kfree(iommu->domains);
1320        kfree(iommu->domain_ids);
1321
1322        g_iommus[iommu->seq_id] = NULL;
1323
1324        /* if all iommus are freed, free g_iommus */
1325        for (i = 0; i < g_num_of_iommus; i++) {
1326                if (g_iommus[i])
1327                        break;
1328        }
1329
1330        if (i == g_num_of_iommus)
1331                kfree(g_iommus);
1332
1333        /* free context mapping */
1334        free_context_table(iommu);
1335}
1336
1337static struct dmar_domain *alloc_domain(void)
1338{
1339        struct dmar_domain *domain;
1340
1341        domain = alloc_domain_mem();
1342        if (!domain)
1343                return NULL;
1344
1345        domain->nid = -1;
1346        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1347        domain->flags = 0;
1348
1349        return domain;
1350}
1351
1352static int iommu_attach_domain(struct dmar_domain *domain,
1353                               struct intel_iommu *iommu)
1354{
1355        int num;
1356        unsigned long ndomains;
1357        unsigned long flags;
1358
1359        ndomains = cap_ndoms(iommu->cap);
1360
1361        spin_lock_irqsave(&iommu->lock, flags);
1362
1363        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1364        if (num >= ndomains) {
1365                spin_unlock_irqrestore(&iommu->lock, flags);
1366                printk(KERN_ERR "IOMMU: no free domain ids\n");
1367                return -ENOMEM;
1368        }
1369
1370        domain->id = num;
1371        set_bit(num, iommu->domain_ids);
1372        set_bit(iommu->seq_id, domain->iommu_bmp);
1373        iommu->domains[num] = domain;
1374        spin_unlock_irqrestore(&iommu->lock, flags);
1375
1376        return 0;
1377}
1378
1379static void iommu_detach_domain(struct dmar_domain *domain,
1380                                struct intel_iommu *iommu)
1381{
1382        unsigned long flags;
1383        int num, ndomains;
1384        int found = 0;
1385
1386        spin_lock_irqsave(&iommu->lock, flags);
1387        ndomains = cap_ndoms(iommu->cap);
1388        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1389                if (iommu->domains[num] == domain) {
1390                        found = 1;
1391                        break;
1392                }
1393        }
1394
1395        if (found) {
1396                clear_bit(num, iommu->domain_ids);
1397                clear_bit(iommu->seq_id, domain->iommu_bmp);
1398                iommu->domains[num] = NULL;
1399        }
1400        spin_unlock_irqrestore(&iommu->lock, flags);
1401}
1402
1403static struct iova_domain reserved_iova_list;
1404static struct lock_class_key reserved_rbtree_key;
1405
1406static int dmar_init_reserved_ranges(void)
1407{
1408        struct pci_dev *pdev = NULL;
1409        struct iova *iova;
1410        int i;
1411
1412        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1413
1414        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1415                &reserved_rbtree_key);
1416
1417        /* IOAPIC ranges shouldn't be accessed by DMA */
1418        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1419                IOVA_PFN(IOAPIC_RANGE_END));
1420        if (!iova) {
1421                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1422                return -ENODEV;
1423        }
1424
1425        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1426        for_each_pci_dev(pdev) {
1427                struct resource *r;
1428
1429                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1430                        r = &pdev->resource[i];
1431                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1432                                continue;
1433                        iova = reserve_iova(&reserved_iova_list,
1434                                            IOVA_PFN(r->start),
1435                                            IOVA_PFN(r->end));
1436                        if (!iova) {
1437                                printk(KERN_ERR "Reserve iova failed\n");
1438                                return -ENODEV;
1439                        }
1440                }
1441        }
1442        return 0;
1443}
1444
1445static void domain_reserve_special_ranges(struct dmar_domain *domain)
1446{
1447        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1448}
1449
1450static inline int guestwidth_to_adjustwidth(int gaw)
1451{
1452        int agaw;
1453        int r = (gaw - 12) % 9;
1454
1455        if (r == 0)
1456                agaw = gaw;
1457        else
1458                agaw = gaw + 9 - r;
1459        if (agaw > 64)
1460                agaw = 64;
1461        return agaw;
1462}
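/*
 * Worked examples for the rounding above: gaw 48 -> (48-12) % 9 == 0 -> 48;
 * gaw 36 -> remainder 6 -> rounded up to 39; gaw 40 -> remainder 1 -> 48.
 * The result is rounded up to 12 plus a whole number of 9-bit page-table
 * levels and capped at 64.
 */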
1463
1464static int domain_init(struct dmar_domain *domain, int guest_width)
1465{
1466        struct intel_iommu *iommu;
1467        int adjust_width, agaw;
1468        unsigned long sagaw;
1469
1470        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1471        spin_lock_init(&domain->iommu_lock);
1472
1473        domain_reserve_special_ranges(domain);
1474
1475        /* calculate AGAW */
1476        iommu = domain_get_iommu(domain);
1477        if (guest_width > cap_mgaw(iommu->cap))
1478                guest_width = cap_mgaw(iommu->cap);
1479        domain->gaw = guest_width;
1480        adjust_width = guestwidth_to_adjustwidth(guest_width);
1481        agaw = width_to_agaw(adjust_width);
1482        sagaw = cap_sagaw(iommu->cap);
1483        if (!test_bit(agaw, &sagaw)) {
1484                /* hardware doesn't support it, choose a bigger one */
1485                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1486                agaw = find_next_bit(&sagaw, 5, agaw);
1487                if (agaw >= 5)
1488                        return -ENODEV;
1489        }
1490        domain->agaw = agaw;
1491        INIT_LIST_HEAD(&domain->devices);
1492
1493        if (ecap_coherent(iommu->ecap))
1494                domain->iommu_coherency = 1;
1495        else
1496                domain->iommu_coherency = 0;
1497
1498        if (ecap_sc_support(iommu->ecap))
1499                domain->iommu_snooping = 1;
1500        else
1501                domain->iommu_snooping = 0;
1502
1503        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1504        domain->iommu_count = 1;
1505        domain->nid = iommu->node;
1506
1507        /* always allocate the top pgd */
1508        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1509        if (!domain->pgd)
1510                return -ENOMEM;
1511        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1512        return 0;
1513}
1514
1515static void domain_exit(struct dmar_domain *domain)
1516{
1517        struct dmar_drhd_unit *drhd;
1518        struct intel_iommu *iommu;
1519
 1520        /* Domain 0 is reserved, so don't process it */
1521        if (!domain)
1522                return;
1523
1524        /* Flush any lazy unmaps that may reference this domain */
1525        if (!intel_iommu_strict)
1526                flush_unmaps_timeout(0);
1527
1528        domain_remove_dev_info(domain);
1529        /* destroy iovas */
1530        put_iova_domain(&domain->iovad);
1531
1532        /* clear ptes */
1533        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1534
1535        /* free page tables */
1536        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1537
1538        for_each_active_iommu(iommu, drhd)
1539                if (test_bit(iommu->seq_id, domain->iommu_bmp))
1540                        iommu_detach_domain(domain, iommu);
1541
1542        free_domain_mem(domain);
1543}
1544
1545static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1546                                 u8 bus, u8 devfn, int translation)
1547{
1548        struct context_entry *context;
1549        unsigned long flags;
1550        struct intel_iommu *iommu;
1551        struct dma_pte *pgd;
1552        unsigned long num;
1553        unsigned long ndomains;
1554        int id;
1555        int agaw;
1556        struct device_domain_info *info = NULL;
1557
1558        pr_debug("Set context mapping for %02x:%02x.%d\n",
1559                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1560
1561        BUG_ON(!domain->pgd);
1562        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1563               translation != CONTEXT_TT_MULTI_LEVEL);
1564
1565        iommu = device_to_iommu(segment, bus, devfn);
1566        if (!iommu)
1567                return -ENODEV;
1568
1569        context = device_to_context_entry(iommu, bus, devfn);
1570        if (!context)
1571                return -ENOMEM;
1572        spin_lock_irqsave(&iommu->lock, flags);
1573        if (context_present(context)) {
1574                spin_unlock_irqrestore(&iommu->lock, flags);
1575                return 0;
1576        }
1577
1578        id = domain->id;
1579        pgd = domain->pgd;
1580
1581        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1582            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1583                int found = 0;
1584
1585                /* find an available domain id for this device in iommu */
1586                ndomains = cap_ndoms(iommu->cap);
1587                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1588                        if (iommu->domains[num] == domain) {
1589                                id = num;
1590                                found = 1;
1591                                break;
1592                        }
1593                }
1594
1595                if (found == 0) {
1596                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1597                        if (num >= ndomains) {
1598                                spin_unlock_irqrestore(&iommu->lock, flags);
1599                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1600                                return -EFAULT;
1601                        }
1602
1603                        set_bit(num, iommu->domain_ids);
1604                        iommu->domains[num] = domain;
1605                        id = num;
1606                }
1607
1608                /* Skip top levels of page tables for an
1609                 * iommu which has a smaller agaw than the default.
1610                 * Unnecessary for PT mode.
1611                 */
1612                if (translation != CONTEXT_TT_PASS_THROUGH) {
1613                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1614                                pgd = phys_to_virt(dma_pte_addr(pgd));
1615                                if (!dma_pte_present(pgd)) {
1616                                        spin_unlock_irqrestore(&iommu->lock, flags);
1617                                        return -ENOMEM;
1618                                }
1619                        }
1620                }
1621        }
1622
1623        context_set_domain_id(context, id);
1624
1625        if (translation != CONTEXT_TT_PASS_THROUGH) {
1626                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1627                translation = info ? CONTEXT_TT_DEV_IOTLB :
1628                                     CONTEXT_TT_MULTI_LEVEL;
1629        }
1630        /*
1631         * In pass through mode, AW must be programmed to indicate the largest
1632         * AGAW value supported by hardware. And ASR is ignored by hardware.
1633         */
1634        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1635                context_set_address_width(context, iommu->msagaw);
1636        else {
1637                context_set_address_root(context, virt_to_phys(pgd));
1638                context_set_address_width(context, iommu->agaw);
1639        }
1640
1641        context_set_translation_type(context, translation);
1642        context_set_fault_enable(context);
1643        context_set_present(context);
1644        domain_flush_cache(domain, context, sizeof(*context));
1645
1646        /*
1647         * It's a non-present to present mapping. If hardware doesn't cache
1648         * non-present entries we only need to flush the write-buffer. If it
1649         * _does_ cache non-present entries, then it does so in the special
1650         * domain #0, which we have to flush:
1651         */
1652        if (cap_caching_mode(iommu->cap)) {
1653                iommu->flush.flush_context(iommu, 0,
1654                                           (((u16)bus) << 8) | devfn,
1655                                           DMA_CCMD_MASK_NOBIT,
1656                                           DMA_CCMD_DEVICE_INVL);
1657                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1658        } else {
1659                iommu_flush_write_buffer(iommu);
1660        }
1661        iommu_enable_dev_iotlb(info);
1662        spin_unlock_irqrestore(&iommu->lock, flags);
1663
1664        spin_lock_irqsave(&domain->iommu_lock, flags);
1665        if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1666                domain->iommu_count++;
1667                if (domain->iommu_count == 1)
1668                        domain->nid = iommu->node;
1669                domain_update_iommu_cap(domain);
1670        }
1671        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1672        return 0;
1673}
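
/*
 * Editor's illustration, not part of the driver: the source-id used for the
 * device-selective context-cache flush above is simply (bus << 8) | devfn,
 * where devfn itself packs (slot << 3) | function.  Standalone sketch for a
 * hypothetical device 3a:05.2; excluded from the build.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define EX_DEVFN(slot, func)    ((((slot) & 0x1f) << 3) | ((func) & 0x07))

int main(void)
{
        uint8_t bus = 0x3a, devfn = EX_DEVFN(0x05, 2);
        uint16_t sid = ((uint16_t)bus << 8) | devfn;

        printf("devfn=%02x sid=%04x\n", devfn, sid);    /* devfn=2a sid=3a2a */
        return 0;
}
#endif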
1674
1675static int
1676domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1677                        int translation)
1678{
1679        int ret;
1680        struct pci_dev *tmp, *parent;
1681
1682        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1683                                         pdev->bus->number, pdev->devfn,
1684                                         translation);
1685        if (ret)
1686                return ret;
1687
1688        /* dependent device mapping */
1689        tmp = pci_find_upstream_pcie_bridge(pdev);
1690        if (!tmp)
1691                return 0;
1692        /* Secondary interface's bus number and devfn 0 */
1693        parent = pdev->bus->self;
1694        while (parent != tmp) {
1695                ret = domain_context_mapping_one(domain,
1696                                                 pci_domain_nr(parent->bus),
1697                                                 parent->bus->number,
1698                                                 parent->devfn, translation);
1699                if (ret)
1700                        return ret;
1701                parent = parent->bus->self;
1702        }
1703        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1704                return domain_context_mapping_one(domain,
1705                                        pci_domain_nr(tmp->subordinate),
1706                                        tmp->subordinate->number, 0,
1707                                        translation);
1708        else /* this is a legacy PCI bridge */
1709                return domain_context_mapping_one(domain,
1710                                                  pci_domain_nr(tmp->bus),
1711                                                  tmp->bus->number,
1712                                                  tmp->devfn,
1713                                                  translation);
1714}
1715
1716static int domain_context_mapped(struct pci_dev *pdev)
1717{
1718        int ret;
1719        struct pci_dev *tmp, *parent;
1720        struct intel_iommu *iommu;
1721
1722        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1723                                pdev->devfn);
1724        if (!iommu)
1725                return -ENODEV;
1726
1727        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1728        if (!ret)
1729                return ret;
1730        /* dependent device mapping */
1731        tmp = pci_find_upstream_pcie_bridge(pdev);
1732        if (!tmp)
1733                return ret;
1734        /* Secondary interface's bus number and devfn 0 */
1735        parent = pdev->bus->self;
1736        while (parent != tmp) {
1737                ret = device_context_mapped(iommu, parent->bus->number,
1738                                            parent->devfn);
1739                if (!ret)
1740                        return ret;
1741                parent = parent->bus->self;
1742        }
1743        if (pci_is_pcie(tmp))
1744                return device_context_mapped(iommu, tmp->subordinate->number,
1745                                             0);
1746        else
1747                return device_context_mapped(iommu, tmp->bus->number,
1748                                             tmp->devfn);
1749}
1750
1751/* Returns the number of VTD pages, but aligned to the MM page size */
1752static inline unsigned long aligned_nrpages(unsigned long host_addr,
1753                                            size_t size)
1754{
1755        host_addr &= ~PAGE_MASK;
1756        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1757}
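
/*
 * Editor's illustration, not part of the driver: aligned_nrpages() keeps only
 * the sub-page offset of host_addr and rounds offset + size up to whole VT-d
 * pages.  Standalone sketch assuming 4KiB pages (PAGE_SHIFT == VTD_PAGE_SHIFT
 * == 12, as on x86); excluded from the build.
 */
#if 0
#include <stdio.h>

#define EX_PAGE_SHIFT           12
#define EX_PAGE_SIZE            (1UL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK            (~(EX_PAGE_SIZE - 1))
#define EX_PAGE_ALIGN(x)        (((x) + EX_PAGE_SIZE - 1) & EX_PAGE_MASK)

static unsigned long ex_aligned_nrpages(unsigned long host_addr, size_t size)
{
        host_addr &= ~EX_PAGE_MASK;             /* keep only the page offset */
        return EX_PAGE_ALIGN(host_addr + size) >> EX_PAGE_SHIFT;
}

int main(void)
{
        /* 0x100 bytes starting 0xff0 bytes into a page straddle two pages */
        printf("%lu\n", ex_aligned_nrpages(0x1ff0, 0x100));     /* prints 2 */
        return 0;
}
#endif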
1758
1759/* Return largest possible superpage level for a given mapping */
1760static inline int hardware_largepage_caps(struct dmar_domain *domain,
1761                                          unsigned long iov_pfn,
1762                                          unsigned long phy_pfn,
1763                                          unsigned long pages)
1764{
1765        int support, level = 1;
1766        unsigned long pfnmerge;
1767
1768        support = domain->iommu_superpage;
1769
1770        /* To use a large page, the virtual *and* physical addresses
1771           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1772           of them will mean we have to use smaller pages. So just
1773           merge them and check both at once. */
1774        pfnmerge = iov_pfn | phy_pfn;
1775
1776        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1777                pages >>= VTD_STRIDE_SHIFT;
1778                if (!pages)
1779                        break;
1780                pfnmerge >>= VTD_STRIDE_SHIFT;
1781                level++;
1782                support--;
1783        }
1784        return level;
1785}
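
/*
 * Editor's illustration, not part of the driver: the pfnmerge trick above.  A
 * 2MiB superpage (level 2) is usable only when the low 9 bits of *both* the
 * IOVA pfn and the physical pfn are zero and at least 512 pages remain;
 * OR-ing the two pfns lets a single mask test check both at once.  Standalone
 * sketch assuming VTD_STRIDE_SHIFT == 9 and hardware support for one
 * superpage level (2MiB only); excluded from the build.
 */
#if 0
#include <stdio.h>

#define EX_STRIDE_SHIFT 9
#define EX_STRIDE_MASK  (-1UL << EX_STRIDE_SHIFT)

static int ex_largepage_level(int support, unsigned long iov_pfn,
                              unsigned long phy_pfn, unsigned long pages)
{
        unsigned long pfnmerge = iov_pfn | phy_pfn;
        int level = 1;

        while (support && !(pfnmerge & ~EX_STRIDE_MASK)) {
                pages >>= EX_STRIDE_SHIFT;
                if (!pages)
                        break;
                pfnmerge >>= EX_STRIDE_SHIFT;
                level++;
                support--;
        }
        return level;
}

int main(void)
{
        /* both pfns 2MiB-aligned, 1024 pages left: a 2MiB PTE can be used */
        printf("%d\n", ex_largepage_level(1, 0x200, 0x400, 1024));      /* 2 */
        /* misaligned IOVA pfn: fall back to 4KiB PTEs */
        printf("%d\n", ex_largepage_level(1, 0x201, 0x400, 1024));      /* 1 */
        return 0;
}
#endif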
1786
1787static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1788                            struct scatterlist *sg, unsigned long phys_pfn,
1789                            unsigned long nr_pages, int prot)
1790{
1791        struct dma_pte *first_pte = NULL, *pte = NULL;
1792        phys_addr_t uninitialized_var(pteval);
1793        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1794        unsigned long sg_res;
1795        unsigned int largepage_lvl = 0;
1796        unsigned long lvl_pages = 0;
1797
1798        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1799
1800        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1801                return -EINVAL;
1802
1803        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1804
1805        if (sg)
1806                sg_res = 0;
1807        else {
1808                sg_res = nr_pages + 1;
1809                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1810        }
1811
1812        while (nr_pages > 0) {
1813                uint64_t tmp;
1814
1815                if (!sg_res) {
1816                        sg_res = aligned_nrpages(sg->offset, sg->length);
1817                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1818                        sg->dma_length = sg->length;
1819                        pteval = page_to_phys(sg_page(sg)) | prot;
1820                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1821                }
1822
1823                if (!pte) {
1824                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1825
1826                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1827                        if (!pte)
1828                                return -ENOMEM;
1829                        /* It is a large page */
1830                        if (largepage_lvl > 1)
1831                                pteval |= DMA_PTE_LARGE_PAGE;
1832                        else
1833                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1834
1835                }
1836                /* We don't need a lock here; nobody else
1837                 * touches this iova range.
1838                 */
1839                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1840                if (tmp) {
1841                        static int dumps = 5;
1842                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1843                               iov_pfn, tmp, (unsigned long long)pteval);
1844                        if (dumps) {
1845                                dumps--;
1846                                debug_dma_dump_mappings(NULL);
1847                        }
1848                        WARN_ON(1);
1849                }
1850
1851                lvl_pages = lvl_to_nr_pages(largepage_lvl);
1852
1853                BUG_ON(nr_pages < lvl_pages);
1854                BUG_ON(sg_res < lvl_pages);
1855
1856                nr_pages -= lvl_pages;
1857                iov_pfn += lvl_pages;
1858                phys_pfn += lvl_pages;
1859                pteval += lvl_pages * VTD_PAGE_SIZE;
1860                sg_res -= lvl_pages;
1861
1862                /* If the next PTE would be the first in a new page, then we
1863                   need to flush the cache on the entries we've just written.
1864                   And then we'll need to recalculate 'pte', so clear it and
1865                   let it get set again in the if (!pte) block above.
1866
1867                   If we're done (!nr_pages) we need to flush the cache too.
1868
1869                   Also if we've been setting superpages, we may need to
1870                   recalculate 'pte' and switch back to smaller pages for the
1871                   end of the mapping, if the trailing size is not enough to
1872                   use another superpage (i.e. sg_res < lvl_pages). */
1873                pte++;
1874                if (!nr_pages || first_pte_in_page(pte) ||
1875                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1876                        domain_flush_cache(domain, first_pte,
1877                                           (void *)pte - (void *)first_pte);
1878                        pte = NULL;
1879                }
1880
1881                if (!sg_res && nr_pages)
1882                        sg = sg_next(sg);
1883        }
1884        return 0;
1885}
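
/*
 * Editor's illustration, not part of the driver: the flush boundary in the
 * loop above.  With 4KiB page-table pages and 8-byte PTEs there are 512
 * entries per table page, so after filling entries 0..511 the incremented pte
 * pointer lands on a new page and the range just written gets flushed (a
 * clflush when the IOMMU is not cache-coherent).  Standalone sketch of the
 * "first PTE in a page" test; excluded from the build.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define EX_VTD_PAGE_SIZE        4096

static int ex_first_pte_in_page(uint64_t pte_addr)
{
        return !(pte_addr & (EX_VTD_PAGE_SIZE - 1));
}

int main(void)
{
        uint64_t table = 0x1000;        /* hypothetical table page address */
        int i, flushes = 0;

        for (i = 1; i <= 1024; i++)     /* step past 1024 8-byte PTEs ... */
                if (ex_first_pte_in_page(table + i * 8))
                        flushes++;      /* ... crossing a page boundary twice */
        printf("%d flushes\n", flushes);        /* prints "2 flushes" */
        return 0;
}
#endif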
1886
1887static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1888                                    struct scatterlist *sg, unsigned long nr_pages,
1889                                    int prot)
1890{
1891        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1892}
1893
1894static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1895                                     unsigned long phys_pfn, unsigned long nr_pages,
1896                                     int prot)
1897{
1898        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1899}
1900
1901static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1902{
1903        if (!iommu)
1904                return;
1905
1906        clear_context_table(iommu, bus, devfn);
1907        iommu->flush.flush_context(iommu, 0, 0, 0,
1908                                           DMA_CCMD_GLOBAL_INVL);
1909        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1910}
1911
1912static inline void unlink_domain_info(struct device_domain_info *info)
1913{
1914        assert_spin_locked(&device_domain_lock);
1915        list_del(&info->link);
1916        list_del(&info->global);
1917        if (info->dev)
1918                info->dev->dev.archdata.iommu = NULL;
1919}
1920
1921static void domain_remove_dev_info(struct dmar_domain *domain)
1922{
1923        struct device_domain_info *info;
1924        unsigned long flags;
1925        struct intel_iommu *iommu;
1926
1927        spin_lock_irqsave(&device_domain_lock, flags);
1928        while (!list_empty(&domain->devices)) {
1929                info = list_entry(domain->devices.next,
1930                        struct device_domain_info, link);
1931                unlink_domain_info(info);
1932                spin_unlock_irqrestore(&device_domain_lock, flags);
1933
1934                iommu_disable_dev_iotlb(info);
1935                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1936                iommu_detach_dev(iommu, info->bus, info->devfn);
1937                free_devinfo_mem(info);
1938
1939                spin_lock_irqsave(&device_domain_lock, flags);
1940        }
1941        spin_unlock_irqrestore(&device_domain_lock, flags);
1942}
1943
1944/*
1945 * find_domain
1946 * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1947 */
1948static struct dmar_domain *
1949find_domain(struct pci_dev *pdev)
1950{
1951        struct device_domain_info *info;
1952
1953        /* No lock here, assumes no domain exit in normal case */
1954        info = pdev->dev.archdata.iommu;
1955        if (info)
1956                return info->domain;
1957        return NULL;
1958}
1959
1960/* domain is initialized */
1961static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1962{
1963        struct dmar_domain *domain, *found = NULL;
1964        struct intel_iommu *iommu;
1965        struct dmar_drhd_unit *drhd;
1966        struct device_domain_info *info, *tmp;
1967        struct pci_dev *dev_tmp;
1968        unsigned long flags;
1969        int bus = 0, devfn = 0;
1970        int segment;
1971        int ret;
1972
1973        domain = find_domain(pdev);
1974        if (domain)
1975                return domain;
1976
1977        segment = pci_domain_nr(pdev->bus);
1978
1979        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1980        if (dev_tmp) {
1981                if (pci_is_pcie(dev_tmp)) {
1982                        bus = dev_tmp->subordinate->number;
1983                        devfn = 0;
1984                } else {
1985                        bus = dev_tmp->bus->number;
1986                        devfn = dev_tmp->devfn;
1987                }
1988                spin_lock_irqsave(&device_domain_lock, flags);
1989                list_for_each_entry(info, &device_domain_list, global) {
1990                        if (info->segment == segment &&
1991                            info->bus == bus && info->devfn == devfn) {
1992                                found = info->domain;
1993                                break;
1994                        }
1995                }
1996                spin_unlock_irqrestore(&device_domain_lock, flags);
1997                /* the pcie-to-pci bridge already has a domain, use it */
1998                if (found) {
1999                        domain = found;
2000                        goto found_domain;
2001                }
2002        }
2003
2004        domain = alloc_domain();
2005        if (!domain)
2006                goto error;
2007
2008        /* Allocate new domain for the device */
2009        drhd = dmar_find_matched_drhd_unit(pdev);
2010        if (!drhd) {
2011                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2012                        pci_name(pdev));
2013                free_domain_mem(domain);
2014                return NULL;
2015        }
2016        iommu = drhd->iommu;
2017
2018        ret = iommu_attach_domain(domain, iommu);
2019        if (ret) {
2020                free_domain_mem(domain);
2021                goto error;
2022        }
2023
2024        if (domain_init(domain, gaw)) {
2025                domain_exit(domain);
2026                goto error;
2027        }
2028
2029        /* register pcie-to-pci device */
2030        if (dev_tmp) {
2031                info = alloc_devinfo_mem();
2032                if (!info) {
2033                        domain_exit(domain);
2034                        goto error;
2035                }
2036                info->segment = segment;
2037                info->bus = bus;
2038                info->devfn = devfn;
2039                info->dev = NULL;
2040                info->domain = domain;
2041                /* This domain is shared by devices under the p2p bridge */
2042                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2043
2044                /* the pcie-to-pci bridge already has a domain, use it */
2045                found = NULL;
2046                spin_lock_irqsave(&device_domain_lock, flags);
2047                list_for_each_entry(tmp, &device_domain_list, global) {
2048                        if (tmp->segment == segment &&
2049                            tmp->bus == bus && tmp->devfn == devfn) {
2050                                found = tmp->domain;
2051                                break;
2052                        }
2053                }
2054                if (found) {
2055                        spin_unlock_irqrestore(&device_domain_lock, flags);
2056                        free_devinfo_mem(info);
2057                        domain_exit(domain);
2058                        domain = found;
2059                } else {
2060                        list_add(&info->link, &domain->devices);
2061                        list_add(&info->global, &device_domain_list);
2062                        spin_unlock_irqrestore(&device_domain_lock, flags);
2063                }
2064        }
2065
2066found_domain:
2067        info = alloc_devinfo_mem();
2068        if (!info)
2069                goto error;
2070        info->segment = segment;
2071        info->bus = pdev->bus->number;
2072        info->devfn = pdev->devfn;
2073        info->dev = pdev;
2074        info->domain = domain;
2075        spin_lock_irqsave(&device_domain_lock, flags);
2076        /* somebody else was faster and already set it up */
2077        found = find_domain(pdev);
2078        if (found != NULL) {
2079                spin_unlock_irqrestore(&device_domain_lock, flags);
2080                if (found != domain) {
2081                        domain_exit(domain);
2082                        domain = found;
2083                }
2084                free_devinfo_mem(info);
2085                return domain;
2086        }
2087        list_add(&info->link, &domain->devices);
2088        list_add(&info->global, &device_domain_list);
2089        pdev->dev.archdata.iommu = info;
2090        spin_unlock_irqrestore(&device_domain_lock, flags);
2091        return domain;
2092error:
2093        /* recheck it here, maybe others set it */
2094        return find_domain(pdev);
2095}
2096
2097static int iommu_identity_mapping;
2098#define IDENTMAP_ALL            1
2099#define IDENTMAP_GFX            2
2100#define IDENTMAP_AZALIA         4
2101
2102static int iommu_domain_identity_map(struct dmar_domain *domain,
2103                                     unsigned long long start,
2104                                     unsigned long long end)
2105{
2106        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2107        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2108
2109        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2110                          dma_to_mm_pfn(last_vpfn))) {
2111                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2112                return -ENOMEM;
2113        }
2114
2115        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2116                 start, end, domain->id);
2117        /*
2118         * RMRR range might have overlap with physical memory range,
2119         * clear it first
2120         */
2121        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2122
2123        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2124                                  last_vpfn - first_vpfn + 1,
2125                                  DMA_PTE_READ|DMA_PTE_WRITE);
2126}
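
/*
 * Editor's illustration, not part of the driver: the pfn arithmetic used
 * above for the RMRR/ISA unity maps.  Identity-mapping 0 .. 16MiB - 1 (the
 * ISA/floppy case below) with 4KiB VT-d pages covers vpfns 0x0 .. 0xfff,
 * i.e. last_vpfn - first_vpfn + 1 == 4096 pages.  Standalone sketch;
 * excluded from the build.
 */
#if 0
#include <stdio.h>

#define EX_VTD_PAGE_SHIFT 12

int main(void)
{
        unsigned long long start = 0, end = 16ULL * 1024 * 1024 - 1;
        unsigned long first_vpfn = start >> EX_VTD_PAGE_SHIFT;
        unsigned long last_vpfn = end >> EX_VTD_PAGE_SHIFT;

        printf("%lx..%lx, %lu pages\n", first_vpfn, last_vpfn,
               last_vpfn - first_vpfn + 1);     /* 0..fff, 4096 pages */
        return 0;
}
#endif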
2127
2128static int iommu_prepare_identity_map(struct pci_dev *pdev,
2129                                      unsigned long long start,
2130                                      unsigned long long end)
2131{
2132        struct dmar_domain *domain;
2133        int ret;
2134
2135        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2136        if (!domain)
2137                return -ENOMEM;
2138
2139        /* For _hardware_ passthrough, don't bother. But for software
2140           passthrough, we do it anyway -- it may indicate a memory
2141           range which is reserved in E820, and so didn't get set
2142           up to start with in si_domain */
2143        if (domain == si_domain && hw_pass_through) {
2144                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2145                       pci_name(pdev), start, end);
2146                return 0;
2147        }
2148
2149        printk(KERN_INFO
2150               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2151               pci_name(pdev), start, end);
2152
2153        if (end < start) {
2154                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2155                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2156                        dmi_get_system_info(DMI_BIOS_VENDOR),
2157                        dmi_get_system_info(DMI_BIOS_VERSION),
2158                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2159                ret = -EIO;
2160                goto error;
2161        }
2162
2163        if (end >> agaw_to_width(domain->agaw)) {
2164                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2165                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2166                     agaw_to_width(domain->agaw),
2167                     dmi_get_system_info(DMI_BIOS_VENDOR),
2168                     dmi_get_system_info(DMI_BIOS_VERSION),
2169                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2170                ret = -EIO;
2171                goto error;
2172        }
2173
2174        ret = iommu_domain_identity_map(domain, start, end);
2175        if (ret)
2176                goto error;
2177
2178        /* context entry init */
2179        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2180        if (ret)
2181                goto error;
2182
2183        return 0;
2184
2185 error:
2186        domain_exit(domain);
2187        return ret;
2188}
2189
2190static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2191        struct pci_dev *pdev)
2192{
2193        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2194                return 0;
2195        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2196                rmrr->end_address);
2197}
2198
2199#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2200static inline void iommu_prepare_isa(void)
2201{
2202        struct pci_dev *pdev;
2203        int ret;
2204
2205        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2206        if (!pdev)
2207                return;
2208
2209        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2210        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2211
2212        if (ret)
2213                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2214                       "floppy might not work\n");
2215
2216}
2217#else
2218static inline void iommu_prepare_isa(void)
2219{
2220        return;
2221}
2222#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2223
2224static int md_domain_init(struct dmar_domain *domain, int guest_width);
2225
2226static int __init si_domain_init(int hw)
2227{
2228        struct dmar_drhd_unit *drhd;
2229        struct intel_iommu *iommu;
2230        int nid, ret = 0;
2231
2232        si_domain = alloc_domain();
2233        if (!si_domain)
2234                return -EFAULT;
2235
2236        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2237
2238        for_each_active_iommu(iommu, drhd) {
2239                ret = iommu_attach_domain(si_domain, iommu);
2240                if (ret) {
2241                        domain_exit(si_domain);
2242                        return -EFAULT;
2243                }
2244        }
2245
2246        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2247                domain_exit(si_domain);
2248                return -EFAULT;
2249        }
2250
2251        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2252
2253        if (hw)
2254                return 0;
2255
2256        for_each_online_node(nid) {
2257                unsigned long start_pfn, end_pfn;
2258                int i;
2259
2260                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2261                        ret = iommu_domain_identity_map(si_domain,
2262                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2263                        if (ret)
2264                                return ret;
2265                }
2266        }
2267
2268        return 0;
2269}
2270
2271static void domain_remove_one_dev_info(struct dmar_domain *domain,
2272                                          struct pci_dev *pdev);
2273static int identity_mapping(struct pci_dev *pdev)
2274{
2275        struct device_domain_info *info;
2276
2277        if (likely(!iommu_identity_mapping))
2278                return 0;
2279
2280        info = pdev->dev.archdata.iommu;
2281        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2282                return (info->domain == si_domain);
2283
2284        return 0;
2285}
2286
2287static int domain_add_dev_info(struct dmar_domain *domain,
2288                               struct pci_dev *pdev,
2289                               int translation)
2290{
2291        struct device_domain_info *info;
2292        unsigned long flags;
2293        int ret;
2294
2295        info = alloc_devinfo_mem();
2296        if (!info)
2297                return -ENOMEM;
2298
2299        info->segment = pci_domain_nr(pdev->bus);
2300        info->bus = pdev->bus->number;
2301        info->devfn = pdev->devfn;
2302        info->dev = pdev;
2303        info->domain = domain;
2304
2305        spin_lock_irqsave(&device_domain_lock, flags);
2306        list_add(&info->link, &domain->devices);
2307        list_add(&info->global, &device_domain_list);
2308        pdev->dev.archdata.iommu = info;
2309        spin_unlock_irqrestore(&device_domain_lock, flags);
2310
2311        ret = domain_context_mapping(domain, pdev, translation);
2312        if (ret) {
2313                spin_lock_irqsave(&device_domain_lock, flags);
2314                unlink_domain_info(info);
2315                spin_unlock_irqrestore(&device_domain_lock, flags);
2316                free_devinfo_mem(info);
2317                return ret;
2318        }
2319
2320        return 0;
2321}
2322
2323static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2324{
2325        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2326                return 1;
2327
2328        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2329                return 1;
2330
2331        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2332                return 0;
2333
2334        /*
2335         * We want to start off with all devices in the 1:1 domain, and
2336         * take them out later if we find they can't access all of memory.
2337         *
2338         * However, we can't do this for PCI devices behind bridges,
2339         * because all PCI devices behind the same bridge will end up
2340         * with the same source-id on their transactions.
2341         *
2342         * Practically speaking, we can't change things around for these
2343         * devices at run-time, because we can't be sure there'll be no
2344         * DMA transactions in flight for any of their siblings.
2345         * 
2346         * So PCI devices (unless they're on the root bus) as well as
2347         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2348         * the 1:1 domain, just in _case_ one of their siblings turns out
2349         * not to be able to map all of memory.
2350         */
2351        if (!pci_is_pcie(pdev)) {
2352                if (!pci_is_root_bus(pdev->bus))
2353                        return 0;
2354                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2355                        return 0;
2356        } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2357                return 0;
2358
2359        /* 
2360         * At boot time, we don't yet know if devices will be 64-bit capable.
2361         * Assume that they will -- if they turn out not to be, then we can 
2362         * take them out of the 1:1 domain later.
2363         */
2364        if (!startup) {
2365                /*
2366                 * If the device's dma_mask is less than the system's memory
2367                 * size then this is not a candidate for identity mapping.
2368                 */
2369                u64 dma_mask = pdev->dma_mask;
2370
2371                if (pdev->dev.coherent_dma_mask &&
2372                    pdev->dev.coherent_dma_mask < dma_mask)
2373                        dma_mask = pdev->dev.coherent_dma_mask;
2374
2375                return dma_mask >= dma_get_required_mask(&pdev->dev);
2376        }
2377
2378        return 1;
2379}
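
/*
 * Editor's illustration, not part of the driver: the late (startup == 0)
 * check above.  A device whose effective DMA mask is narrower than the mask
 * needed to reach all of memory is not a candidate for identity mapping.
 * Standalone sketch with made-up masks; excluded from the build.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static int ex_should_identity_map(uint64_t dma_mask, uint64_t coherent_mask,
                                  uint64_t required_mask)
{
        if (coherent_mask && coherent_mask < dma_mask)
                dma_mask = coherent_mask;
        return dma_mask >= required_mask;
}

int main(void)
{
        uint64_t mask32 = 0xffffffffULL, mask64 = ~0ULL;

        /* 64-bit capable device on a large-memory machine: keep the 1:1 map */
        printf("%d\n", ex_should_identity_map(mask64, mask64, mask64)); /* 1 */
        /* 32-bit-only device on the same machine: use remapped DMA */
        printf("%d\n", ex_should_identity_map(mask32, mask32, mask64)); /* 0 */
        return 0;
}
#endif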
2380
2381static int __init iommu_prepare_static_identity_mapping(int hw)
2382{
2383        struct pci_dev *pdev = NULL;
2384        int ret;
2385
2386        ret = si_domain_init(hw);
2387        if (ret)
2388                return -EFAULT;
2389
2390        for_each_pci_dev(pdev) {
2391                if (iommu_should_identity_map(pdev, 1)) {
2392                        ret = domain_add_dev_info(si_domain, pdev,
2393                                             hw ? CONTEXT_TT_PASS_THROUGH :
2394                                                  CONTEXT_TT_MULTI_LEVEL);
2395                        if (ret) {
2396                                /* device not associated with an iommu */
2397                                if (ret == -ENODEV)
2398                                        continue;
2399                                return ret;
2400                        }
2401                        pr_info("IOMMU: %s identity mapping for device %s\n",
2402                                hw ? "hardware" : "software", pci_name(pdev));
2403                }
2404        }
2405
2406        return 0;
2407}
2408
2409static int __init init_dmars(void)
2410{
2411        struct dmar_drhd_unit *drhd;
2412        struct dmar_rmrr_unit *rmrr;
2413        struct pci_dev *pdev;
2414        struct intel_iommu *iommu;
2415        int i, ret;
2416
2417        /*
2418         * for each drhd
2419         *    allocate root
2420         *    initialize and program root entry to not present
2421         * endfor
2422         */
2423        for_each_drhd_unit(drhd) {
2424                /*
2425                 * No lock needed: this is only incremented in the single-
2426                 * threaded kernel __init code path; all other accesses are
2427                 * read-only.
2428                 */
2429                if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2430                        g_num_of_iommus++;
2431                        continue;
2432                }
2433                printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2434                          IOMMU_UNITS_SUPPORTED);
2435        }
2436
2437        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2438                        GFP_KERNEL);
2439        if (!g_iommus) {
2440                printk(KERN_ERR "Allocating global iommu array failed\n");
2441                ret = -ENOMEM;
2442                goto error;
2443        }
2444
2445        deferred_flush = kzalloc(g_num_of_iommus *
2446                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2447        if (!deferred_flush) {
2448                ret = -ENOMEM;
2449                goto error;
2450        }
2451
2452        for_each_drhd_unit(drhd) {
2453                if (drhd->ignored)
2454                        continue;
2455
2456                iommu = drhd->iommu;
2457                g_iommus[iommu->seq_id] = iommu;
2458
2459                ret = iommu_init_domains(iommu);
2460                if (ret)
2461                        goto error;
2462
2463                /*
2464                 * TBD:
2465                 * we could share the same root & context tables
2466                 * among all IOMMUs. Need to split it later.
2467                 */
2468                ret = iommu_alloc_root_entry(iommu);
2469                if (ret) {
2470                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2471                        goto error;
2472                }
2473                if (!ecap_pass_through(iommu->ecap))
2474                        hw_pass_through = 0;
2475        }
2476
2477        /*
2478         * Start from a sane iommu hardware state.
2479         */
2480        for_each_drhd_unit(drhd) {
2481                if (drhd->ignored)
2482                        continue;
2483
2484                iommu = drhd->iommu;
2485
2486                /*
2487                 * If the queued invalidation is already initialized by us
2488                 * (for example, while enabling interrupt-remapping) then
2489         * we already have things rolling from a sane state.
2490                 */
2491                if (iommu->qi)
2492                        continue;
2493
2494                /*
2495                 * Clear any previous faults.
2496                 */
2497                dmar_fault(-1, iommu);
2498                /*
2499                 * Disable queued invalidation if supported and already enabled
2500                 * before OS handover.
2501                 */
2502                dmar_disable_qi(iommu);
2503        }
2504
2505        for_each_drhd_unit(drhd) {
2506                if (drhd->ignored)
2507                        continue;
2508
2509                iommu = drhd->iommu;
2510
2511                if (dmar_enable_qi(iommu)) {
2512                        /*
2513                         * Queued Invalidate not enabled, use Register Based
2514                         * Invalidate
2515                         */
2516                        iommu->flush.flush_context = __iommu_flush_context;
2517                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2518                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2519                               "invalidation\n",
2520                                iommu->seq_id,
2521                               (unsigned long long)drhd->reg_base_addr);
2522                } else {
2523                        iommu->flush.flush_context = qi_flush_context;
2524                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2525                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2526                               "invalidation\n",
2527                                iommu->seq_id,
2528                               (unsigned long long)drhd->reg_base_addr);
2529                }
2530        }
2531
2532        if (iommu_pass_through)
2533                iommu_identity_mapping |= IDENTMAP_ALL;
2534
2535#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2536        iommu_identity_mapping |= IDENTMAP_GFX;
2537#endif
2538
2539        check_tylersburg_isoch();
2540
2541        /*
2542         * If pass-through is not set or not enabled, set up context entries
2543         * for identity mappings for rmrr, gfx, and isa, and possibly fall back
2544         * to static identity mapping if iommu_identity_mapping is set.
2545         */
2546        if (iommu_identity_mapping) {
2547                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2548                if (ret) {
2549                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2550                        goto error;
2551                }
2552        }
2553        /*
2554         * For each rmrr
2555         *   for each dev attached to rmrr
2556         *   do
2557         *     locate drhd for dev, alloc domain for dev
2558         *     allocate free domain
2559         *     allocate page table entries for rmrr
2560         *     if context not allocated for bus
2561         *           allocate and init context
2562         *           set present in root table for this bus
2563         *     init context with domain, translation etc
2564         *    endfor
2565         * endfor
2566         */
2567        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2568        for_each_rmrr_units(rmrr) {
2569                for (i = 0; i < rmrr->devices_cnt; i++) {
2570                        pdev = rmrr->devices[i];
2571                        /*
2572                         * some BIOSes list non-existent devices in the
2573                         * DMAR table.
2574                         */
2575                        if (!pdev)
2576                                continue;
2577                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2578                        if (ret)
2579                                printk(KERN_ERR
2580                                       "IOMMU: mapping reserved region failed\n");
2581                }
2582        }
2583
2584        iommu_prepare_isa();
2585
2586        /*
2587         * for each drhd
2588         *   enable fault log
2589         *   global invalidate context cache
2590         *   global invalidate iotlb
2591         *   enable translation
2592         */
2593        for_each_drhd_unit(drhd) {
2594                if (drhd->ignored) {
2595                        /*
2596                         * we always have to disable PMRs or DMA may fail on
2597                         * this device
2598                         */
2599                        if (force_on)
2600                                iommu_disable_protect_mem_regions(drhd->iommu);
2601                        continue;
2602                }
2603                iommu = drhd->iommu;
2604
2605                iommu_flush_write_buffer(iommu);
2606
2607                ret = dmar_set_interrupt(iommu);
2608                if (ret)
2609                        goto error;
2610
2611                iommu_set_root_entry(iommu);
2612
2613                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2614                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2615
2616                ret = iommu_enable_translation(iommu);
2617                if (ret)
2618                        goto error;
2619
2620                iommu_disable_protect_mem_regions(iommu);
2621        }
2622
2623        return 0;
2624error:
2625        for_each_drhd_unit(drhd) {
2626                if (drhd->ignored)
2627                        continue;
2628                iommu = drhd->iommu;
2629                free_iommu(iommu);
2630        }
2631        kfree(g_iommus);
2632        return ret;
2633}
2634
2635/* This takes a number of _MM_ pages, not VTD pages */
2636static struct iova *intel_alloc_iova(struct device *dev,
2637                                     struct dmar_domain *domain,
2638                                     unsigned long nrpages, uint64_t dma_mask)
2639{
2640        struct pci_dev *pdev = to_pci_dev(dev);
2641        struct iova *iova = NULL;
2642
2643        /* Restrict dma_mask to the width that the iommu can handle */
2644        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2645
2646        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2647                /*
2648                 * First try to allocate an io virtual address in
2649                 * DMA_BIT_MASK(32) and if that fails then try allocating
2650                 * from higher range
2651                 */
2652                iova = alloc_iova(&domain->iovad, nrpages,
2653                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2654                if (iova)
2655                        return iova;
2656        }
2657        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2658        if (unlikely(!iova)) {
2659                printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2660                       nrpages, pci_name(pdev));
2661                return NULL;
2662        }
2663
2664        return iova;
2665}
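
/*
 * Editor's illustration, not part of the driver: the dma_mask clamp at the
 * top of intel_alloc_iova().  With the default 48-bit guest address width
 * and 4KiB VT-d pages, the domain can address at most ((1 << 36) - 1) pages,
 * so a fully 64-bit device mask is clamped to 0xfffffffff000 while a 32-bit
 * mask is already narrower and stays as-is.  Standalone sketch using the
 * editor's EX_* stand-ins; excluded from the build.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define EX_VTD_PAGE_SHIFT       12
#define EX_DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - EX_VTD_PAGE_SHIFT)) - 1)
#define EX_DOMAIN_MAX_ADDR(gaw) (EX_DOMAIN_MAX_PFN(gaw) << EX_VTD_PAGE_SHIFT)

static uint64_t ex_clamp_dma_mask(uint64_t dma_mask, int gaw)
{
        uint64_t max = EX_DOMAIN_MAX_ADDR(gaw);

        return dma_mask < max ? dma_mask : max;
}

int main(void)
{
        printf("%llx\n",                        /* fffffffff000 */
               (unsigned long long)ex_clamp_dma_mask(~0ULL, 48));
        printf("%llx\n",                        /* ffffffff */
               (unsigned long long)ex_clamp_dma_mask(0xffffffffULL, 48));
        return 0;
}
#endif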
2666
2667static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2668{
2669        struct dmar_domain *domain;
2670        int ret;
2671
2672        domain = get_domain_for_dev(pdev,
2673                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2674        if (!domain) {
2675                printk(KERN_ERR
2676                        "Allocating domain for %s failed\n", pci_name(pdev));
2677                return NULL;
2678        }
2679
2680        /* make sure context mapping is ok */
2681        if (unlikely(!domain_context_mapped(pdev))) {
2682                ret = domain_context_mapping(domain, pdev,
2683                                             CONTEXT_TT_MULTI_LEVEL);
2684                if (ret) {
2685                        printk(KERN_ERR
2686                                "Domain context map for %s failed\n",
2687                                pci_name(pdev));
2688                        return NULL;
2689                }
2690        }
2691
2692        return domain;
2693}
2694
2695static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2696{
2697        struct device_domain_info *info;
2698
2699        /* No lock here, assumes no domain exit in normal case */
2700        info = dev->dev.archdata.iommu;
2701        if (likely(info))
2702                return info->domain;
2703
2704        return __get_valid_domain_for_dev(dev);
2705}
2706
2707static int iommu_dummy(struct pci_dev *pdev)
2708{
2709        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2710}
2711
2712/* Check if the pdev needs to go through non-identity map and unmap process.*/
2713static int iommu_no_mapping(struct device *dev)
2714{
2715        struct pci_dev *pdev;
2716        int found;
2717
2718        if (unlikely(dev->bus != &pci_bus_type))
2719                return 1;
2720
2721        pdev = to_pci_dev(dev);
2722        if (iommu_dummy(pdev))
2723                return 1;
2724
2725        if (!iommu_identity_mapping)
2726                return 0;
2727
2728        found = identity_mapping(pdev);
2729        if (found) {
2730                if (iommu_should_identity_map(pdev, 0))
2731                        return 1;
2732                else {
2733                        /*
2734                         * The 32 bit DMA device is removed from si_domain
2735                         * and falls back to non-identity mapping.
2736                         */
2737                        domain_remove_one_dev_info(si_domain, pdev);
2738                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2739                               pci_name(pdev));
2740                        return 0;
2741                }
2742        } else {
2743                /*
2744                 * If a 64 bit DMA device was detached from a VM, the device
2745                 * is put back into si_domain for identity mapping.
2746                 */
2747                if (iommu_should_identity_map(pdev, 0)) {
2748                        int ret;
2749                        ret = domain_add_dev_info(si_domain, pdev,
2750                                                  hw_pass_through ?
2751                                                  CONTEXT_TT_PASS_THROUGH :
2752                                                  CONTEXT_TT_MULTI_LEVEL);
2753                        if (!ret) {
2754                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2755                                       pci_name(pdev));
2756                                return 1;
2757                        }
2758                }
2759        }
2760
2761        return 0;
2762}
2763
2764static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2765                                     size_t size, int dir, u64 dma_mask)
2766{
2767        struct pci_dev *pdev = to_pci_dev(hwdev);
2768        struct dmar_domain *domain;
2769        phys_addr_t start_paddr;
2770        struct iova *iova;
2771        int prot = 0;
2772        int ret;
2773        struct intel_iommu *iommu;
2774        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2775
2776        BUG_ON(dir == DMA_NONE);
2777
2778        if (iommu_no_mapping(hwdev))
2779                return paddr;
2780
2781        domain = get_valid_domain_for_dev(pdev);
2782        if (!domain)
2783                return 0;
2784
2785        iommu = domain_get_iommu(domain);
2786        size = aligned_nrpages(paddr, size);
2787
2788        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2789        if (!iova)
2790                goto error;
2791
2792        /*
2793         * Check if DMAR supports zero-length reads on write only
2794         * mappings.
2795         */
2796        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2797                        !cap_zlr(iommu->cap))
2798                prot |= DMA_PTE_READ;
2799        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2800                prot |= DMA_PTE_WRITE;
2801        /*
2802         * The range paddr .. (paddr + size) might cover only part of a page,
2803         * so we map the whole page.  Note: if two parts of one page are
2804         * separately mapped, we might have two guest_addr mappings to the same
2805         * host paddr, but this is not a big problem.
2806         */
2807        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2808                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2809        if (ret)
2810                goto error;
2811
2812        /* it's a non-present to present mapping. Only flush if caching mode */
2813        if (cap_caching_mode(iommu->cap))
2814                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2815        else
2816                iommu_flush_write_buffer(iommu);
2817
2818        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2819        start_paddr += paddr & ~PAGE_MASK;
2820        return start_paddr;
2821
2822error:
2823        if (iova)
2824                __free_iova(&domain->iovad, iova);
2825        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2826                pci_name(pdev), size, (unsigned long long)paddr, dir);
2827        return 0;
2828}
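
/*
 * Editor's illustration, not part of the driver: the DMA address returned by
 * __intel_map_single() keeps the caller's sub-page offset.  With 4KiB pages,
 * mapping paddr 0x12345678 into an IOVA whose first pfn is 0xffffe yields
 * 0xffffe000 + 0x678 = 0xffffe678.  Standalone sketch with hypothetical
 * values; excluded from the build.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SHIFT   12
#define EX_PAGE_MASK    (~((1UL << EX_PAGE_SHIFT) - 1))

int main(void)
{
        uint64_t paddr = 0x12345678ULL;
        unsigned long iova_pfn_lo = 0xffffe;
        uint64_t dma_addr = ((uint64_t)iova_pfn_lo << EX_PAGE_SHIFT)
                            + (paddr & ~EX_PAGE_MASK);

        printf("%llx\n", (unsigned long long)dma_addr); /* ffffe678 */
        return 0;
}
#endif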
2829
2830static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2831                                 unsigned long offset, size_t size,
2832                                 enum dma_data_direction dir,
2833                                 struct dma_attrs *attrs)
2834{
2835        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2836                                  dir, to_pci_dev(dev)->dma_mask);
2837}
2838
2839static void flush_unmaps(void)
2840{
2841        int i, j;
2842
2843        timer_on = 0;
2844
2845        /* just flush them all */
2846        for (i = 0; i < g_num_of_iommus; i++) {
2847                struct intel_iommu *iommu = g_iommus[i];
2848                if (!iommu)
2849                        continue;
2850
2851                if (!deferred_flush[i].next)
2852                        continue;
2853
2854                /* In caching mode, global flushes make emulation expensive */
2855                if (!cap_caching_mode(iommu->cap))
2856                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2857                                         DMA_TLB_GLOBAL_FLUSH);
2858                for (j = 0; j < deferred_flush[i].next; j++) {
2859                        unsigned long mask;
2860                        struct iova *iova = deferred_flush[i].iova[j];
2861                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2862
2863                        /* On real hardware multiple invalidations are expensive */
2864                        if (cap_caching_mode(iommu->cap))
2865                                iommu_flush_iotlb_psi(iommu, domain->id,
2866                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2867                        else {
2868                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2869                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2870                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2871                        }
2872                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2873                }
2874                deferred_flush[i].next = 0;
2875        }
2876
2877        list_size = 0;
2878}
2879
2880static void flush_unmaps_timeout(unsigned long data)
2881{
2882        unsigned long flags;
2883
2884        spin_lock_irqsave(&async_umap_flush_lock, flags);
2885        flush_unmaps();
2886        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2887}
2888
2889static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2890{
2891        unsigned long flags;
2892        int next, iommu_id;
2893        struct intel_iommu *iommu;
2894
2895        spin_lock_irqsave(&async_umap_flush_lock, flags);
2896        if (list_size == HIGH_WATER_MARK)
2897                flush_unmaps();
2898
2899        iommu = domain_get_iommu(dom);
2900        iommu_id = iommu->seq_id;
2901
2902        next = deferred_flush[iommu_id].next;
2903        deferred_flush[iommu_id].domain[next] = dom;
2904        deferred_flush[iommu_id].iova[next] = iova;
2905        deferred_flush[iommu_id].next++;
2906
2907        if (!timer_on) {
2908                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2909                timer_on = 1;
2910        }
2911        list_size++;
2912        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2913}
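
/*
 * Editor's sketch, not part of the driver: the deferred-unmap scheme above in
 * miniature.  Unmapped IOVAs are queued per IOMMU and the queue is drained
 * either when it hits a high-water mark or when the 10ms timer fires, so one
 * IOTLB flush covers many individual unmaps.  Toy model with a made-up limit;
 * excluded from the build.
 */
#if 0
#include <stdio.h>

#define EX_HIGH_WATER_MARK 250  /* stand-in for the driver's HIGH_WATER_MARK */

static int ex_pending;

static void ex_flush_all(void)
{
        /* one global invalidation instead of ex_pending individual ones */
        printf("flushing %d deferred unmaps in one go\n", ex_pending);
        ex_pending = 0;
}

static void ex_add_unmap(void)
{
        if (ex_pending == EX_HIGH_WATER_MARK)
                ex_flush_all();
        ex_pending++;           /* the real code also arms a 10ms timer */
}

int main(void)
{
        int i;

        for (i = 0; i < 600; i++)
                ex_add_unmap();
        ex_flush_all();         /* stands in for the timer firing */
        return 0;
}
#endif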
2914
2915static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2916                             size_t size, enum dma_data_direction dir,
2917                             struct dma_attrs *attrs)
2918{
2919        struct pci_dev *pdev = to_pci_dev(dev);
2920        struct dmar_domain *domain;
2921        unsigned long start_pfn, last_pfn;
2922        struct iova *iova;
2923        struct intel_iommu *iommu;
2924
2925        if (iommu_no_mapping(dev))
2926                return;
2927
2928        domain = find_domain(pdev);
2929        BUG_ON(!domain);
2930
2931        iommu = domain_get_iommu(domain);
2932
2933        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2934        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2935                      (unsigned long long)dev_addr))
2936                return;
2937
2938        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2939        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2940
2941        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2942                 pci_name(pdev), start_pfn, last_pfn);
2943
2944        /*  clear the whole page */
2945        dma_pte_clear_range(domain, start_pfn, last_pfn);
2946
2947        /* free page tables */
2948        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2949
2950        if (intel_iommu_strict) {
2951                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2952                                      last_pfn - start_pfn + 1, 0);
2953                /* free iova */
2954                __free_iova(&domain->iovad, iova);
2955        } else {
2956                add_unmap(domain, iova);
2957                /*
2958                 * queue up the release of the unmap to save the roughly 1/6
2959                 * of the cpu time used up by the iotlb flush operation...
2960                 */
2961        }
2962}
2963
2964static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2965                                  dma_addr_t *dma_handle, gfp_t flags,
2966                                  struct dma_attrs *attrs)
2967{
2968        void *vaddr;
2969        int order;
2970
2971        size = PAGE_ALIGN(size);
2972        order = get_order(size);
2973
2974        if (!iommu_no_mapping(hwdev))
2975                flags &= ~(GFP_DMA | GFP_DMA32);
2976        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2977                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2978                        flags |= GFP_DMA;
2979                else
2980                        flags |= GFP_DMA32;
2981        }
2982
2983        vaddr = (void *)__get_free_pages(flags, order);
2984        if (!vaddr)
2985                return NULL;
2986        memset(vaddr, 0, size);
2987
2988        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2989                                         DMA_BIDIRECTIONAL,
2990                                         hwdev->coherent_dma_mask);
2991        if (*dma_handle)
2992                return vaddr;
2993        free_pages((unsigned long)vaddr, order);
2994        return NULL;
2995}
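
/*
 * Editor's illustration, not part of the driver: the size rounding above.  A
 * 5000-byte coherent allocation is page-aligned to 8192 bytes, i.e. two 4KiB
 * pages, which is order 1.  Standalone sketch assuming 4KiB pages; excluded
 * from the build.
 */
#if 0
#include <stdio.h>

#define EX_PAGE_SHIFT           12
#define EX_PAGE_SIZE            (1UL << EX_PAGE_SHIFT)
#define EX_PAGE_ALIGN(x)        (((x) + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1))

static int ex_get_order(unsigned long size)
{
        int order = 0;

        size = (size - 1) >> EX_PAGE_SHIFT;
        while (size) {
                order++;
                size >>= 1;
        }
        return order;
}

int main(void)
{
        unsigned long size = EX_PAGE_ALIGN(5000);

        printf("%lu bytes, order %d\n", size, ex_get_order(size)); /* 8192, 1 */
        return 0;
}
#endif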
2996
2997static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2998                                dma_addr_t dma_handle, struct dma_attrs *attrs)
2999{
3000        int order;
3001
3002        size = PAGE_ALIGN(size);
3003        order = get_order(size);
3004
3005        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3006        free_pages((unsigned long)vaddr, order);
3007}
3008
3009static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3010                           int nelems, enum dma_data_direction dir,
3011                           struct dma_attrs *attrs)
3012{
3013        struct pci_dev *pdev = to_pci_dev(hwdev);
3014        struct dmar_domain *domain;
3015        unsigned long start_pfn, last_pfn;
3016        struct iova *iova;
3017        struct intel_iommu *iommu;
3018
3019        if (iommu_no_mapping(hwdev))
3020                return;
3021
3022        domain = find_domain(pdev);
3023        BUG_ON(!domain);
3024
3025        iommu = domain_get_iommu(domain);
3026
3027        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3028        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3029                      (unsigned long long)sglist[0].dma_address))
3030                return;
3031
3032        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3033        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3034
3035        /* clear the PTEs for the whole range */
3036        dma_pte_clear_range(domain, start_pfn, last_pfn);
3037
3038        /* free page tables */
3039        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3040
3041        if (intel_iommu_strict) {
3042                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3043                                      last_pfn - start_pfn + 1, 0);
3044                /* free iova */
3045                __free_iova(&domain->iovad, iova);
3046        } else {
3047                add_unmap(domain, iova);
3048                /*
3049                 * Queue up the release of this unmap to save the roughly
3050                 * 1/6th of CPU time otherwise spent on the iotlb flush.
3051                 */
3052        }
3053}
3054
3055static int intel_nontranslate_map_sg(struct device *hwdev,
3056        struct scatterlist *sglist, int nelems, int dir)
3057{
3058        int i;
3059        struct scatterlist *sg;
3060
3061        for_each_sg(sglist, sg, nelems, i) {
3062                BUG_ON(!sg_page(sg));
3063                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3064                sg->dma_length = sg->length;
3065        }
3066        return nelems;
3067}
3068
3069static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3070                        enum dma_data_direction dir, struct dma_attrs *attrs)
3071{
3072        int i;
3073        struct pci_dev *pdev = to_pci_dev(hwdev);
3074        struct dmar_domain *domain;
3075        size_t size = 0;
3076        int prot = 0;
3077        struct iova *iova = NULL;
3078        int ret;
3079        struct scatterlist *sg;
3080        unsigned long start_vpfn;
3081        struct intel_iommu *iommu;
3082
3083        BUG_ON(dir == DMA_NONE);
3084        if (iommu_no_mapping(hwdev))
3085                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3086
3087        domain = get_valid_domain_for_dev(pdev);
3088        if (!domain)
3089                return 0;
3090
3091        iommu = domain_get_iommu(domain);
3092
3093        for_each_sg(sglist, sg, nelems, i)
3094                size += aligned_nrpages(sg->offset, sg->length);
3095
3096        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3097                                pdev->dma_mask);
3098        if (!iova) {
3099                sglist->dma_length = 0;
3100                return 0;
3101        }
3102
3103        /*
3104         * Check if DMAR supports zero-length reads on write-only
3105         * mappings.
3106         */
3107        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3108                        !cap_zlr(iommu->cap))
3109                prot |= DMA_PTE_READ;
3110        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3111                prot |= DMA_PTE_WRITE;
3112
3113        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3114
3115        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3116        if (unlikely(ret)) {
3117                /* clear any PTEs we mapped */
3118                dma_pte_clear_range(domain, start_vpfn,
3119                                    start_vpfn + size - 1);
3120                /* free page tables */
3121                dma_pte_free_pagetable(domain, start_vpfn,
3122                                       start_vpfn + size - 1);
3123                /* free iova */
3124                __free_iova(&domain->iovad, iova);
3125                return 0;
3126        }
3127
3128        /* it's a non-present to present mapping. Only flush if caching mode */
3129        if (cap_caching_mode(iommu->cap))
3130                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3131        else
3132                iommu_flush_write_buffer(iommu);
3133
3134        return nelems;
3135}
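
/*
 * Illustrative sketch (not part of this driver): intel_map_sg() above backs
 * dma_map_sg().  Assuming a driver has already built 'sglist' with
 * sg_init_table()/sg_set_page(), the usage is roughly:
 *
 *        struct scatterlist *sg;
 *        int i, mapped;
 *
 *        mapped = dma_map_sg(&pdev->dev, sglist, nents, DMA_FROM_DEVICE);
 *        if (!mapped)
 *                return -ENOMEM;
 *        for_each_sg(sglist, sg, mapped, i)
 *                setup_hw_desc(sg_dma_address(sg), sg_dma_len(sg));
 *        ...
 *        dma_unmap_sg(&pdev->dev, sglist, nents, DMA_FROM_DEVICE);
 *
 * setup_hw_desc() stands in for whatever device-specific descriptor setup
 * the driver does; 'pdev', 'sglist' and 'nents' are assumed to exist.
 */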
3136
3137static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3138{
3139        return !dma_addr;
3140}
3141
3142struct dma_map_ops intel_dma_ops = {
3143        .alloc = intel_alloc_coherent,
3144        .free = intel_free_coherent,
3145        .map_sg = intel_map_sg,
3146        .unmap_sg = intel_unmap_sg,
3147        .map_page = intel_map_page,
3148        .unmap_page = intel_unmap_page,
3149        .mapping_error = intel_mapping_error,
3150};
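
/*
 * Illustrative sketch (not part of this driver): once intel_iommu_init()
 * points the global 'dma_ops' at intel_dma_ops, the generic x86 DMA API
 * dispatches through this table.  Roughly, dma_map_single() ends up doing
 *
 *        struct dma_map_ops *ops = get_dma_ops(dev);
 *        dma_addr_t addr;
 *
 *        addr = ops->map_page(dev, virt_to_page(ptr), offset_in_page(ptr),
 *                             size, dir, NULL);
 *
 * and a later dma_mapping_error(dev, addr) lands on .mapping_error above,
 * so ordinary drivers get VT-d translation without any IOMMU-specific code.
 */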
3151
3152static inline int iommu_domain_cache_init(void)
3153{
3154        int ret = 0;
3155
3156        iommu_domain_cache = kmem_cache_create("iommu_domain",
3157                                         sizeof(struct dmar_domain),
3158                                         0,
3159                                         SLAB_HWCACHE_ALIGN,
3160                                         NULL);
3161
3162        if (!iommu_domain_cache) {
3163                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3164                ret = -ENOMEM;
3165        }
3166
3167        return ret;
3168}
3169
3170static inline int iommu_devinfo_cache_init(void)
3171{
3172        int ret = 0;
3173
3174        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3175                                         sizeof(struct device_domain_info),
3176                                         0,
3177                                         SLAB_HWCACHE_ALIGN,
3178                                         NULL);
3179        if (!iommu_devinfo_cache) {
3180                printk(KERN_ERR "Couldn't create devinfo cache\n");
3181                ret = -ENOMEM;
3182        }
3183
3184        return ret;
3185}
3186
3187static inline int iommu_iova_cache_init(void)
3188{
3189        int ret = 0;
3190
3191        iommu_iova_cache = kmem_cache_create("iommu_iova",
3192                                         sizeof(struct iova),
3193                                         0,
3194                                         SLAB_HWCACHE_ALIGN,
3195                                         NULL);
3196        if (!iommu_iova_cache) {
3197                printk(KERN_ERR "Couldn't create iova cache\n");
3198                ret = -ENOMEM;
3199        }
3200
3201        return ret;
3202}
3203
3204static int __init iommu_init_mempool(void)
3205{
3206        int ret;
3207        ret = iommu_iova_cache_init();
3208        if (ret)
3209                return ret;
3210
3211        ret = iommu_domain_cache_init();
3212        if (ret)
3213                goto domain_error;
3214
3215        ret = iommu_devinfo_cache_init();
3216        if (!ret)
3217                return ret;
3218
3219        kmem_cache_destroy(iommu_domain_cache);
3220domain_error:
3221        kmem_cache_destroy(iommu_iova_cache);
3222
3223        return -ENOMEM;
3224}
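
/*
 * Illustrative sketch (not part of this function): the three caches created
 * above back this file's alloc_domain_mem()/alloc_devinfo_mem()/
 * alloc_iova_mem() helpers, which are roughly thin wrappers such as
 *
 *        struct dmar_domain *domain;
 *
 *        domain = kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 *        ...
 *        kmem_cache_free(iommu_domain_cache, domain);
 */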
3225
3226static void __init iommu_exit_mempool(void)
3227{
3228        kmem_cache_destroy(iommu_devinfo_cache);
3229        kmem_cache_destroy(iommu_domain_cache);
3230        kmem_cache_destroy(iommu_iova_cache);
3231
3232}
3233
3234static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3235{
3236        struct dmar_drhd_unit *drhd;
3237        u32 vtbar;
3238        int rc;
3239
3240        /* We know that this device on this chipset has its own IOMMU.
3241         * If we find it under a different IOMMU, then the BIOS is lying
3242         * to us. Hope that the IOMMU for this device is actually
3243         * disabled, and it needs no translation...
3244         */
3245        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3246        if (rc) {
3247                /* "can't" happen */
3248                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3249                return;
3250        }
3251        vtbar &= 0xffff0000;
3252
3253        /* we know that this iommu should be at offset 0xa000 from vtbar */
3254        drhd = dmar_find_matched_drhd_unit(pdev);
3255        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3256                            TAINT_FIRMWARE_WORKAROUND,
3257                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3258                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3259}
3260DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3261
3262static void __init init_no_remapping_devices(void)
3263{
3264        struct dmar_drhd_unit *drhd;
3265
3266        for_each_drhd_unit(drhd) {
3267                if (!drhd->include_all) {
3268                        int i;
3269                        for (i = 0; i < drhd->devices_cnt; i++)
3270                                if (drhd->devices[i] != NULL)
3271                                        break;
3272                        /* ignore DMAR unit if no pci devices exist */
3273                        if (i == drhd->devices_cnt)
3274                                drhd->ignored = 1;
3275                }
3276        }
3277
3278        for_each_drhd_unit(drhd) {
3279                int i;
3280                if (drhd->ignored || drhd->include_all)
3281                        continue;
3282
3283                for (i = 0; i < drhd->devices_cnt; i++)
3284                        if (drhd->devices[i] &&
3285                            !IS_GFX_DEVICE(drhd->devices[i]))
3286                                break;
3287
3288                if (i < drhd->devices_cnt)
3289                        continue;
3290
3291                /* This IOMMU has *only* gfx devices. Either bypass it or
3292                   set the gfx_mapped flag, as appropriate */
3293                if (dmar_map_gfx) {
3294                        intel_iommu_gfx_mapped = 1;
3295                } else {
3296                        drhd->ignored = 1;
3297                        for (i = 0; i < drhd->devices_cnt; i++) {
3298                                if (!drhd->devices[i])
3299                                        continue;
3300                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3301                        }
3302                }
3303        }
3304}
3305
3306#ifdef CONFIG_SUSPEND
3307static int init_iommu_hw(void)
3308{
3309        struct dmar_drhd_unit *drhd;
3310        struct intel_iommu *iommu = NULL;
3311
3312        for_each_active_iommu(iommu, drhd)
3313                if (iommu->qi)
3314                        dmar_reenable_qi(iommu);
3315
3316        for_each_iommu(iommu, drhd) {
3317                if (drhd->ignored) {
3318                        /*
3319                         * we always have to disable PMRs or DMA may fail on
3320                         * this device
3321                         */
3322                        if (force_on)
3323                                iommu_disable_protect_mem_regions(iommu);
3324                        continue;
3325                }
3326
3327                iommu_flush_write_buffer(iommu);
3328
3329                iommu_set_root_entry(iommu);
3330
3331                iommu->flush.flush_context(iommu, 0, 0, 0,
3332                                           DMA_CCMD_GLOBAL_INVL);
3333                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3334                                         DMA_TLB_GLOBAL_FLUSH);
3335                if (iommu_enable_translation(iommu))
3336                        return 1;
3337                iommu_disable_protect_mem_regions(iommu);
3338        }
3339
3340        return 0;
3341}
3342
3343static void iommu_flush_all(void)
3344{
3345        struct dmar_drhd_unit *drhd;
3346        struct intel_iommu *iommu;
3347
3348        for_each_active_iommu(iommu, drhd) {
3349                iommu->flush.flush_context(iommu, 0, 0, 0,
3350                                           DMA_CCMD_GLOBAL_INVL);
3351                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3352                                         DMA_TLB_GLOBAL_FLUSH);
3353        }
3354}
3355
3356static int iommu_suspend(void)
3357{
3358        struct dmar_drhd_unit *drhd;
3359        struct intel_iommu *iommu = NULL;
3360        unsigned long flag;
3361
3362        for_each_active_iommu(iommu, drhd) {
3363                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3364                                                 GFP_ATOMIC);
3365                if (!iommu->iommu_state)
3366                        goto nomem;
3367        }
3368
3369        iommu_flush_all();
3370
3371        for_each_active_iommu(iommu, drhd) {
3372                iommu_disable_translation(iommu);
3373
3374                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3375
3376                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3377                        readl(iommu->reg + DMAR_FECTL_REG);
3378                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3379                        readl(iommu->reg + DMAR_FEDATA_REG);
3380                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3381                        readl(iommu->reg + DMAR_FEADDR_REG);
3382                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3383                        readl(iommu->reg + DMAR_FEUADDR_REG);
3384
3385                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3386        }
3387        return 0;
3388
3389nomem:
3390        for_each_active_iommu(iommu, drhd)
3391                kfree(iommu->iommu_state);
3392
3393        return -ENOMEM;
3394}
3395
3396static void iommu_resume(void)
3397{
3398        struct dmar_drhd_unit *drhd;
3399        struct intel_iommu *iommu = NULL;
3400        unsigned long flag;
3401
3402        if (init_iommu_hw()) {
3403                if (force_on)
3404                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3405                else
3406                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3407                return;
3408        }
3409
3410        for_each_active_iommu(iommu, drhd) {
3411
3412                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3413
3414                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3415                        iommu->reg + DMAR_FECTL_REG);
3416                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3417                        iommu->reg + DMAR_FEDATA_REG);
3418                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3419                        iommu->reg + DMAR_FEADDR_REG);
3420                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3421                        iommu->reg + DMAR_FEUADDR_REG);
3422
3423                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3424        }
3425
3426        for_each_active_iommu(iommu, drhd)
3427                kfree(iommu->iommu_state);
3428}
3429
3430static struct syscore_ops iommu_syscore_ops = {
3431        .resume         = iommu_resume,
3432        .suspend        = iommu_suspend,
3433};
3434
3435static void __init init_iommu_pm_ops(void)
3436{
3437        register_syscore_ops(&iommu_syscore_ops);
3438}
3439
3440#else
3441static inline void init_iommu_pm_ops(void) {}
3442#endif  /* CONFIG_SUSPEND */
3443
3444LIST_HEAD(dmar_rmrr_units);
3445
3446static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3447{
3448        list_add(&rmrr->list, &dmar_rmrr_units);
3449}
3450
3451
3452int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3453{
3454        struct acpi_dmar_reserved_memory *rmrr;
3455        struct dmar_rmrr_unit *rmrru;
3456
3457        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3458        if (!rmrru)
3459                return -ENOMEM;
3460
3461        rmrru->hdr = header;
3462        rmrr = (struct acpi_dmar_reserved_memory *)header;
3463        rmrru->base_address = rmrr->base_address;
3464        rmrru->end_address = rmrr->end_address;
3465
3466        dmar_register_rmrr_unit(rmrru);
3467        return 0;
3468}
3469
3470static int __init
3471rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3472{
3473        struct acpi_dmar_reserved_memory *rmrr;
3474        int ret;
3475
3476        rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3477        ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3478                ((void *)rmrr) + rmrr->header.length,
3479                &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3480
3481        if (ret || (rmrru->devices_cnt == 0)) {
3482                list_del(&rmrru->list);
3483                kfree(rmrru);
3484        }
3485        return ret;
3486}
3487
3488static LIST_HEAD(dmar_atsr_units);
3489
3490int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3491{
3492        struct acpi_dmar_atsr *atsr;
3493        struct dmar_atsr_unit *atsru;
3494
3495        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3496        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3497        if (!atsru)
3498                return -ENOMEM;
3499
3500        atsru->hdr = hdr;
3501        atsru->include_all = atsr->flags & 0x1;
3502
3503        list_add(&atsru->list, &dmar_atsr_units);
3504
3505        return 0;
3506}
3507
3508static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3509{
3510        int rc;
3511        struct acpi_dmar_atsr *atsr;
3512
3513        if (atsru->include_all)
3514                return 0;
3515
3516        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3517        rc = dmar_parse_dev_scope((void *)(atsr + 1),
3518                                (void *)atsr + atsr->header.length,
3519                                &atsru->devices_cnt, &atsru->devices,
3520                                atsr->segment);
3521        if (rc || !atsru->devices_cnt) {
3522                list_del(&atsru->list);
3523                kfree(atsru);
3524        }
3525
3526        return rc;
3527}
3528
3529int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3530{
3531        int i;
3532        struct pci_bus *bus;
3533        struct acpi_dmar_atsr *atsr;
3534        struct dmar_atsr_unit *atsru;
3535
3536        dev = pci_physfn(dev);
3537
3538        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3539                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3540                if (atsr->segment == pci_domain_nr(dev->bus))
3541                        goto found;
3542        }
3543
3544        return 0;
3545
3546found:
3547        for (bus = dev->bus; bus; bus = bus->parent) {
3548                struct pci_dev *bridge = bus->self;
3549
3550                if (!bridge || !pci_is_pcie(bridge) ||
3551                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3552                        return 0;
3553
3554                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3555                        for (i = 0; i < atsru->devices_cnt; i++)
3556                                if (atsru->devices[i] == bridge)
3557                                        return 1;
3558                        break;
3559                }
3560        }
3561
3562        if (atsru->include_all)
3563                return 1;
3564
3565        return 0;
3566}
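
/*
 * Illustrative sketch (not part of this function): a caller deciding whether
 * a device may use Address Translation Services would typically combine this
 * check with the PCIe ATS capability, roughly:
 *
 *        if (dmar_find_matched_atsr_unit(pdev) &&
 *            pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
 *                pci_enable_ats(pdev, VTD_PAGE_SHIFT);
 */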
3567
3568int __init dmar_parse_rmrr_atsr_dev(void)
3569{
3570        struct dmar_rmrr_unit *rmrr, *rmrr_n;
3571        struct dmar_atsr_unit *atsr, *atsr_n;
3572        int ret = 0;
3573
3574        list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3575                ret = rmrr_parse_dev(rmrr);
3576                if (ret)
3577                        return ret;
3578        }
3579
3580        list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3581                ret = atsr_parse_dev(atsr);
3582                if (ret)
3583                        return ret;
3584        }
3585
3586        return ret;
3587}
3588
3589/*
3590 * Here we only respond to the driver-unbind action.
3591 *
3592 * A newly added device is not attached to its DMAR domain here yet; that
3593 * happens when the device is first mapped to an iova.
3594 */
3595static int device_notifier(struct notifier_block *nb,
3596                                  unsigned long action, void *data)
3597{
3598        struct device *dev = data;
3599        struct pci_dev *pdev = to_pci_dev(dev);
3600        struct dmar_domain *domain;
3601
3602        if (iommu_no_mapping(dev))
3603                return 0;
3604
3605        domain = find_domain(pdev);
3606        if (!domain)
3607                return 0;
3608
3609        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3610                domain_remove_one_dev_info(domain, pdev);
3611
3612                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3613                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3614                    list_empty(&domain->devices))
3615                        domain_exit(domain);
3616        }
3617
3618        return 0;
3619}
3620
3621static struct notifier_block device_nb = {
3622        .notifier_call = device_notifier,
3623};
3624
3625int __init intel_iommu_init(void)
3626{
3627        int ret = 0;
3628
3629        /* VT-d is required for a TXT/tboot launch, so enforce that */
3630        force_on = tboot_force_iommu();
3631
3632        if (dmar_table_init()) {
3633                if (force_on)
3634                        panic("tboot: Failed to initialize DMAR table\n");
3635                return  -ENODEV;
3636        }
3637
3638        if (dmar_dev_scope_init() < 0) {
3639                if (force_on)
3640                        panic("tboot: Failed to initialize DMAR device scope\n");
3641                return  -ENODEV;
3642        }
3643
3644        if (no_iommu || dmar_disabled)
3645                return -ENODEV;
3646
3647        if (iommu_init_mempool()) {
3648                if (force_on)
3649                        panic("tboot: Failed to initialize iommu memory\n");
3650                return  -ENODEV;
3651        }
3652
3653        if (list_empty(&dmar_rmrr_units))
3654                printk(KERN_INFO "DMAR: No RMRR found\n");
3655
3656        if (list_empty(&dmar_atsr_units))
3657                printk(KERN_INFO "DMAR: No ATSR found\n");
3658
3659        if (dmar_init_reserved_ranges()) {
3660                if (force_on)
3661                        panic("tboot: Failed to reserve iommu ranges\n");
3662                return  -ENODEV;
3663        }
3664
3665        init_no_remapping_devices();
3666
3667        ret = init_dmars();
3668        if (ret) {
3669                if (force_on)
3670                        panic("tboot: Failed to initialize DMARs\n");
3671                printk(KERN_ERR "IOMMU: dmar init failed\n");
3672                put_iova_domain(&reserved_iova_list);
3673                iommu_exit_mempool();
3674                return ret;
3675        }
3676        printk(KERN_INFO
3677        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3678
3679        init_timer(&unmap_timer);
3680#ifdef CONFIG_SWIOTLB
3681        swiotlb = 0;
3682#endif
3683        dma_ops = &intel_dma_ops;
3684
3685        init_iommu_pm_ops();
3686
3687        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3688
3689        bus_register_notifier(&pci_bus_type, &device_nb);
3690
3691        intel_iommu_enabled = 1;
3692
3693        return 0;
3694}
3695
3696static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3697                                           struct pci_dev *pdev)
3698{
3699        struct pci_dev *tmp, *parent;
3700
3701        if (!iommu || !pdev)
3702                return;
3703
3704        /* dependent device detach */
3705        tmp = pci_find_upstream_pcie_bridge(pdev);
3706        /* Secondary interface's bus number and devfn 0 */
3707        if (tmp) {
3708                parent = pdev->bus->self;
3709                while (parent != tmp) {
3710                        iommu_detach_dev(iommu, parent->bus->number,
3711                                         parent->devfn);
3712                        parent = parent->bus->self;
3713                }
3714                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3715                        iommu_detach_dev(iommu,
3716                                tmp->subordinate->number, 0);
3717                else /* this is a legacy PCI bridge */
3718                        iommu_detach_dev(iommu, tmp->bus->number,
3719                                         tmp->devfn);
3720        }
3721}
3722
3723static void domain_remove_one_dev_info(struct dmar_domain *domain,
3724                                          struct pci_dev *pdev)
3725{
3726        struct device_domain_info *info;
3727        struct intel_iommu *iommu;
3728        unsigned long flags;
3729        int found = 0;
3730        struct list_head *entry, *tmp;
3731
3732        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3733                                pdev->devfn);
3734        if (!iommu)
3735                return;
3736
3737        spin_lock_irqsave(&device_domain_lock, flags);
3738        list_for_each_safe(entry, tmp, &domain->devices) {
3739                info = list_entry(entry, struct device_domain_info, link);
3740                if (info->segment == pci_domain_nr(pdev->bus) &&
3741                    info->bus == pdev->bus->number &&
3742                    info->devfn == pdev->devfn) {
3743                        unlink_domain_info(info);
3744                        spin_unlock_irqrestore(&device_domain_lock, flags);
3745
3746                        iommu_disable_dev_iotlb(info);
3747                        iommu_detach_dev(iommu, info->bus, info->devfn);
3748                        iommu_detach_dependent_devices(iommu, pdev);
3749                        free_devinfo_mem(info);
3750
3751                        spin_lock_irqsave(&device_domain_lock, flags);
3752
3753                        if (found)
3754                                break;
3755                        else
3756                                continue;
3757                }
3758
3759                /* if there are no other devices under the same iommu
3760                 * owned by this domain, clear this iommu in iommu_bmp,
3761                 * update the iommu count and coherency
3762                 */
3763                if (iommu == device_to_iommu(info->segment, info->bus,
3764                                            info->devfn))
3765                        found = 1;
3766        }
3767
3768        spin_unlock_irqrestore(&device_domain_lock, flags);
3769
3770        if (found == 0) {
3771                unsigned long tmp_flags;
3772                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3773                clear_bit(iommu->seq_id, domain->iommu_bmp);
3774                domain->iommu_count--;
3775                domain_update_iommu_cap(domain);
3776                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3777
3778                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3779                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3780                        spin_lock_irqsave(&iommu->lock, tmp_flags);
3781                        clear_bit(domain->id, iommu->domain_ids);
3782                        iommu->domains[domain->id] = NULL;
3783                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3784                }
3785        }
3786}
3787
3788static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3789{
3790        struct device_domain_info *info;
3791        struct intel_iommu *iommu;
3792        unsigned long flags1, flags2;
3793
3794        spin_lock_irqsave(&device_domain_lock, flags1);
3795        while (!list_empty(&domain->devices)) {
3796                info = list_entry(domain->devices.next,
3797                        struct device_domain_info, link);
3798                unlink_domain_info(info);
3799                spin_unlock_irqrestore(&device_domain_lock, flags1);
3800
3801                iommu_disable_dev_iotlb(info);
3802                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3803                iommu_detach_dev(iommu, info->bus, info->devfn);
3804                iommu_detach_dependent_devices(iommu, info->dev);
3805
3806                /* clear this iommu in iommu_bmp, update iommu count
3807                 * and capabilities
3808                 */
3809                spin_lock_irqsave(&domain->iommu_lock, flags2);
3810                if (test_and_clear_bit(iommu->seq_id,
3811                                       domain->iommu_bmp)) {
3812                        domain->iommu_count--;
3813                        domain_update_iommu_cap(domain);
3814                }
3815                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3816
3817                free_devinfo_mem(info);
3818                spin_lock_irqsave(&device_domain_lock, flags1);
3819        }
3820        spin_unlock_irqrestore(&device_domain_lock, flags1);
3821}
3822
3823/* domain id for virtual machine; it won't be set in the context entry */
3824static unsigned long vm_domid;
3825
3826static struct dmar_domain *iommu_alloc_vm_domain(void)
3827{
3828        struct dmar_domain *domain;
3829
3830        domain = alloc_domain_mem();
3831        if (!domain)
3832                return NULL;
3833
3834        domain->id = vm_domid++;
3835        domain->nid = -1;
3836        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3837        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3838
3839        return domain;
3840}
3841
3842static int md_domain_init(struct dmar_domain *domain, int guest_width)
3843{
3844        int adjust_width;
3845
3846        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3847        spin_lock_init(&domain->iommu_lock);
3848
3849        domain_reserve_special_ranges(domain);
3850
3851        /* calculate AGAW */
3852        domain->gaw = guest_width;
3853        adjust_width = guestwidth_to_adjustwidth(guest_width);
3854        domain->agaw = width_to_agaw(adjust_width);
3855
3856        INIT_LIST_HEAD(&domain->devices);
3857
3858        domain->iommu_count = 0;
3859        domain->iommu_coherency = 0;
3860        domain->iommu_snooping = 0;
3861        domain->iommu_superpage = 0;
3862        domain->max_addr = 0;
3863        domain->nid = -1;
3864
3865        /* always allocate the top pgd */
3866        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3867        if (!domain->pgd)
3868                return -ENOMEM;
3869        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3870        return 0;
3871}
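
/*
 * Worked example (illustrative): with the default guest width of 48 bits,
 * guestwidth_to_adjustwidth(48) returns 48 (the width above the 12-bit page
 * offset is already a whole number of 9-bit levels), width_to_agaw(48)
 * gives agaw 2, and that corresponds to a 4-level page table.
 */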
3872
3873static void iommu_free_vm_domain(struct dmar_domain *domain)
3874{
3875        unsigned long flags;
3876        struct dmar_drhd_unit *drhd;
3877        struct intel_iommu *iommu;
3878        unsigned long i;
3879        unsigned long ndomains;
3880
3881        for_each_drhd_unit(drhd) {
3882                if (drhd->ignored)
3883                        continue;
3884                iommu = drhd->iommu;
3885
3886                ndomains = cap_ndoms(iommu->cap);
3887                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3888                        if (iommu->domains[i] == domain) {
3889                                spin_lock_irqsave(&iommu->lock, flags);
3890                                clear_bit(i, iommu->domain_ids);
3891                                iommu->domains[i] = NULL;
3892                                spin_unlock_irqrestore(&iommu->lock, flags);
3893                                break;
3894                        }
3895                }
3896        }
3897}
3898
3899static void vm_domain_exit(struct dmar_domain *domain)
3900{
3901        /* Domain 0 is reserved, so don't process it */
3902        if (!domain)
3903                return;
3904
3905        vm_domain_remove_all_dev_info(domain);
3906        /* destroy iovas */
3907        put_iova_domain(&domain->iovad);
3908
3909        /* clear ptes */
3910        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3911
3912        /* free page tables */
3913        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3914
3915        iommu_free_vm_domain(domain);
3916        free_domain_mem(domain);
3917}
3918
3919static int intel_iommu_domain_init(struct iommu_domain *domain)
3920{
3921        struct dmar_domain *dmar_domain;
3922
3923        dmar_domain = iommu_alloc_vm_domain();
3924        if (!dmar_domain) {
3925                printk(KERN_ERR
3926                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3927                return -ENOMEM;
3928        }
3929        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3930                printk(KERN_ERR
3931                        "intel_iommu_domain_init() failed\n");
3932                vm_domain_exit(dmar_domain);
3933                return -ENOMEM;
3934        }
3935        domain_update_iommu_cap(dmar_domain);
3936        domain->priv = dmar_domain;
3937
3938        domain->geometry.aperture_start = 0;
3939        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3940        domain->geometry.force_aperture = true;
3941
3942        return 0;
3943}
3944
3945static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3946{
3947        struct dmar_domain *dmar_domain = domain->priv;
3948
3949        domain->priv = NULL;
3950        vm_domain_exit(dmar_domain);
3951}
3952
3953static int intel_iommu_attach_device(struct iommu_domain *domain,
3954                                     struct device *dev)
3955{
3956        struct dmar_domain *dmar_domain = domain->priv;
3957        struct pci_dev *pdev = to_pci_dev(dev);
3958        struct intel_iommu *iommu;
3959        int addr_width;
3960
3961        /* normally pdev is not mapped */
3962        if (unlikely(domain_context_mapped(pdev))) {
3963                struct dmar_domain *old_domain;
3964
3965                old_domain = find_domain(pdev);
3966                if (old_domain) {
3967                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3968                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3969                                domain_remove_one_dev_info(old_domain, pdev);
3970                        else
3971                                domain_remove_dev_info(old_domain);
3972                }
3973        }
3974
3975        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3976                                pdev->devfn);
3977        if (!iommu)
3978                return -ENODEV;
3979
3980        /* check if this iommu agaw is sufficient for max mapped address */
3981        addr_width = agaw_to_width(iommu->agaw);
3982        if (addr_width > cap_mgaw(iommu->cap))
3983                addr_width = cap_mgaw(iommu->cap);
3984
3985        if (dmar_domain->max_addr > (1LL << addr_width)) {
3986                printk(KERN_ERR "%s: iommu width (%d) is not "
3987                       "sufficient for the mapped address (%llx)\n",
3988                       __func__, addr_width, dmar_domain->max_addr);
3989                return -EFAULT;
3990        }
3991        dmar_domain->gaw = addr_width;
3992
3993        /*
3994         * Knock out extra levels of page tables if necessary
3995         */
3996        while (iommu->agaw < dmar_domain->agaw) {
3997                struct dma_pte *pte;
3998
3999                pte = dmar_domain->pgd;
4000                if (dma_pte_present(pte)) {
4001                        dmar_domain->pgd = (struct dma_pte *)
4002                                phys_to_virt(dma_pte_addr(pte));
4003                        free_pgtable_page(pte);
4004                }
4005                dmar_domain->agaw--;
4006        }
4007
4008        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4009}
4010
4011static void intel_iommu_detach_device(struct iommu_domain *domain,
4012                                      struct device *dev)
4013{
4014        struct dmar_domain *dmar_domain = domain->priv;
4015        struct pci_dev *pdev = to_pci_dev(dev);
4016
4017        domain_remove_one_dev_info(dmar_domain, pdev);
4018}
4019
4020static int intel_iommu_map(struct iommu_domain *domain,
4021                           unsigned long iova, phys_addr_t hpa,
4022                           size_t size, int iommu_prot)
4023{
4024        struct dmar_domain *dmar_domain = domain->priv;
4025        u64 max_addr;
4026        int prot = 0;
4027        int ret;
4028
4029        if (iommu_prot & IOMMU_READ)
4030                prot |= DMA_PTE_READ;
4031        if (iommu_prot & IOMMU_WRITE)
4032                prot |= DMA_PTE_WRITE;
4033        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4034                prot |= DMA_PTE_SNP;
4035
4036        max_addr = iova + size;
4037        if (dmar_domain->max_addr < max_addr) {
4038                u64 end;
4039
4040                /* check if minimum agaw is sufficient for mapped address */
4041                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4042                if (end < max_addr) {
4043                        printk(KERN_ERR "%s: iommu width (%d) is not "
4044                               "sufficient for the mapped address (%llx)\n",
4045                               __func__, dmar_domain->gaw, max_addr);
4046                        return -EFAULT;
4047                }
4048                dmar_domain->max_addr = max_addr;
4049        }
4050        /* Round up size to next multiple of PAGE_SIZE, if it and
4051           the low bits of hpa would take us onto the next page */
4052        size = aligned_nrpages(hpa, size);
4053        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4054                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4055        return ret;
4056}
4057
4058static size_t intel_iommu_unmap(struct iommu_domain *domain,
4059                             unsigned long iova, size_t size)
4060{
4061        struct dmar_domain *dmar_domain = domain->priv;
4062        int order;
4063
4064        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4065                            (iova + size - 1) >> VTD_PAGE_SHIFT);
4066
4067        if (dmar_domain->max_addr == iova + size)
4068                dmar_domain->max_addr = iova;
4069
4070        return PAGE_SIZE << order;
4071}
4072
4073static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4074                                            unsigned long iova)
4075{
4076        struct dmar_domain *dmar_domain = domain->priv;
4077        struct dma_pte *pte;
4078        u64 phys = 0;
4079
4080        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4081        if (pte)
4082                phys = dma_pte_addr(pte);
4083
4084        return phys;
4085}
4086
4087static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4088                                      unsigned long cap)
4089{
4090        struct dmar_domain *dmar_domain = domain->priv;
4091
4092        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4093                return dmar_domain->iommu_snooping;
4094        if (cap == IOMMU_CAP_INTR_REMAP)
4095                return irq_remapping_enabled;
4096
4097        return 0;
4098}
4099
4100static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4101{
4102        pci_dev_put(*from);
4103        *from = to;
4104}
4105
4106#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4107
4108static int intel_iommu_add_device(struct device *dev)
4109{
4110        struct pci_dev *pdev = to_pci_dev(dev);
4111        struct pci_dev *bridge, *dma_pdev = NULL;
4112        struct iommu_group *group;
4113        int ret;
4114
4115        if (!device_to_iommu(pci_domain_nr(pdev->bus),
4116                             pdev->bus->number, pdev->devfn))
4117                return -ENODEV;
4118
4119        bridge = pci_find_upstream_pcie_bridge(pdev);
4120        if (bridge) {
4121                if (pci_is_pcie(bridge))
4122                        dma_pdev = pci_get_domain_bus_and_slot(
4123                                                pci_domain_nr(pdev->bus),
4124                                                bridge->subordinate->number, 0);
4125                if (!dma_pdev)
4126                        dma_pdev = pci_dev_get(bridge);
4127        } else
4128                dma_pdev = pci_dev_get(pdev);
4129
4130        /* Account for quirked devices */
4131        swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4132
4133        /*
4134         * If it's a multifunction device that does not support our
4135         * required ACS flags, add to the same group as function 0.
4136         */
4137        if (dma_pdev->multifunction &&
4138            !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4139                swap_pci_ref(&dma_pdev,
4140                             pci_get_slot(dma_pdev->bus,
4141                                          PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4142                                          0)));
4143
4144        /*
4145         * Devices on the root bus go through the iommu.  If that's not us,
4146         * find the next upstream device and test ACS up to the root bus.
4147         * Finding the next device may require skipping virtual buses.
4148         */
4149        while (!pci_is_root_bus(dma_pdev->bus)) {
4150                struct pci_bus *bus = dma_pdev->bus;
4151
4152                while (!bus->self) {
4153                        if (!pci_is_root_bus(bus))
4154                                bus = bus->parent;
4155                        else
4156                                goto root_bus;
4157                }
4158
4159                if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4160                        break;
4161
4162                swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4163        }
4164
4165root_bus:
4166        group = iommu_group_get(&dma_pdev->dev);
4167        pci_dev_put(dma_pdev);
4168        if (!group) {
4169                group = iommu_group_alloc();
4170                if (IS_ERR(group))
4171                        return PTR_ERR(group);
4172        }
4173
4174        ret = iommu_group_add_device(group, dev);
4175
4176        iommu_group_put(group);
4177        return ret;
4178}
4179
4180static void intel_iommu_remove_device(struct device *dev)
4181{
4182        iommu_group_remove_device(dev);
4183}
4184
4185static struct iommu_ops intel_iommu_ops = {
4186        .domain_init    = intel_iommu_domain_init,
4187        .domain_destroy = intel_iommu_domain_destroy,
4188        .attach_dev     = intel_iommu_attach_device,
4189        .detach_dev     = intel_iommu_detach_device,
4190        .map            = intel_iommu_map,
4191        .unmap          = intel_iommu_unmap,
4192        .iova_to_phys   = intel_iommu_iova_to_phys,
4193        .domain_has_cap = intel_iommu_domain_has_cap,
4194        .add_device     = intel_iommu_add_device,
4195        .remove_device  = intel_iommu_remove_device,
4196        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4197};
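
/*
 * Illustrative sketch (not part of this driver): a consumer of the generic
 * IOMMU API (a VFIO- or KVM-style user, say) exercises the ops above roughly
 * like this; 'dev', 'iova' and 'paddr' are hypothetical:
 *
 *        struct iommu_domain *domain;
 *
 *        domain = iommu_domain_alloc(&pci_bus_type);  -> intel_iommu_domain_init
 *        if (!domain)
 *                return -ENOMEM;
 *        if (iommu_attach_device(domain, dev))        -> intel_iommu_attach_device
 *                goto out_free;
 *        iommu_map(domain, iova, paddr, 0x1000,
 *                  IOMMU_READ | IOMMU_WRITE);         -> intel_iommu_map
 *        ...
 *        iommu_unmap(domain, iova, 0x1000);           -> intel_iommu_unmap
 *        iommu_detach_device(domain, dev);            -> intel_iommu_detach_device
 * out_free:
 *        iommu_domain_free(domain);                   -> intel_iommu_domain_destroy
 */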
4198
4199static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4200{
4201        /*
4202         * Mobile 4 Series Chipset neglects to set RWBF capability,
4203         * but needs it:
4204         */
4205        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4206        rwbf_quirk = 1;
4207
4208        /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4209        if (dev->revision == 0x07) {
4210                printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4211                dmar_map_gfx = 0;
4212        }
4213}
4214
4215DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4216
4217#define GGC 0x52
4218#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4219#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4220#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4221#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4222#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4223#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4224#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4225#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4226
4227static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4228{
4229        unsigned short ggc;
4230
4231        if (pci_read_config_word(dev, GGC, &ggc))
4232                return;
4233
4234        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4235                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4236                dmar_map_gfx = 0;
4237        } else if (dmar_map_gfx) {
4238                /* we have to ensure the gfx device is idle before we flush */
4239                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4240                intel_iommu_strict = 1;
4241        }
4242}
4243DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4244DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4245DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4246DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4247
4248/* On Tylersburg chipsets, some BIOSes have been known to enable the
4249   ISOCH DMAR unit for the Azalia sound device, but not give it any
4250   TLB entries, which causes it to deadlock. Check for that.  We do
4251   this in a function called from init_dmars(), instead of in a PCI
4252   quirk, because we don't want to print the obnoxious "BIOS broken"
4253   message if VT-d is actually disabled.
4254*/
4255static void __init check_tylersburg_isoch(void)
4256{
4257        struct pci_dev *pdev;
4258        uint32_t vtisochctrl;
4259
4260        /* If there's no Azalia in the system anyway, forget it. */
4261        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4262        if (!pdev)
4263                return;
4264        pci_dev_put(pdev);
4265
4266        /* System Management Registers. Might be hidden, in which case
4267           we can't do the sanity check. But that's OK, because the
4268           known-broken BIOSes _don't_ actually hide it, so far. */
4269        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4270        if (!pdev)
4271                return;
4272
4273        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4274                pci_dev_put(pdev);
4275                return;
4276        }
4277
4278        pci_dev_put(pdev);
4279
4280        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4281        if (vtisochctrl & 1)
4282                return;
4283
4284        /* Drop all bits other than the number of TLB entries */
4285        vtisochctrl &= 0x1c;
4286
4287        /* If we have the recommended number of TLB entries (16), fine. */
4288        if (vtisochctrl == 0x10)
4289                return;
4290
4291        /* Zero TLB entries? You get to ride the short bus to school. */
4292        if (!vtisochctrl) {
4293                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4294                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4295                     dmi_get_system_info(DMI_BIOS_VENDOR),
4296                     dmi_get_system_info(DMI_BIOS_VERSION),
4297                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4298                iommu_identity_mapping |= IDENTMAP_AZALIA;
4299                return;
4300        }
4301        
4302        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4303               vtisochctrl);
4304}
4305