linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/export.h>
  28#include <linux/slab.h>
  29#include <linux/irq.h>
  30#include <linux/interrupt.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/syscore_ops.h>
  41#include <linux/tboot.h>
  42#include <linux/dmi.h>
  43#include <linux/pci-ats.h>
  44#include <linux/memblock.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#include "irq_remapping.h"
  50#include "pci.h"
  51
  52#define ROOT_SIZE               VTD_PAGE_SIZE
  53#define CONTEXT_SIZE            VTD_PAGE_SIZE
  54
  55#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  56#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  57#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  58
  59#define IOAPIC_RANGE_START      (0xfee00000)
  60#define IOAPIC_RANGE_END        (0xfeefffff)
  61#define IOVA_START_ADDR         (0x1000)
  62
  63#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  64
  65#define MAX_AGAW_WIDTH 64
  66#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  67
  68#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  69#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  70
  71/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  72   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  73#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  74                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  75#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  76
  77#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  78#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  79#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  80
  81/* page table handling */
  82#define LEVEL_STRIDE            (9)
  83#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  84
  85/*
   86 * This bitmap is used to advertise the page sizes our hardware supports
  87 * to the IOMMU core, which will then use this information to split
  88 * physically contiguous memory regions it is mapping into page sizes
  89 * that we support.
  90 *
  91 * Traditionally the IOMMU core just handed us the mappings directly,
   92 * after making sure the size is a power-of-two multiple of 4KiB and that the
  93 * mapping has natural alignment.
  94 *
  95 * To retain this behavior, we currently advertise that we support
   96 * all page sizes that are a power-of-two multiple of 4KiB.
  97 *
  98 * If at some point we'd like to utilize the IOMMU core's new behavior,
  99 * we could change this to advertise the real page sizes we support.
 100 */
 101#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 102
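     /*
      * AGAW (adjusted guest address width) helpers.  Each page-table level
      * decodes LEVEL_STRIDE (9) bits and a 2-level table covers 30 bits, so
      * agaw N corresponds to an (N + 2)-level table covering 30 + 9*N bits.
      * For example, the default 48-bit domain width gives
      * width_to_agaw(48) == 2, i.e. a 4-level page table.
      */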
 103static inline int agaw_to_level(int agaw)
 104{
 105        return agaw + 2;
 106}
 107
 108static inline int agaw_to_width(int agaw)
 109{
 110        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 111}
 112
 113static inline int width_to_agaw(int width)
 114{
 115        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 116}
 117
 118static inline unsigned int level_to_offset_bits(int level)
 119{
 120        return (level - 1) * LEVEL_STRIDE;
 121}
 122
 123static inline int pfn_level_offset(unsigned long pfn, int level)
 124{
 125        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 126}
 127
 128static inline unsigned long level_mask(int level)
 129{
 130        return -1UL << level_to_offset_bits(level);
 131}
 132
 133static inline unsigned long level_size(int level)
 134{
 135        return 1UL << level_to_offset_bits(level);
 136}
 137
 138static inline unsigned long align_to_level(unsigned long pfn, int level)
 139{
 140        return (pfn + level_size(level) - 1) & level_mask(level);
 141}
 142
 143static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 144{
 145        return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 146}
 147
  148/* VT-d pages must never be _larger_ than MM pages; the conversions below
  149   assume PAGE_SHIFT >= VTD_PAGE_SHIFT. */
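     /*
      * On x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the conversions below
      * are identity operations; they only shift on architectures whose CPU
      * page size is larger than the 4KiB VT-d page size.
      */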
 150static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 151{
 152        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 153}
 154
 155static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 156{
 157        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 158}
 159static inline unsigned long page_to_dma_pfn(struct page *pg)
 160{
 161        return mm_to_dma_pfn(page_to_pfn(pg));
 162}
 163static inline unsigned long virt_to_dma_pfn(void *p)
 164{
 165        return page_to_dma_pfn(virt_to_page(p));
 166}
 167
 168/* global iommu list, set NULL for ignored DMAR units */
 169static struct intel_iommu **g_iommus;
 170
 171static void __init check_tylersburg_isoch(void);
 172static int rwbf_quirk;
 173
 174/*
  175 * set to 1 to panic the kernel if VT-d can't be successfully enabled
 176 * (used when kernel is launched w/ TXT)
 177 */
 178static int force_on = 0;
 179
 180/*
 181 * 0: Present
 182 * 1-11: Reserved
 183 * 12-63: Context Ptr (12 - (haw-1))
 184 * 64-127: Reserved
 185 */
 186struct root_entry {
 187        u64     val;
 188        u64     rsvd1;
 189};
 190#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
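     /*
      * Each root entry maps one PCI bus to a 4KiB context table holding 256
      * context entries, one per devfn; see device_to_context_entry(), which
      * allocates the context table on first use and returns &context[devfn].
      */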
 191static inline bool root_present(struct root_entry *root)
 192{
 193        return (root->val & 1);
 194}
 195static inline void set_root_present(struct root_entry *root)
 196{
 197        root->val |= 1;
 198}
 199static inline void set_root_value(struct root_entry *root, unsigned long value)
 200{
 201        root->val |= value & VTD_PAGE_MASK;
 202}
 203
 204static inline struct context_entry *
 205get_context_addr_from_root(struct root_entry *root)
 206{
 207        return (struct context_entry *)
 208                (root_present(root)?phys_to_virt(
 209                root->val & VTD_PAGE_MASK) :
 210                NULL);
 211}
 212
 213/*
 214 * low 64 bits:
 215 * 0: present
 216 * 1: fault processing disable
 217 * 2-3: translation type
 218 * 12-63: address space root
 219 * high 64 bits:
 220 * 0-2: address width
  221 * 3-6: avail (available to software)
 222 * 8-23: domain id
 223 */
 224struct context_entry {
 225        u64 lo;
 226        u64 hi;
 227};
 228
 229static inline bool context_present(struct context_entry *context)
 230{
 231        return (context->lo & 1);
 232}
 233static inline void context_set_present(struct context_entry *context)
 234{
 235        context->lo |= 1;
 236}
 237
 238static inline void context_set_fault_enable(struct context_entry *context)
 239{
 240        context->lo &= (((u64)-1) << 2) | 1;
 241}
 242
 243static inline void context_set_translation_type(struct context_entry *context,
 244                                                unsigned long value)
 245{
 246        context->lo &= (((u64)-1) << 4) | 3;
 247        context->lo |= (value & 3) << 2;
 248}
 249
 250static inline void context_set_address_root(struct context_entry *context,
 251                                            unsigned long value)
 252{
 253        context->lo |= value & VTD_PAGE_MASK;
 254}
 255
 256static inline void context_set_address_width(struct context_entry *context,
 257                                             unsigned long value)
 258{
 259        context->hi |= value & 7;
 260}
 261
 262static inline void context_set_domain_id(struct context_entry *context,
 263                                         unsigned long value)
 264{
 265        context->hi |= (value & ((1 << 16) - 1)) << 8;
 266}
 267
 268static inline void context_clear_entry(struct context_entry *context)
 269{
 270        context->lo = 0;
 271        context->hi = 0;
 272}
 273
 274/*
 275 * 0: readable
 276 * 1: writable
 277 * 2-6: reserved
 278 * 7: super page
 279 * 8-10: available
 280 * 11: snoop behavior
  281 * 12-63: Host physical address
 282 */
 283struct dma_pte {
 284        u64 val;
 285};
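     /*
      * A leaf dma_pte holds the host physical address in bits 12-63 plus the
      * DMA_PTE_READ/DMA_PTE_WRITE permission bits (bits 0 and 1); bit 7
      * (DMA_PTE_LARGE_PAGE) marks a superpage leaf at a non-terminal level.
      */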
 286
 287static inline void dma_clear_pte(struct dma_pte *pte)
 288{
 289        pte->val = 0;
 290}
 291
 292static inline u64 dma_pte_addr(struct dma_pte *pte)
 293{
 294#ifdef CONFIG_64BIT
 295        return pte->val & VTD_PAGE_MASK;
 296#else
 297        /* Must have a full atomic 64-bit read */
 298        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 299#endif
 300}
 301
 302static inline bool dma_pte_present(struct dma_pte *pte)
 303{
 304        return (pte->val & 3) != 0;
 305}
 306
 307static inline bool dma_pte_superpage(struct dma_pte *pte)
 308{
 309        return (pte->val & (1 << 7));
 310}
 311
 312static inline int first_pte_in_page(struct dma_pte *pte)
 313{
 314        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 315}
 316
 317/*
  318 * This domain is a static identity mapping domain.
  319 *      1. This domain creates a static 1:1 mapping of all usable memory.
  320 *      2. It maps to each iommu if successful.
  321 *      3. Each iommu maps to this domain if successful.
 322 */
 323static struct dmar_domain *si_domain;
 324static int hw_pass_through = 1;
 325
 326/* devices under the same p2p bridge are owned in one domain */
 327#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 328
  329/* domain represents a virtual machine; more than one device
  330 * across iommus may be owned by one domain, e.g. a kvm guest.
 331 */
 332#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 333
  334/* si_domain contains multiple devices */
 335#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 336
 337/* define the limit of IOMMUs supported in each domain */
 338#ifdef  CONFIG_X86
 339# define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
 340#else
 341# define        IOMMU_UNITS_SUPPORTED   64
 342#endif
 343
 344struct dmar_domain {
 345        int     id;                     /* domain id */
 346        int     nid;                    /* node id */
 347        DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
 348                                        /* bitmap of iommus this domain uses*/
 349
 350        struct list_head devices;       /* all devices' list */
 351        struct iova_domain iovad;       /* iova's that belong to this domain */
 352
 353        struct dma_pte  *pgd;           /* virtual address */
 354        int             gaw;            /* max guest address width */
 355
 356        /* adjusted guest address width, 0 is level 2 30-bit */
 357        int             agaw;
 358
 359        int             flags;          /* flags to find out type of domain */
 360
 361        int             iommu_coherency;/* indicate coherency of iommu access */
 362        int             iommu_snooping; /* indicate snooping control feature*/
 363        int             iommu_count;    /* reference count of iommu */
 364        int             iommu_superpage;/* Level of superpages supported:
 365                                           0 == 4KiB (no superpages), 1 == 2MiB,
 366                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 367        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 368        u64             max_addr;       /* maximum mapped address */
 369};
 370
 371/* PCI domain-device relationship */
 372struct device_domain_info {
 373        struct list_head link;  /* link to domain siblings */
 374        struct list_head global; /* link to global list */
 375        int segment;            /* PCI domain */
 376        u8 bus;                 /* PCI bus number */
 377        u8 devfn;               /* PCI devfn number */
 378        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 379        struct intel_iommu *iommu; /* IOMMU used by this device */
 380        struct dmar_domain *domain; /* pointer to domain */
 381};
 382
 383static void flush_unmaps_timeout(unsigned long data);
 384
 385static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 386
 387#define HIGH_WATER_MARK 250
 388struct deferred_flush_tables {
 389        int next;
 390        struct iova *iova[HIGH_WATER_MARK];
 391        struct dmar_domain *domain[HIGH_WATER_MARK];
 392};
 393
 394static struct deferred_flush_tables *deferred_flush;
 395
  396/* number of intel_iommus, used for indexing g_iommus and iommu bitmaps */
 397static int g_num_of_iommus;
 398
 399static DEFINE_SPINLOCK(async_umap_flush_lock);
 400static LIST_HEAD(unmaps_to_do);
 401
 402static int timer_on;
 403static long list_size;
 404
 405static void domain_remove_dev_info(struct dmar_domain *domain);
 406
 407#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 408int dmar_disabled = 0;
 409#else
 410int dmar_disabled = 1;
 411#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 412
 413int intel_iommu_enabled = 0;
 414EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 415
 416static int dmar_map_gfx = 1;
 417static int dmar_forcedac;
 418static int intel_iommu_strict;
 419static int intel_iommu_superpage = 1;
 420
 421int intel_iommu_gfx_mapped;
 422EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 423
 424#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 425static DEFINE_SPINLOCK(device_domain_lock);
 426static LIST_HEAD(device_domain_list);
 427
 428static struct iommu_ops intel_iommu_ops;
 429
 430static int __init intel_iommu_setup(char *str)
 431{
 432        if (!str)
 433                return -EINVAL;
 434        while (*str) {
 435                if (!strncmp(str, "on", 2)) {
 436                        dmar_disabled = 0;
 437                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 438                } else if (!strncmp(str, "off", 3)) {
 439                        dmar_disabled = 1;
 440                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 441                } else if (!strncmp(str, "igfx_off", 8)) {
 442                        dmar_map_gfx = 0;
 443                        printk(KERN_INFO
 444                                "Intel-IOMMU: disable GFX device mapping\n");
 445                } else if (!strncmp(str, "forcedac", 8)) {
 446                        printk(KERN_INFO
 447                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 448                        dmar_forcedac = 1;
 449                } else if (!strncmp(str, "strict", 6)) {
 450                        printk(KERN_INFO
 451                                "Intel-IOMMU: disable batched IOTLB flush\n");
 452                        intel_iommu_strict = 1;
 453                } else if (!strncmp(str, "sp_off", 6)) {
 454                        printk(KERN_INFO
 455                                "Intel-IOMMU: disable supported super page\n");
 456                        intel_iommu_superpage = 0;
 457                }
 458
 459                str += strcspn(str, ",");
 460                while (*str == ',')
 461                        str++;
 462        }
 463        return 0;
 464}
 465__setup("intel_iommu=", intel_iommu_setup);
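     /*
      * Options may be combined on the kernel command line, separated by
      * commas, e.g. "intel_iommu=on,strict,sp_off".
      */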
 466
 467static struct kmem_cache *iommu_domain_cache;
 468static struct kmem_cache *iommu_devinfo_cache;
 469static struct kmem_cache *iommu_iova_cache;
 470
 471static inline void *alloc_pgtable_page(int node)
 472{
 473        struct page *page;
 474        void *vaddr = NULL;
 475
 476        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 477        if (page)
 478                vaddr = page_address(page);
 479        return vaddr;
 480}
 481
 482static inline void free_pgtable_page(void *vaddr)
 483{
 484        free_page((unsigned long)vaddr);
 485}
 486
 487static inline void *alloc_domain_mem(void)
 488{
 489        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 490}
 491
 492static void free_domain_mem(void *vaddr)
 493{
 494        kmem_cache_free(iommu_domain_cache, vaddr);
 495}
 496
  497static inline void *alloc_devinfo_mem(void)
 498{
 499        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 500}
 501
 502static inline void free_devinfo_mem(void *vaddr)
 503{
 504        kmem_cache_free(iommu_devinfo_cache, vaddr);
 505}
 506
 507struct iova *alloc_iova_mem(void)
 508{
 509        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 510}
 511
 512void free_iova_mem(struct iova *iova)
 513{
 514        kmem_cache_free(iommu_iova_cache, iova);
 515}
 516
 517
 518static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 519{
 520        unsigned long sagaw;
 521        int agaw = -1;
 522
 523        sagaw = cap_sagaw(iommu->cap);
 524        for (agaw = width_to_agaw(max_gaw);
 525             agaw >= 0; agaw--) {
 526                if (test_bit(agaw, &sagaw))
 527                        break;
 528        }
 529
 530        return agaw;
 531}
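     /*
      * SAGAW is a bitmap of the page-table depths the hardware supports
      * (bit N set means an (N + 2)-level table is usable).  The loop above
      * picks the largest supported agaw not exceeding the requested width,
      * e.g. a 48-bit max_gaw yields agaw 2 (4 levels) when bit 2 is set.
      */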
 532
 533/*
 534 * Calculate max SAGAW for each iommu.
 535 */
 536int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 537{
 538        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 539}
 540
 541/*
 542 * calculate agaw for each iommu.
 543 * "SAGAW" may be different across iommus, use a default agaw, and
 544 * get a supported less agaw for iommus that don't support the default agaw.
 545 */
 546int iommu_calculate_agaw(struct intel_iommu *iommu)
 547{
 548        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 549}
 550
  551/* This function only returns a single iommu in a domain */
 552static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 553{
 554        int iommu_id;
 555
 556        /* si_domain and vm domain should not get here. */
 557        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 558        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 559
 560        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 561        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 562                return NULL;
 563
 564        return g_iommus[iommu_id];
 565}
 566
 567static void domain_update_iommu_coherency(struct dmar_domain *domain)
 568{
 569        int i;
 570
 571        i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 572
 573        domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
 574
 575        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 576                if (!ecap_coherent(g_iommus[i]->ecap)) {
 577                        domain->iommu_coherency = 0;
 578                        break;
 579                }
 580        }
 581}
 582
 583static void domain_update_iommu_snooping(struct dmar_domain *domain)
 584{
 585        int i;
 586
 587        domain->iommu_snooping = 1;
 588
 589        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 590                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 591                        domain->iommu_snooping = 0;
 592                        break;
 593                }
 594        }
 595}
 596
 597static void domain_update_iommu_superpage(struct dmar_domain *domain)
 598{
 599        struct dmar_drhd_unit *drhd;
 600        struct intel_iommu *iommu = NULL;
 601        int mask = 0xf;
 602
 603        if (!intel_iommu_superpage) {
 604                domain->iommu_superpage = 0;
 605                return;
 606        }
 607
 608        /* set iommu_superpage to the smallest common denominator */
 609        for_each_active_iommu(iommu, drhd) {
 610                mask &= cap_super_page_val(iommu->cap);
 611                if (!mask) {
 612                        break;
 613                }
 614        }
 615        domain->iommu_superpage = fls(mask);
 616}
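     /*
      * cap_super_page_val() reports which superpage sizes (2MiB, 1GiB, ...)
      * an iommu can map; the AND above keeps only the sizes supported by
      * every active iommu, and fls() turns that into the iommu_superpage
      * level described in struct dmar_domain (0 == 4KiB only, 1 == 2MiB, ...).
      */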
 617
 618/* Some capabilities may be different across iommus */
 619static void domain_update_iommu_cap(struct dmar_domain *domain)
 620{
 621        domain_update_iommu_coherency(domain);
 622        domain_update_iommu_snooping(domain);
 623        domain_update_iommu_superpage(domain);
 624}
 625
 626static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 627{
 628        struct dmar_drhd_unit *drhd = NULL;
 629        int i;
 630
 631        for_each_active_drhd_unit(drhd) {
 632                if (segment != drhd->segment)
 633                        continue;
 634
 635                for (i = 0; i < drhd->devices_cnt; i++) {
 636                        if (drhd->devices[i] &&
 637                            drhd->devices[i]->bus->number == bus &&
 638                            drhd->devices[i]->devfn == devfn)
 639                                return drhd->iommu;
 640                        if (drhd->devices[i] &&
 641                            drhd->devices[i]->subordinate &&
 642                            drhd->devices[i]->subordinate->number <= bus &&
 643                            drhd->devices[i]->subordinate->busn_res.end >= bus)
 644                                return drhd->iommu;
 645                }
 646
 647                if (drhd->include_all)
 648                        return drhd->iommu;
 649        }
 650
 651        return NULL;
 652}
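     /*
      * device_to_iommu() walks the DRHD units of the given PCI segment and
      * returns the unit that explicitly lists the bus/devfn, the unit whose
      * listed bridge has a secondary bus range covering the bus, or the
      * include_all (catch-all) unit if one exists.
      */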
 653
 654static void domain_flush_cache(struct dmar_domain *domain,
 655                               void *addr, int size)
 656{
 657        if (!domain->iommu_coherency)
 658                clflush_cache_range(addr, size);
 659}
 660
 661/* Gets context entry for a given bus and devfn */
 662static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 663                u8 bus, u8 devfn)
 664{
 665        struct root_entry *root;
 666        struct context_entry *context;
 667        unsigned long phy_addr;
 668        unsigned long flags;
 669
 670        spin_lock_irqsave(&iommu->lock, flags);
 671        root = &iommu->root_entry[bus];
 672        context = get_context_addr_from_root(root);
 673        if (!context) {
 674                context = (struct context_entry *)
 675                                alloc_pgtable_page(iommu->node);
 676                if (!context) {
 677                        spin_unlock_irqrestore(&iommu->lock, flags);
 678                        return NULL;
 679                }
 680                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 681                phy_addr = virt_to_phys((void *)context);
 682                set_root_value(root, phy_addr);
 683                set_root_present(root);
 684                __iommu_flush_cache(iommu, root, sizeof(*root));
 685        }
 686        spin_unlock_irqrestore(&iommu->lock, flags);
 687        return &context[devfn];
 688}
 689
 690static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 691{
 692        struct root_entry *root;
 693        struct context_entry *context;
 694        int ret;
 695        unsigned long flags;
 696
 697        spin_lock_irqsave(&iommu->lock, flags);
 698        root = &iommu->root_entry[bus];
 699        context = get_context_addr_from_root(root);
 700        if (!context) {
 701                ret = 0;
 702                goto out;
 703        }
 704        ret = context_present(&context[devfn]);
 705out:
 706        spin_unlock_irqrestore(&iommu->lock, flags);
 707        return ret;
 708}
 709
 710static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 711{
 712        struct root_entry *root;
 713        struct context_entry *context;
 714        unsigned long flags;
 715
 716        spin_lock_irqsave(&iommu->lock, flags);
 717        root = &iommu->root_entry[bus];
 718        context = get_context_addr_from_root(root);
 719        if (context) {
 720                context_clear_entry(&context[devfn]);
 721                __iommu_flush_cache(iommu, &context[devfn], \
 722                        sizeof(*context));
 723        }
 724        spin_unlock_irqrestore(&iommu->lock, flags);
 725}
 726
 727static void free_context_table(struct intel_iommu *iommu)
 728{
 729        struct root_entry *root;
 730        int i;
 731        unsigned long flags;
 732        struct context_entry *context;
 733
 734        spin_lock_irqsave(&iommu->lock, flags);
 735        if (!iommu->root_entry) {
 736                goto out;
 737        }
 738        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 739                root = &iommu->root_entry[i];
 740                context = get_context_addr_from_root(root);
 741                if (context)
 742                        free_pgtable_page(context);
 743        }
 744        free_pgtable_page(iommu->root_entry);
 745        iommu->root_entry = NULL;
 746out:
 747        spin_unlock_irqrestore(&iommu->lock, flags);
 748}
 749
 750static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 751                                      unsigned long pfn, int target_level)
 752{
 753        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 754        struct dma_pte *parent, *pte = NULL;
 755        int level = agaw_to_level(domain->agaw);
 756        int offset;
 757
 758        BUG_ON(!domain->pgd);
 759
 760        if (addr_width < BITS_PER_LONG && pfn >> addr_width)
 761                /* Address beyond IOMMU's addressing capabilities. */
 762                return NULL;
 763
 764        parent = domain->pgd;
 765
 766        while (level > 0) {
 767                void *tmp_page;
 768
 769                offset = pfn_level_offset(pfn, level);
 770                pte = &parent[offset];
 771                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 772                        break;
 773                if (level == target_level)
 774                        break;
 775
 776                if (!dma_pte_present(pte)) {
 777                        uint64_t pteval;
 778
 779                        tmp_page = alloc_pgtable_page(domain->nid);
 780
 781                        if (!tmp_page)
 782                                return NULL;
 783
 784                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 785                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 786                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 787                                /* Someone else set it while we were thinking; use theirs. */
 788                                free_pgtable_page(tmp_page);
 789                        } else {
 790                                dma_pte_addr(pte);
 791                                domain_flush_cache(domain, pte, sizeof(*pte));
 792                        }
 793                }
 794                parent = phys_to_virt(dma_pte_addr(pte));
 795                level--;
 796        }
 797
 798        return pte;
 799}
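     /*
      * pfn_to_dma_pte() walks from the top level down towards target_level,
      * allocating missing intermediate table pages on the way; target_level 0
      * means "stop at the first superpage or non-present entry".  The
      * cmpxchg64() ensures that two CPUs racing to populate the same slot end
      * up sharing one table page instead of leaking the loser's allocation.
      */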
 800
 801
 802/* return address's pte at specific level */
 803static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 804                                         unsigned long pfn,
 805                                         int level, int *large_page)
 806{
 807        struct dma_pte *parent, *pte = NULL;
 808        int total = agaw_to_level(domain->agaw);
 809        int offset;
 810
 811        parent = domain->pgd;
 812        while (level <= total) {
 813                offset = pfn_level_offset(pfn, total);
 814                pte = &parent[offset];
 815                if (level == total)
 816                        return pte;
 817
 818                if (!dma_pte_present(pte)) {
 819                        *large_page = total;
 820                        break;
 821                }
 822
 823                if (pte->val & DMA_PTE_LARGE_PAGE) {
 824                        *large_page = total;
 825                        return pte;
 826                }
 827
 828                parent = phys_to_virt(dma_pte_addr(pte));
 829                total--;
 830        }
 831        return NULL;
 832}
 833
  834/* clear last level pte; a tlb flush should follow */
 835static int dma_pte_clear_range(struct dmar_domain *domain,
 836                                unsigned long start_pfn,
 837                                unsigned long last_pfn)
 838{
 839        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 840        unsigned int large_page = 1;
 841        struct dma_pte *first_pte, *pte;
 842
 843        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 844        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 845        BUG_ON(start_pfn > last_pfn);
 846
 847        /* we don't need lock here; nobody else touches the iova range */
 848        do {
 849                large_page = 1;
 850                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 851                if (!pte) {
 852                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 853                        continue;
 854                }
 855                do {
 856                        dma_clear_pte(pte);
 857                        start_pfn += lvl_to_nr_pages(large_page);
 858                        pte++;
 859                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 860
 861                domain_flush_cache(domain, first_pte,
 862                                   (void *)pte - (void *)first_pte);
 863
 864        } while (start_pfn && start_pfn <= last_pfn);
 865
 866        return min_t(int, (large_page - 1) * 9, MAX_AGAW_PFN_WIDTH);
 867}
 868
 869static void dma_pte_free_level(struct dmar_domain *domain, int level,
 870                               struct dma_pte *pte, unsigned long pfn,
 871                               unsigned long start_pfn, unsigned long last_pfn)
 872{
 873        pfn = max(start_pfn, pfn);
 874        pte = &pte[pfn_level_offset(pfn, level)];
 875
 876        do {
 877                unsigned long level_pfn;
 878                struct dma_pte *level_pte;
 879
 880                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
 881                        goto next;
 882
 883                level_pfn = pfn & level_mask(level - 1);
 884                level_pte = phys_to_virt(dma_pte_addr(pte));
 885
 886                if (level > 2)
 887                        dma_pte_free_level(domain, level - 1, level_pte,
 888                                           level_pfn, start_pfn, last_pfn);
 889
 890                /* If range covers entire pagetable, free it */
 891                if (!(start_pfn > level_pfn ||
 892                      last_pfn < level_pfn + level_size(level) - 1)) {
 893                        dma_clear_pte(pte);
 894                        domain_flush_cache(domain, pte, sizeof(*pte));
 895                        free_pgtable_page(level_pte);
 896                }
 897next:
 898                pfn += level_size(level);
 899        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
 900}
 901
 902/* free page table pages. last level pte should already be cleared */
 903static void dma_pte_free_pagetable(struct dmar_domain *domain,
 904                                   unsigned long start_pfn,
 905                                   unsigned long last_pfn)
 906{
 907        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 908
 909        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 910        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 911        BUG_ON(start_pfn > last_pfn);
 912
 913        /* We don't need lock here; nobody else touches the iova range */
 914        dma_pte_free_level(domain, agaw_to_level(domain->agaw),
 915                           domain->pgd, 0, start_pfn, last_pfn);
 916
 917        /* free pgd */
 918        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 919                free_pgtable_page(domain->pgd);
 920                domain->pgd = NULL;
 921        }
 922}
 923
 924/* iommu handling */
 925static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 926{
 927        struct root_entry *root;
 928        unsigned long flags;
 929
 930        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 931        if (!root)
 932                return -ENOMEM;
 933
 934        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 935
 936        spin_lock_irqsave(&iommu->lock, flags);
 937        iommu->root_entry = root;
 938        spin_unlock_irqrestore(&iommu->lock, flags);
 939
 940        return 0;
 941}
 942
 943static void iommu_set_root_entry(struct intel_iommu *iommu)
 944{
 945        void *addr;
 946        u32 sts;
 947        unsigned long flag;
 948
 949        addr = iommu->root_entry;
 950
 951        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 952        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 953
 954        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 955
  956        /* Make sure hardware completes it */
 957        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 958                      readl, (sts & DMA_GSTS_RTPS), sts);
 959
 960        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 961}
 962
 963static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 964{
 965        u32 val;
 966        unsigned long flag;
 967
 968        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 969                return;
 970
 971        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 972        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 973
  974        /* Make sure hardware completes it */
 975        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 976                      readl, (!(val & DMA_GSTS_WBFS)), val);
 977
 978        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 979}
 980
  981/* invalidate entries in the context-cache */
 982static void __iommu_flush_context(struct intel_iommu *iommu,
 983                                  u16 did, u16 source_id, u8 function_mask,
 984                                  u64 type)
 985{
 986        u64 val = 0;
 987        unsigned long flag;
 988
 989        switch (type) {
 990        case DMA_CCMD_GLOBAL_INVL:
 991                val = DMA_CCMD_GLOBAL_INVL;
 992                break;
 993        case DMA_CCMD_DOMAIN_INVL:
 994                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
 995                break;
 996        case DMA_CCMD_DEVICE_INVL:
 997                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
 998                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
 999                break;
1000        default:
1001                BUG();
1002        }
1003        val |= DMA_CCMD_ICC;
1004
1005        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1006        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1007
 1008        /* Make sure hardware completes it */
1009        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1010                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1011
1012        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1013}
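     /*
      * Context-cache invalidation supports three granularities: global,
      * domain-selective (keyed by domain id) and device-selective (domain id
      * plus source-id/function-mask).  Writing DMA_CCMD_ICC kicks off the
      * invalidation and the register is polled until hardware clears the bit.
      */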
1014
 1015/* invalidate entries in the IOTLB */
1016static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1017                                u64 addr, unsigned int size_order, u64 type)
1018{
1019        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1020        u64 val = 0, val_iva = 0;
1021        unsigned long flag;
1022
1023        switch (type) {
1024        case DMA_TLB_GLOBAL_FLUSH:
 1025                /* global flush doesn't need to set IVA_REG */
1026                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1027                break;
1028        case DMA_TLB_DSI_FLUSH:
1029                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1030                break;
1031        case DMA_TLB_PSI_FLUSH:
1032                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1033                /* Note: always flush non-leaf currently */
1034                val_iva = size_order | addr;
1035                break;
1036        default:
1037                BUG();
1038        }
1039        /* Note: set drain read/write */
1040#if 0
1041        /*
 1042         * This is probably just to be extra safe. Looks like we can
1043         * ignore it without any impact.
1044         */
1045        if (cap_read_drain(iommu->cap))
1046                val |= DMA_TLB_READ_DRAIN;
1047#endif
1048        if (cap_write_drain(iommu->cap))
1049                val |= DMA_TLB_WRITE_DRAIN;
1050
1051        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1052        /* Note: Only uses first TLB reg currently */
1053        if (val_iva)
1054                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1055        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1056
 1057        /* Make sure hardware completes it */
1058        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1059                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1060
1061        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1062
1063        /* check IOTLB invalidation granularity */
1064        if (DMA_TLB_IAIG(val) == 0)
 1065                printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1066        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1067                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1068                        (unsigned long long)DMA_TLB_IIRG(type),
1069                        (unsigned long long)DMA_TLB_IAIG(val));
1070}
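     /*
      * IOTLB invalidation is analogous: global, domain-selective (DSI) or
      * page-selective (PSI).  For PSI, the IVA register takes the address
      * and the size order (log2 of the number of 4KiB pages) before the
      * command itself is written to the IOTLB register.
      */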
1071
1072static struct device_domain_info *iommu_support_dev_iotlb(
1073        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1074{
1075        int found = 0;
1076        unsigned long flags;
1077        struct device_domain_info *info;
1078        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1079
1080        if (!ecap_dev_iotlb_support(iommu->ecap))
1081                return NULL;
1082
1083        if (!iommu->qi)
1084                return NULL;
1085
1086        spin_lock_irqsave(&device_domain_lock, flags);
1087        list_for_each_entry(info, &domain->devices, link)
1088                if (info->bus == bus && info->devfn == devfn) {
1089                        found = 1;
1090                        break;
1091                }
1092        spin_unlock_irqrestore(&device_domain_lock, flags);
1093
1094        if (!found || !info->dev)
1095                return NULL;
1096
1097        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1098                return NULL;
1099
1100        if (!dmar_find_matched_atsr_unit(info->dev))
1101                return NULL;
1102
1103        info->iommu = iommu;
1104
1105        return info;
1106}
1107
1108static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1109{
1110        if (!info)
1111                return;
1112
1113        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1114}
1115
1116static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1117{
1118        if (!info->dev || !pci_ats_enabled(info->dev))
1119                return;
1120
1121        pci_disable_ats(info->dev);
1122}
1123
1124static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1125                                  u64 addr, unsigned mask)
1126{
1127        u16 sid, qdep;
1128        unsigned long flags;
1129        struct device_domain_info *info;
1130
1131        spin_lock_irqsave(&device_domain_lock, flags);
1132        list_for_each_entry(info, &domain->devices, link) {
1133                if (!info->dev || !pci_ats_enabled(info->dev))
1134                        continue;
1135
1136                sid = info->bus << 8 | info->devfn;
1137                qdep = pci_ats_queue_depth(info->dev);
1138                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1139        }
1140        spin_unlock_irqrestore(&device_domain_lock, flags);
1141}
1142
1143static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1144                                  unsigned long pfn, unsigned int pages, int map)
1145{
1146        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1147        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1148
1149        BUG_ON(pages == 0);
1150
1151        /*
 1152         * Fall back to a domain-selective flush if there is no PSI support
 1153         * or the size is too big.
 1154         * PSI requires the page count to be a power of two, and the base
 1155         * address to be naturally aligned to the size.
1156         */
1157        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1158                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1159                                                DMA_TLB_DSI_FLUSH);
1160        else
1161                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1162                                                DMA_TLB_PSI_FLUSH);
1163
1164        /*
 1165         * In caching mode, changes of pages from non-present to present require
 1166         * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1167         */
1168        if (!cap_caching_mode(iommu->cap) || !map)
1169                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1170}
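     /*
      * The mask computed above is the PSI size order: the page count is
      * rounded up to a power of two, e.g. pages == 5 gives
      * ilog2(__roundup_pow_of_two(5)) == 3, so an aligned 8-page (32KiB)
      * region is invalidated.
      */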
1171
1172static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1173{
1174        u32 pmen;
1175        unsigned long flags;
1176
1177        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1178        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1179        pmen &= ~DMA_PMEN_EPM;
1180        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1181
1182        /* wait for the protected region status bit to clear */
1183        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1184                readl, !(pmen & DMA_PMEN_PRS), pmen);
1185
1186        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1187}
1188
1189static int iommu_enable_translation(struct intel_iommu *iommu)
1190{
1191        u32 sts;
1192        unsigned long flags;
1193
1194        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1195        iommu->gcmd |= DMA_GCMD_TE;
1196        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1197
 1198        /* Make sure hardware completes it */
1199        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1200                      readl, (sts & DMA_GSTS_TES), sts);
1201
1202        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1203        return 0;
1204}
1205
1206static int iommu_disable_translation(struct intel_iommu *iommu)
1207{
1208        u32 sts;
1209        unsigned long flag;
1210
1211        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1212        iommu->gcmd &= ~DMA_GCMD_TE;
1213        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1214
 1215        /* Make sure hardware completes it */
1216        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1217                      readl, (!(sts & DMA_GSTS_TES)), sts);
1218
1219        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220        return 0;
1221}
1222
1223
1224static int iommu_init_domains(struct intel_iommu *iommu)
1225{
1226        unsigned long ndomains;
1227        unsigned long nlongs;
1228
1229        ndomains = cap_ndoms(iommu->cap);
1230        pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1231                 iommu->seq_id, ndomains);
1232        nlongs = BITS_TO_LONGS(ndomains);
1233
1234        spin_lock_init(&iommu->lock);
1235
1236        /* TBD: there might be 64K domains,
 1237         * consider other allocation schemes for future chips
1238         */
1239        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1240        if (!iommu->domain_ids) {
1241                pr_err("IOMMU%d: allocating domain id array failed\n",
1242                       iommu->seq_id);
1243                return -ENOMEM;
1244        }
1245        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1246                        GFP_KERNEL);
1247        if (!iommu->domains) {
1248                pr_err("IOMMU%d: allocating domain array failed\n",
1249                       iommu->seq_id);
1250                kfree(iommu->domain_ids);
1251                iommu->domain_ids = NULL;
1252                return -ENOMEM;
1253        }
1254
1255        /*
1256         * if Caching mode is set, then invalid translations are tagged
1257         * with domainid 0. Hence we need to pre-allocate it.
1258         */
1259        if (cap_caching_mode(iommu->cap))
1260                set_bit(0, iommu->domain_ids);
1261        return 0;
1262}
1263
1264
1265static void domain_exit(struct dmar_domain *domain);
1266static void vm_domain_exit(struct dmar_domain *domain);
1267
1268static void free_dmar_iommu(struct intel_iommu *iommu)
1269{
1270        struct dmar_domain *domain;
1271        int i, count;
1272        unsigned long flags;
1273
1274        if ((iommu->domains) && (iommu->domain_ids)) {
1275                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1276                        domain = iommu->domains[i];
1277                        clear_bit(i, iommu->domain_ids);
1278
1279                        spin_lock_irqsave(&domain->iommu_lock, flags);
1280                        count = --domain->iommu_count;
1281                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1282                        if (count == 0) {
1283                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1284                                        vm_domain_exit(domain);
1285                                else
1286                                        domain_exit(domain);
1287                        }
1288                }
1289        }
1290
1291        if (iommu->gcmd & DMA_GCMD_TE)
1292                iommu_disable_translation(iommu);
1293
1294        kfree(iommu->domains);
1295        kfree(iommu->domain_ids);
1296        iommu->domains = NULL;
1297        iommu->domain_ids = NULL;
1298
1299        g_iommus[iommu->seq_id] = NULL;
1300
1301        /* if all iommus are freed, free g_iommus */
1302        for (i = 0; i < g_num_of_iommus; i++) {
1303                if (g_iommus[i])
1304                        break;
1305        }
1306
1307        if (i == g_num_of_iommus)
1308                kfree(g_iommus);
1309
1310        /* free context mapping */
1311        free_context_table(iommu);
1312}
1313
1314static struct dmar_domain *alloc_domain(void)
1315{
1316        struct dmar_domain *domain;
1317
1318        domain = alloc_domain_mem();
1319        if (!domain)
1320                return NULL;
1321
1322        domain->nid = -1;
1323        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1324        domain->flags = 0;
1325
1326        return domain;
1327}
1328
1329static int iommu_attach_domain(struct dmar_domain *domain,
1330                               struct intel_iommu *iommu)
1331{
1332        int num;
1333        unsigned long ndomains;
1334        unsigned long flags;
1335
1336        ndomains = cap_ndoms(iommu->cap);
1337
1338        spin_lock_irqsave(&iommu->lock, flags);
1339
1340        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1341        if (num >= ndomains) {
1342                spin_unlock_irqrestore(&iommu->lock, flags);
1343                printk(KERN_ERR "IOMMU: no free domain ids\n");
1344                return -ENOMEM;
1345        }
1346
1347        domain->id = num;
1348        set_bit(num, iommu->domain_ids);
1349        set_bit(iommu->seq_id, domain->iommu_bmp);
1350        iommu->domains[num] = domain;
1351        spin_unlock_irqrestore(&iommu->lock, flags);
1352
1353        return 0;
1354}
1355
1356static void iommu_detach_domain(struct dmar_domain *domain,
1357                                struct intel_iommu *iommu)
1358{
1359        unsigned long flags;
1360        int num, ndomains;
1361        int found = 0;
1362
1363        spin_lock_irqsave(&iommu->lock, flags);
1364        ndomains = cap_ndoms(iommu->cap);
1365        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1366                if (iommu->domains[num] == domain) {
1367                        found = 1;
1368                        break;
1369                }
1370        }
1371
1372        if (found) {
1373                clear_bit(num, iommu->domain_ids);
1374                clear_bit(iommu->seq_id, domain->iommu_bmp);
1375                iommu->domains[num] = NULL;
1376        }
1377        spin_unlock_irqrestore(&iommu->lock, flags);
1378}
1379
1380static struct iova_domain reserved_iova_list;
1381static struct lock_class_key reserved_rbtree_key;
1382
1383static int dmar_init_reserved_ranges(void)
1384{
1385        struct pci_dev *pdev = NULL;
1386        struct iova *iova;
1387        int i;
1388
1389        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1390
1391        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1392                &reserved_rbtree_key);
1393
1394        /* IOAPIC ranges shouldn't be accessed by DMA */
1395        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1396                IOVA_PFN(IOAPIC_RANGE_END));
1397        if (!iova) {
1398                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1399                return -ENODEV;
1400        }
1401
1402        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1403        for_each_pci_dev(pdev) {
1404                struct resource *r;
1405
1406                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1407                        r = &pdev->resource[i];
1408                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1409                                continue;
1410                        iova = reserve_iova(&reserved_iova_list,
1411                                            IOVA_PFN(r->start),
1412                                            IOVA_PFN(r->end));
1413                        if (!iova) {
1414                                printk(KERN_ERR "Reserve iova failed\n");
1415                                return -ENODEV;
1416                        }
1417                }
1418        }
1419        return 0;
1420}
1421
1422static void domain_reserve_special_ranges(struct dmar_domain *domain)
1423{
1424        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1425}
1426
1427static inline int guestwidth_to_adjustwidth(int gaw)
1428{
1429        int agaw;
1430        int r = (gaw - 12) % 9;
1431
1432        if (r == 0)
1433                agaw = gaw;
1434        else
1435                agaw = gaw + 9 - r;
1436        if (agaw > 64)
1437                agaw = 64;
1438        return agaw;
1439}
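     /*
      * guestwidth_to_adjustwidth() rounds a guest address width up to the
      * next width expressible as 12 + 9*n (a whole number of page-table
      * levels), capped at 64.  For example, a 36-bit guest width becomes 39,
      * while 48 (= 12 + 36) already fits and is returned unchanged.
      */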
1440
1441static int domain_init(struct dmar_domain *domain, int guest_width)
1442{
1443        struct intel_iommu *iommu;
1444        int adjust_width, agaw;
1445        unsigned long sagaw;
1446
1447        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1448        spin_lock_init(&domain->iommu_lock);
1449
1450        domain_reserve_special_ranges(domain);
1451
1452        /* calculate AGAW */
1453        iommu = domain_get_iommu(domain);
1454        if (guest_width > cap_mgaw(iommu->cap))
1455                guest_width = cap_mgaw(iommu->cap);
1456        domain->gaw = guest_width;
1457        adjust_width = guestwidth_to_adjustwidth(guest_width);
1458        agaw = width_to_agaw(adjust_width);
1459        sagaw = cap_sagaw(iommu->cap);
1460        if (!test_bit(agaw, &sagaw)) {
1461                /* hardware doesn't support it, choose a bigger one */
1462                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1463                agaw = find_next_bit(&sagaw, 5, agaw);
1464                if (agaw >= 5)
1465                        return -ENODEV;
1466        }
1467        domain->agaw = agaw;
1468        INIT_LIST_HEAD(&domain->devices);
1469
1470        if (ecap_coherent(iommu->ecap))
1471                domain->iommu_coherency = 1;
1472        else
1473                domain->iommu_coherency = 0;
1474
1475        if (ecap_sc_support(iommu->ecap))
1476                domain->iommu_snooping = 1;
1477        else
1478                domain->iommu_snooping = 0;
1479
1480        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1481        domain->iommu_count = 1;
1482        domain->nid = iommu->node;
1483
1484        /* always allocate the top pgd */
1485        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1486        if (!domain->pgd)
1487                return -ENOMEM;
1488        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1489        return 0;
1490}
1491
1492static void domain_exit(struct dmar_domain *domain)
1493{
1494        struct dmar_drhd_unit *drhd;
1495        struct intel_iommu *iommu;
1496
 1497        /* Domain 0 is reserved, so don't process it */
1498        if (!domain)
1499                return;
1500
1501        /* Flush any lazy unmaps that may reference this domain */
1502        if (!intel_iommu_strict)
1503                flush_unmaps_timeout(0);
1504
1505        domain_remove_dev_info(domain);
1506        /* destroy iovas */
1507        put_iova_domain(&domain->iovad);
1508
1509        /* clear ptes */
1510        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1511
1512        /* free page tables */
1513        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1514
1515        for_each_active_iommu(iommu, drhd)
1516                if (test_bit(iommu->seq_id, domain->iommu_bmp))
1517                        iommu_detach_domain(domain, iommu);
1518
1519        free_domain_mem(domain);
1520}
1521
1522static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1523                                 u8 bus, u8 devfn, int translation)
1524{
1525        struct context_entry *context;
1526        unsigned long flags;
1527        struct intel_iommu *iommu;
1528        struct dma_pte *pgd;
1529        unsigned long num;
1530        unsigned long ndomains;
1531        int id;
1532        int agaw;
1533        struct device_domain_info *info = NULL;
1534
1535        pr_debug("Set context mapping for %02x:%02x.%d\n",
1536                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1537
1538        BUG_ON(!domain->pgd);
1539        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1540               translation != CONTEXT_TT_MULTI_LEVEL);
1541
1542        iommu = device_to_iommu(segment, bus, devfn);
1543        if (!iommu)
1544                return -ENODEV;
1545
1546        context = device_to_context_entry(iommu, bus, devfn);
1547        if (!context)
1548                return -ENOMEM;
1549        spin_lock_irqsave(&iommu->lock, flags);
1550        if (context_present(context)) {
1551                spin_unlock_irqrestore(&iommu->lock, flags);
1552                return 0;
1553        }
1554
1555        id = domain->id;
1556        pgd = domain->pgd;
1557
1558        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1559            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1560                int found = 0;
1561
1562                /* find an available domain id for this device in iommu */
1563                ndomains = cap_ndoms(iommu->cap);
1564                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1565                        if (iommu->domains[num] == domain) {
1566                                id = num;
1567                                found = 1;
1568                                break;
1569                        }
1570                }
1571
1572                if (found == 0) {
1573                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1574                        if (num >= ndomains) {
1575                                spin_unlock_irqrestore(&iommu->lock, flags);
1576                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1577                                return -EFAULT;
1578                        }
1579
1580                        set_bit(num, iommu->domain_ids);
1581                        iommu->domains[num] = domain;
1582                        id = num;
1583                }
1584
1585                /* Skip the top levels of the page tables for an
1586                 * IOMMU whose AGAW is smaller than the domain's.
1587                 * Unnecessary for PT mode.
1588                 */
1589                if (translation != CONTEXT_TT_PASS_THROUGH) {
1590                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1591                                pgd = phys_to_virt(dma_pte_addr(pgd));
1592                                if (!dma_pte_present(pgd)) {
1593                                        spin_unlock_irqrestore(&iommu->lock, flags);
1594                                        return -ENOMEM;
1595                                }
1596                        }
1597                }
1598        }
1599
1600        context_set_domain_id(context, id);
1601
1602        if (translation != CONTEXT_TT_PASS_THROUGH) {
1603                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1604                translation = info ? CONTEXT_TT_DEV_IOTLB :
1605                                     CONTEXT_TT_MULTI_LEVEL;
1606        }
1607        /*
1608         * In pass through mode, AW must be programmed to indicate the largest
1609         * AGAW value supported by hardware. And ASR is ignored by hardware.
1610         */
1611        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1612                context_set_address_width(context, iommu->msagaw);
1613        else {
1614                context_set_address_root(context, virt_to_phys(pgd));
1615                context_set_address_width(context, iommu->agaw);
1616        }
1617
1618        context_set_translation_type(context, translation);
1619        context_set_fault_enable(context);
1620        context_set_present(context);
1621        domain_flush_cache(domain, context, sizeof(*context));
1622
1623        /*
1624         * It's a non-present to present mapping. If hardware doesn't cache
1625         * non-present entries we only need to flush the write-buffer. If it
1626         * _does_ cache non-present entries, then it does so in the special
1627         * domain #0, which we have to flush:
1628         */
1629        if (cap_caching_mode(iommu->cap)) {
1630                iommu->flush.flush_context(iommu, 0,
1631                                           (((u16)bus) << 8) | devfn,
1632                                           DMA_CCMD_MASK_NOBIT,
1633                                           DMA_CCMD_DEVICE_INVL);
1634                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1635        } else {
1636                iommu_flush_write_buffer(iommu);
1637        }
1638        iommu_enable_dev_iotlb(info);
1639        spin_unlock_irqrestore(&iommu->lock, flags);
1640
1641        spin_lock_irqsave(&domain->iommu_lock, flags);
1642        if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1643                domain->iommu_count++;
1644                if (domain->iommu_count == 1)
1645                        domain->nid = iommu->node;
1646                domain_update_iommu_cap(domain);
1647        }
1648        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1649        return 0;
1650}
1651
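            /*
             * Set up context entries for @pdev itself and, if it sits behind a
             * PCIe-to-PCI bridge, for every bridge on the path as well, so that
             * requests carrying a bridge's source-id are translated by the same
             * domain.
             */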
1652static int
1653domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1654                        int translation)
1655{
1656        int ret;
1657        struct pci_dev *tmp, *parent;
1658
1659        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1660                                         pdev->bus->number, pdev->devfn,
1661                                         translation);
1662        if (ret)
1663                return ret;
1664
1665        /* dependent device mapping */
1666        tmp = pci_find_upstream_pcie_bridge(pdev);
1667        if (!tmp)
1668                return 0;
1669        /* Secondary interface's bus number and devfn 0 */
1670        parent = pdev->bus->self;
1671        while (parent != tmp) {
1672                ret = domain_context_mapping_one(domain,
1673                                                 pci_domain_nr(parent->bus),
1674                                                 parent->bus->number,
1675                                                 parent->devfn, translation);
1676                if (ret)
1677                        return ret;
1678                parent = parent->bus->self;
1679        }
1680        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1681                return domain_context_mapping_one(domain,
1682                                        pci_domain_nr(tmp->subordinate),
1683                                        tmp->subordinate->number, 0,
1684                                        translation);
1685        else /* this is a legacy PCI bridge */
1686                return domain_context_mapping_one(domain,
1687                                                  pci_domain_nr(tmp->bus),
1688                                                  tmp->bus->number,
1689                                                  tmp->devfn,
1690                                                  translation);
1691}
1692
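            /*
             * Return non-zero if a context entry is already present for @pdev
             * and, when the device sits behind a PCIe-to-PCI bridge, for each
             * bridge on the path as well (or -ENODEV if no IOMMU covers the
             * device).
             */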
1693static int domain_context_mapped(struct pci_dev *pdev)
1694{
1695        int ret;
1696        struct pci_dev *tmp, *parent;
1697        struct intel_iommu *iommu;
1698
1699        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1700                                pdev->devfn);
1701        if (!iommu)
1702                return -ENODEV;
1703
1704        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1705        if (!ret)
1706                return ret;
1707        /* dependent device mapping */
1708        tmp = pci_find_upstream_pcie_bridge(pdev);
1709        if (!tmp)
1710                return ret;
1711        /* Secondary interface's bus number and devfn 0 */
1712        parent = pdev->bus->self;
1713        while (parent != tmp) {
1714                ret = device_context_mapped(iommu, parent->bus->number,
1715                                            parent->devfn);
1716                if (!ret)
1717                        return ret;
1718                parent = parent->bus->self;
1719        }
1720        if (pci_is_pcie(tmp))
1721                return device_context_mapped(iommu, tmp->subordinate->number,
1722                                             0);
1723        else
1724                return device_context_mapped(iommu, tmp->bus->number,
1725                                             tmp->devfn);
1726}
1727
1728/* Returns a number of VTD pages, but aligned to MM page size */
1729static inline unsigned long aligned_nrpages(unsigned long host_addr,
1730                                            size_t size)
1731{
1732        host_addr &= ~PAGE_MASK;
1733        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1734}
1735
1736/* Return largest possible superpage level for a given mapping */
1737static inline int hardware_largepage_caps(struct dmar_domain *domain,
1738                                          unsigned long iov_pfn,
1739                                          unsigned long phy_pfn,
1740                                          unsigned long pages)
1741{
1742        int support, level = 1;
1743        unsigned long pfnmerge;
1744
1745        support = domain->iommu_superpage;
1746
1747        /* To use a large page, the virtual *and* physical addresses
1748           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1749           of them will mean we have to use smaller pages. So just
1750           merge them and check both at once. */
1751        pfnmerge = iov_pfn | phy_pfn;
1752
1753        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1754                pages >>= VTD_STRIDE_SHIFT;
1755                if (!pages)
1756                        break;
1757                pfnmerge >>= VTD_STRIDE_SHIFT;
1758                level++;
1759                support--;
1760        }
1761        return level;
1762}
1763
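            /*
             * Core mapping routine: install PTEs for @nr_pages starting at
             * @iov_pfn, taking the physical pages either from @sg (scatterlist
             * mapping) or from the contiguous range starting at @phys_pfn.
             * The largest superpage level the hardware and alignment allow is
             * used for each chunk, and the CPU cache is flushed for every page
             * of PTEs that gets completed.
             */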
1764static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1765                            struct scatterlist *sg, unsigned long phys_pfn,
1766                            unsigned long nr_pages, int prot)
1767{
1768        struct dma_pte *first_pte = NULL, *pte = NULL;
1769        phys_addr_t uninitialized_var(pteval);
1770        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1771        unsigned long sg_res;
1772        unsigned int largepage_lvl = 0;
1773        unsigned long lvl_pages = 0;
1774
1775        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1776
1777        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1778                return -EINVAL;
1779
1780        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1781
1782        if (sg)
1783                sg_res = 0;
1784        else {
1785                sg_res = nr_pages + 1;
1786                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1787        }
1788
1789        while (nr_pages > 0) {
1790                uint64_t tmp;
1791
1792                if (!sg_res) {
1793                        sg_res = aligned_nrpages(sg->offset, sg->length);
1794                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1795                        sg->dma_length = sg->length;
1796                        pteval = page_to_phys(sg_page(sg)) | prot;
1797                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1798                }
1799
1800                if (!pte) {
1801                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1802
1803                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1804                        if (!pte)
1805                                return -ENOMEM;
1806                        /* It is a large page */
1807                        if (largepage_lvl > 1) {
1808                                pteval |= DMA_PTE_LARGE_PAGE;
1809                                /* Ensure that old small page tables are removed to make room
1810                                   for the superpage, if they exist. */
1811                                dma_pte_clear_range(domain, iov_pfn,
1812                                                    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1813                                dma_pte_free_pagetable(domain, iov_pfn,
1814                                                       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1815                        } else {
1816                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1817                        }
1818
1819                }
1820                /* We don't need a lock here; nobody else
1821                 * touches this iova range.
1822                 */
1823                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1824                if (tmp) {
1825                        static int dumps = 5;
1826                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1827                               iov_pfn, tmp, (unsigned long long)pteval);
1828                        if (dumps) {
1829                                dumps--;
1830                                debug_dma_dump_mappings(NULL);
1831                        }
1832                        WARN_ON(1);
1833                }
1834
1835                lvl_pages = lvl_to_nr_pages(largepage_lvl);
1836
1837                BUG_ON(nr_pages < lvl_pages);
1838                BUG_ON(sg_res < lvl_pages);
1839
1840                nr_pages -= lvl_pages;
1841                iov_pfn += lvl_pages;
1842                phys_pfn += lvl_pages;
1843                pteval += lvl_pages * VTD_PAGE_SIZE;
1844                sg_res -= lvl_pages;
1845
1846                /* If the next PTE would be the first in a new page, then we
1847                   need to flush the cache on the entries we've just written.
1848                   And then we'll need to recalculate 'pte', so clear it and
1849                   let it get set again in the if (!pte) block above.
1850
1851                   If we're done (!nr_pages) we need to flush the cache too.
1852
1853                   Also if we've been setting superpages, we may need to
1854                   recalculate 'pte' and switch back to smaller pages for the
1855                   end of the mapping, if the trailing size is not enough to
1856                   use another superpage (i.e. sg_res < lvl_pages). */
1857                pte++;
1858                if (!nr_pages || first_pte_in_page(pte) ||
1859                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1860                        domain_flush_cache(domain, first_pte,
1861                                           (void *)pte - (void *)first_pte);
1862                        pte = NULL;
1863                }
1864
1865                if (!sg_res && nr_pages)
1866                        sg = sg_next(sg);
1867        }
1868        return 0;
1869}
1870
1871static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1872                                    struct scatterlist *sg, unsigned long nr_pages,
1873                                    int prot)
1874{
1875        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1876}
1877
1878static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1879                                     unsigned long phys_pfn, unsigned long nr_pages,
1880                                     int prot)
1881{
1882        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1883}
1884
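            /*
             * Tear down the context entry for (bus, devfn) on @iommu and do a
             * global context-cache and IOTLB invalidation afterwards.
             */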
1885static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1886{
1887        if (!iommu)
1888                return;
1889
1890        clear_context_table(iommu, bus, devfn);
1891        iommu->flush.flush_context(iommu, 0, 0, 0,
1892                                           DMA_CCMD_GLOBAL_INVL);
1893        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1894}
1895
1896static inline void unlink_domain_info(struct device_domain_info *info)
1897{
1898        assert_spin_locked(&device_domain_lock);
1899        list_del(&info->link);
1900        list_del(&info->global);
1901        if (info->dev)
1902                info->dev->dev.archdata.iommu = NULL;
1903}
1904
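            /*
             * Detach every device from @domain: unlink each device_domain_info,
             * disable its device IOTLB, clear its context entry and free it.
             * The device_domain_lock is dropped around the per-device teardown.
             */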
1905static void domain_remove_dev_info(struct dmar_domain *domain)
1906{
1907        struct device_domain_info *info;
1908        unsigned long flags;
1909        struct intel_iommu *iommu;
1910
1911        spin_lock_irqsave(&device_domain_lock, flags);
1912        while (!list_empty(&domain->devices)) {
1913                info = list_entry(domain->devices.next,
1914                        struct device_domain_info, link);
1915                unlink_domain_info(info);
1916                spin_unlock_irqrestore(&device_domain_lock, flags);
1917
1918                iommu_disable_dev_iotlb(info);
1919                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1920                iommu_detach_dev(iommu, info->bus, info->devfn);
1921                free_devinfo_mem(info);
1922
1923                spin_lock_irqsave(&device_domain_lock, flags);
1924        }
1925        spin_unlock_irqrestore(&device_domain_lock, flags);
1926}
1927
1928/*
1929 * find_domain
1930 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1931 */
1932static struct dmar_domain *
1933find_domain(struct pci_dev *pdev)
1934{
1935        struct device_domain_info *info;
1936
1937        /* No lock here, assumes no domain exit in normal case */
1938        info = pdev->dev.archdata.iommu;
1939        if (info)
1940                return info->domain;
1941        return NULL;
1942}
1943
1944/* Find or allocate a domain for @pdev; the returned domain is initialized. */
1945static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1946{
1947        struct dmar_domain *domain, *found = NULL;
1948        struct intel_iommu *iommu;
1949        struct dmar_drhd_unit *drhd;
1950        struct device_domain_info *info, *tmp;
1951        struct pci_dev *dev_tmp;
1952        unsigned long flags;
1953        int bus = 0, devfn = 0;
1954        int segment;
1955        int ret;
1956
1957        domain = find_domain(pdev);
1958        if (domain)
1959                return domain;
1960
1961        segment = pci_domain_nr(pdev->bus);
1962
1963        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1964        if (dev_tmp) {
1965                if (pci_is_pcie(dev_tmp)) {
1966                        bus = dev_tmp->subordinate->number;
1967                        devfn = 0;
1968                } else {
1969                        bus = dev_tmp->bus->number;
1970                        devfn = dev_tmp->devfn;
1971                }
1972                spin_lock_irqsave(&device_domain_lock, flags);
1973                list_for_each_entry(info, &device_domain_list, global) {
1974                        if (info->segment == segment &&
1975                            info->bus == bus && info->devfn == devfn) {
1976                                found = info->domain;
1977                                break;
1978                        }
1979                }
1980                spin_unlock_irqrestore(&device_domain_lock, flags);
1981                /* pcie-pci bridge already has a domain, use it */
1982                if (found) {
1983                        domain = found;
1984                        goto found_domain;
1985                }
1986        }
1987
1988        domain = alloc_domain();
1989        if (!domain)
1990                goto error;
1991
1992        /* Allocate new domain for the device */
1993        drhd = dmar_find_matched_drhd_unit(pdev);
1994        if (!drhd) {
1995                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1996                        pci_name(pdev));
1997                free_domain_mem(domain);
1998                return NULL;
1999        }
2000        iommu = drhd->iommu;
2001
2002        ret = iommu_attach_domain(domain, iommu);
2003        if (ret) {
2004                free_domain_mem(domain);
2005                goto error;
2006        }
2007
2008        if (domain_init(domain, gaw)) {
2009                domain_exit(domain);
2010                goto error;
2011        }
2012
2013        /* register pcie-to-pci device */
2014        if (dev_tmp) {
2015                info = alloc_devinfo_mem();
2016                if (!info) {
2017                        domain_exit(domain);
2018                        goto error;
2019                }
2020                info->segment = segment;
2021                info->bus = bus;
2022                info->devfn = devfn;
2023                info->dev = NULL;
2024                info->domain = domain;
2025                /* This domain is shared by devices under p2p bridge */
2026                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2027
2028                /* pcie-to-pci bridge already has a domain, use it */
2029                found = NULL;
2030                spin_lock_irqsave(&device_domain_lock, flags);
2031                list_for_each_entry(tmp, &device_domain_list, global) {
2032                        if (tmp->segment == segment &&
2033                            tmp->bus == bus && tmp->devfn == devfn) {
2034                                found = tmp->domain;
2035                                break;
2036                        }
2037                }
2038                if (found) {
2039                        spin_unlock_irqrestore(&device_domain_lock, flags);
2040                        free_devinfo_mem(info);
2041                        domain_exit(domain);
2042                        domain = found;
2043                } else {
2044                        list_add(&info->link, &domain->devices);
2045                        list_add(&info->global, &device_domain_list);
2046                        spin_unlock_irqrestore(&device_domain_lock, flags);
2047                }
2048        }
2049
2050found_domain:
2051        info = alloc_devinfo_mem();
2052        if (!info)
2053                goto error;
2054        info->segment = segment;
2055        info->bus = pdev->bus->number;
2056        info->devfn = pdev->devfn;
2057        info->dev = pdev;
2058        info->domain = domain;
2059        spin_lock_irqsave(&device_domain_lock, flags);
2060        /* somebody else raced us and set up the domain first */
2061        found = find_domain(pdev);
2062        if (found != NULL) {
2063                spin_unlock_irqrestore(&device_domain_lock, flags);
2064                if (found != domain) {
2065                        domain_exit(domain);
2066                        domain = found;
2067                }
2068                free_devinfo_mem(info);
2069                return domain;
2070        }
2071        list_add(&info->link, &domain->devices);
2072        list_add(&info->global, &device_domain_list);
2073        pdev->dev.archdata.iommu = info;
2074        spin_unlock_irqrestore(&device_domain_lock, flags);
2075        return domain;
2076error:
2077        /* recheck it here, maybe others set it */
2078        return find_domain(pdev);
2079}
2080
2081static int iommu_identity_mapping;
2082#define IDENTMAP_ALL            1
2083#define IDENTMAP_GFX            2
2084#define IDENTMAP_AZALIA         4
2085
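            /*
             * Reserve the IOVA range [start, end] in @domain and install a 1:1
             * (iova == physical) read/write mapping for it, clearing any PTEs
             * that already cover the range.
             */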
2086static int iommu_domain_identity_map(struct dmar_domain *domain,
2087                                     unsigned long long start,
2088                                     unsigned long long end)
2089{
2090        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2091        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2092
2093        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2094                          dma_to_mm_pfn(last_vpfn))) {
2095                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2096                return -ENOMEM;
2097        }
2098
2099        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2100                 start, end, domain->id);
2101        /*
2102         * RMRR range might have overlap with physical memory range,
2103         * clear it first
2104         */
2105        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2106
2107        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2108                                  last_vpfn - first_vpfn + 1,
2109                                  DMA_PTE_READ|DMA_PTE_WRITE);
2110}
2111
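            /*
             * Find or create a domain for @pdev and identity-map [start, end]
             * (typically an RMRR region) into it, after sanity-checking the
             * range against the domain's address width.  Skipped entirely for
             * devices already in the si_domain with hardware pass-through.
             */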
2112static int iommu_prepare_identity_map(struct pci_dev *pdev,
2113                                      unsigned long long start,
2114                                      unsigned long long end)
2115{
2116        struct dmar_domain *domain;
2117        int ret;
2118
2119        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2120        if (!domain)
2121                return -ENOMEM;
2122
2123        /* For _hardware_ passthrough, don't bother. But for software
2124           passthrough, we do it anyway -- it may indicate a memory
2125           range which is reserved in E820, and so didn't get set
2126           up in si_domain to start with */
2127        if (domain == si_domain && hw_pass_through) {
2128                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2129                       pci_name(pdev), start, end);
2130                return 0;
2131        }
2132
2133        printk(KERN_INFO
2134               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2135               pci_name(pdev), start, end);
2136
2137        if (end < start) {
2138                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2139                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2140                        dmi_get_system_info(DMI_BIOS_VENDOR),
2141                        dmi_get_system_info(DMI_BIOS_VERSION),
2142                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2143                ret = -EIO;
2144                goto error;
2145        }
2146
2147        if (end >> agaw_to_width(domain->agaw)) {
2148                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2149                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2150                     agaw_to_width(domain->agaw),
2151                     dmi_get_system_info(DMI_BIOS_VENDOR),
2152                     dmi_get_system_info(DMI_BIOS_VERSION),
2153                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2154                ret = -EIO;
2155                goto error;
2156        }
2157
2158        ret = iommu_domain_identity_map(domain, start, end);
2159        if (ret)
2160                goto error;
2161
2162        /* context entry init */
2163        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2164        if (ret)
2165                goto error;
2166
2167        return 0;
2168
2169 error:
2170        domain_exit(domain);
2171        return ret;
2172}
2173
2174static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2175        struct pci_dev *pdev)
2176{
2177        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2178                return 0;
2179        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2180                rmrr->end_address);
2181}
2182
2183#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2184static inline void iommu_prepare_isa(void)
2185{
2186        struct pci_dev *pdev;
2187        int ret;
2188
2189        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2190        if (!pdev)
2191                return;
2192
2193        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2194        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2195
2196        if (ret)
2197                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2198                       "floppy might not work\n");
2199
2200}
2201#else
2202static inline void iommu_prepare_isa(void)
2203{
2204        return;
2205}
2206#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2207
2208static int md_domain_init(struct dmar_domain *domain, int guest_width);
2209
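            /*
             * Allocate the static-identity (si) domain, attach it to every
             * active IOMMU and, unless hardware pass-through is in use,
             * identity-map all usable memory ranges of every online node.
             */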
2210static int __init si_domain_init(int hw)
2211{
2212        struct dmar_drhd_unit *drhd;
2213        struct intel_iommu *iommu;
2214        int nid, ret = 0;
2215
2216        si_domain = alloc_domain();
2217        if (!si_domain)
2218                return -EFAULT;
2219
2220        for_each_active_iommu(iommu, drhd) {
2221                ret = iommu_attach_domain(si_domain, iommu);
2222                if (ret) {
2223                        domain_exit(si_domain);
2224                        return -EFAULT;
2225                }
2226        }
2227
2228        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2229                domain_exit(si_domain);
2230                return -EFAULT;
2231        }
2232
2233        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2234        pr_debug("IOMMU: identity mapping domain is domain %d\n",
2235                 si_domain->id);
2236
2237        if (hw)
2238                return 0;
2239
2240        for_each_online_node(nid) {
2241                unsigned long start_pfn, end_pfn;
2242                int i;
2243
2244                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2245                        ret = iommu_domain_identity_map(si_domain,
2246                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2247                        if (ret)
2248                                return ret;
2249                }
2250        }
2251
2252        return 0;
2253}
2254
2255static void domain_remove_one_dev_info(struct dmar_domain *domain,
2256                                          struct pci_dev *pdev);
2257static int identity_mapping(struct pci_dev *pdev)
2258{
2259        struct device_domain_info *info;
2260
2261        if (likely(!iommu_identity_mapping))
2262                return 0;
2263
2264        info = pdev->dev.archdata.iommu;
2265        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2266                return (info->domain == si_domain);
2267
2268        return 0;
2269}
2270
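            /*
             * Attach @pdev to @domain: allocate and link a device_domain_info,
             * then program the context entries.  The bookkeeping is undone if
             * the context mapping fails.
             */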
2271static int domain_add_dev_info(struct dmar_domain *domain,
2272                               struct pci_dev *pdev,
2273                               int translation)
2274{
2275        struct device_domain_info *info;
2276        unsigned long flags;
2277        int ret;
2278
2279        info = alloc_devinfo_mem();
2280        if (!info)
2281                return -ENOMEM;
2282
2283        info->segment = pci_domain_nr(pdev->bus);
2284        info->bus = pdev->bus->number;
2285        info->devfn = pdev->devfn;
2286        info->dev = pdev;
2287        info->domain = domain;
2288
2289        spin_lock_irqsave(&device_domain_lock, flags);
2290        list_add(&info->link, &domain->devices);
2291        list_add(&info->global, &device_domain_list);
2292        pdev->dev.archdata.iommu = info;
2293        spin_unlock_irqrestore(&device_domain_lock, flags);
2294
2295        ret = domain_context_mapping(domain, pdev, translation);
2296        if (ret) {
2297                spin_lock_irqsave(&device_domain_lock, flags);
2298                unlink_domain_info(info);
2299                spin_unlock_irqrestore(&device_domain_lock, flags);
2300                free_devinfo_mem(info);
2301                return ret;
2302        }
2303
2304        return 0;
2305}
2306
2307static bool device_has_rmrr(struct pci_dev *dev)
2308{
2309        struct dmar_rmrr_unit *rmrr;
2310        int i;
2311
2312        for_each_rmrr_units(rmrr) {
2313                for (i = 0; i < rmrr->devices_cnt; i++) {
2314                        /*
2315                         * Return TRUE if this RMRR contains the device that
2316                         * is passed in.
2317                         */
2318                        if (rmrr->devices[i] == dev)
2319                                return true;
2320                }
2321        }
2322        return false;
2323}
2324
2325static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2326{
2327
2328        /*
2329         * We want to prevent any device associated with an RMRR from
2330         * getting placed into the SI Domain. This is done because
2331         * problems exist when devices are moved in and out of domains
2332         * and their respective RMRR info is lost. We exempt USB devices
2333         * from this process due to their usage of RMRRs that are known
2334         * to not be needed after BIOS hand-off to OS.
2335         */
2336        if (device_has_rmrr(pdev) &&
2337            (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2338                return 0;
2339
2340        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2341                return 1;
2342
2343        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2344                return 1;
2345
2346        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2347                return 0;
2348
2349        /*
2350         * We want to start off with all devices in the 1:1 domain, and
2351         * take them out later if we find they can't access all of memory.
2352         *
2353         * However, we can't do this for PCI devices behind bridges,
2354         * because all PCI devices behind the same bridge will end up
2355         * with the same source-id on their transactions.
2356         *
2357         * Practically speaking, we can't change things around for these
2358         * devices at run-time, because we can't be sure there'll be no
2359         * DMA transactions in flight for any of their siblings.
2360         *
2361         * So PCI devices (unless they're on the root bus) as well as
2362         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2363         * the 1:1 domain, just in _case_ one of their siblings turns out
2364         * not to be able to map all of memory.
2365         */
2366        if (!pci_is_pcie(pdev)) {
2367                if (!pci_is_root_bus(pdev->bus))
2368                        return 0;
2369                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2370                        return 0;
2371        } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2372                return 0;
2373
2374        /*
2375         * At boot time, we don't yet know if devices will be 64-bit capable.
2376         * Assume that they will -- if they turn out not to be, then we can
2377         * take them out of the 1:1 domain later.
2378         */
2379        if (!startup) {
2380                /*
2381                 * If the device's dma_mask is less than the system's memory
2382                 * size then this is not a candidate for identity mapping.
2383                 */
2384                u64 dma_mask = pdev->dma_mask;
2385
2386                if (pdev->dev.coherent_dma_mask &&
2387                    pdev->dev.coherent_dma_mask < dma_mask)
2388                        dma_mask = pdev->dev.coherent_dma_mask;
2389
2390                return dma_mask >= dma_get_required_mask(&pdev->dev);
2391        }
2392
2393        return 1;
2394}
2395
2396static int __init iommu_prepare_static_identity_mapping(int hw)
2397{
2398        struct pci_dev *pdev = NULL;
2399        int ret;
2400
2401        ret = si_domain_init(hw);
2402        if (ret)
2403                return -EFAULT;
2404
2405        for_each_pci_dev(pdev) {
2406                if (iommu_should_identity_map(pdev, 1)) {
2407                        ret = domain_add_dev_info(si_domain, pdev,
2408                                             hw ? CONTEXT_TT_PASS_THROUGH :
2409                                                  CONTEXT_TT_MULTI_LEVEL);
2410                        if (ret) {
2411                                /* device not associated with an iommu */
2412                                if (ret == -ENODEV)
2413                                        continue;
2414                                return ret;
2415                        }
2416                        pr_info("IOMMU: %s identity mapping for device %s\n",
2417                                hw ? "hardware" : "software", pci_name(pdev));
2418                }
2419        }
2420
2421        return 0;
2422}
2423
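            /*
             * Boot-time setup: count the IOMMUs, allocate the global arrays,
             * set up root entries and the invalidation machinery, build the
             * identity/RMRR/ISA mappings and finally enable translation on
             * every unit.
             */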
2424static int __init init_dmars(void)
2425{
2426        struct dmar_drhd_unit *drhd;
2427        struct dmar_rmrr_unit *rmrr;
2428        struct pci_dev *pdev;
2429        struct intel_iommu *iommu;
2430        int i, ret;
2431
2432        /*
2433         * for each drhd
2434         *    allocate root
2435         *    initialize and program root entry to not present
2436         * endfor
2437         */
2438        for_each_drhd_unit(drhd) {
2439                /*
2440                 * lock not needed as this is only incremented in the
2441                 * single-threaded kernel __init code path; all other
2442                 * accesses are read-only
2443                 */
2444                if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2445                        g_num_of_iommus++;
2446                        continue;
2447                }
2448                printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2449                          IOMMU_UNITS_SUPPORTED);
2450        }
2451
2452        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2453                        GFP_KERNEL);
2454        if (!g_iommus) {
2455                printk(KERN_ERR "Allocating global iommu array failed\n");
2456                ret = -ENOMEM;
2457                goto error;
2458        }
2459
2460        deferred_flush = kzalloc(g_num_of_iommus *
2461                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2462        if (!deferred_flush) {
2463                ret = -ENOMEM;
2464                goto error;
2465        }
2466
2467        for_each_active_iommu(iommu, drhd) {
2468                g_iommus[iommu->seq_id] = iommu;
2469
2470                ret = iommu_init_domains(iommu);
2471                if (ret)
2472                        goto error;
2473
2474                /*
2475                 * TBD:
2476                 * we could share the same root & context tables
2477                 * among all IOMMUs. Needs to be split later.
2478                 */
2479                ret = iommu_alloc_root_entry(iommu);
2480                if (ret) {
2481                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2482                        goto error;
2483                }
2484                if (!ecap_pass_through(iommu->ecap))
2485                        hw_pass_through = 0;
2486        }
2487
2488        /*
2489         * Start from a sane IOMMU hardware state.
2490         */
2491        for_each_active_iommu(iommu, drhd) {
2492                /*
2493                 * If the queued invalidation is already initialized by us
2494                 * (for example, while enabling interrupt-remapping) then
2495                 * things are already rolling from a sane state.
2496                 */
2497                if (iommu->qi)
2498                        continue;
2499
2500                /*
2501                 * Clear any previous faults.
2502                 */
2503                dmar_fault(-1, iommu);
2504                /*
2505                 * Disable queued invalidation if supported and already enabled
2506                 * before OS handover.
2507                 */
2508                dmar_disable_qi(iommu);
2509        }
2510
2511        for_each_active_iommu(iommu, drhd) {
2512                if (dmar_enable_qi(iommu)) {
2513                        /*
2514                         * Queued Invalidate not enabled, use Register Based
2515                         * Invalidate
2516                         */
2517                        iommu->flush.flush_context = __iommu_flush_context;
2518                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2519                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2520                               "invalidation\n",
2521                                iommu->seq_id,
2522                               (unsigned long long)drhd->reg_base_addr);
2523                } else {
2524                        iommu->flush.flush_context = qi_flush_context;
2525                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2526                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2527                               "invalidation\n",
2528                                iommu->seq_id,
2529                               (unsigned long long)drhd->reg_base_addr);
2530                }
2531        }
2532
2533        if (iommu_pass_through)
2534                iommu_identity_mapping |= IDENTMAP_ALL;
2535
2536#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2537        iommu_identity_mapping |= IDENTMAP_GFX;
2538#endif
2539
2540        check_tylersburg_isoch();
2541
2542        /*
2543         * If any identity mapping policy is in effect (pass-through, all
2544         * devices, gfx or azalia), set up the static identity (si) domain
2545         * and add the qualifying devices to it now.
2546         */
2547        if (iommu_identity_mapping) {
2548                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2549                if (ret) {
2550                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2551                        goto error;
2552                }
2553        }
2554        /*
2555         * For each rmrr
2556         *   for each dev attached to rmrr
2557         *   do
2558         *     locate drhd for dev, alloc domain for dev
2559         *     allocate free domain
2560         *     allocate page table entries for rmrr
2561         *     if context not allocated for bus
2562         *           allocate and init context
2563         *           set present in root table for this bus
2564         *     init context with domain, translation etc
2565         *    endfor
2566         * endfor
2567         */
2568        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2569        for_each_rmrr_units(rmrr) {
2570                for (i = 0; i < rmrr->devices_cnt; i++) {
2571                        pdev = rmrr->devices[i];
2572                        /*
2573                         * some BIOSes list non-existent devices in the DMAR
2574                         * table.
2575                         */
2576                        if (!pdev)
2577                                continue;
2578                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2579                        if (ret)
2580                                printk(KERN_ERR
2581                                       "IOMMU: mapping reserved region failed\n");
2582                }
2583        }
2584
2585        iommu_prepare_isa();
2586
2587        /*
2588         * for each drhd
2589         *   enable fault log
2590         *   global invalidate context cache
2591         *   global invalidate iotlb
2592         *   enable translation
2593         */
2594        for_each_iommu(iommu, drhd) {
2595                if (drhd->ignored) {
2596                        /*
2597                         * we always have to disable PMRs or DMA may fail on
2598                         * this device
2599                         */
2600                        if (force_on)
2601                                iommu_disable_protect_mem_regions(iommu);
2602                        continue;
2603                }
2604
2605                iommu_flush_write_buffer(iommu);
2606
2607                ret = dmar_set_interrupt(iommu);
2608                if (ret)
2609                        goto error;
2610
2611                iommu_set_root_entry(iommu);
2612
2613                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2614                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2615
2616                ret = iommu_enable_translation(iommu);
2617                if (ret)
2618                        goto error;
2619
2620                iommu_disable_protect_mem_regions(iommu);
2621        }
2622
2623        return 0;
2624error:
2625        for_each_active_iommu(iommu, drhd)
2626                free_dmar_iommu(iommu);
2627        kfree(deferred_flush);
2628        kfree(g_iommus);
2629        return ret;
2630}
2631
2632/* This takes a number of _MM_ pages, not VTD pages */
2633static struct iova *intel_alloc_iova(struct device *dev,
2634                                     struct dmar_domain *domain,
2635                                     unsigned long nrpages, uint64_t dma_mask)
2636{
2637        struct pci_dev *pdev = to_pci_dev(dev);
2638        struct iova *iova = NULL;
2639
2640        /* Restrict dma_mask to the width that the iommu can handle */
2641        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2642
2643        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2644                /*
2645                 * First try to allocate an io virtual address in
2646                 * DMA_BIT_MASK(32) and if that fails then try allocating
2647                 * from the higher range
2648                 */
2649                iova = alloc_iova(&domain->iovad, nrpages,
2650                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2651                if (iova)
2652                        return iova;
2653        }
2654        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2655        if (unlikely(!iova)) {
2656                printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2657                       nrpages, pci_name(pdev));
2658                return NULL;
2659        }
2660
2661        return iova;
2662}
2663
2664static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2665{
2666        struct dmar_domain *domain;
2667        int ret;
2668
2669        domain = get_domain_for_dev(pdev,
2670                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2671        if (!domain) {
2672                printk(KERN_ERR
2673                        "Allocating domain for %s failed\n", pci_name(pdev));
2674                return NULL;
2675        }
2676
2677        /* make sure context mapping is ok */
2678        if (unlikely(!domain_context_mapped(pdev))) {
2679                ret = domain_context_mapping(domain, pdev,
2680                                             CONTEXT_TT_MULTI_LEVEL);
2681                if (ret) {
2682                        printk(KERN_ERR
2683                                "Domain context map for %s failed\n",
2684                                pci_name(pdev));
2685                        return NULL;
2686                }
2687        }
2688
2689        return domain;
2690}
2691
2692static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2693{
2694        struct device_domain_info *info;
2695
2696        /* No lock here, assumes no domain exit in normal case */
2697        info = dev->dev.archdata.iommu;
2698        if (likely(info))
2699                return info->domain;
2700
2701        return __get_valid_domain_for_dev(dev);
2702}
2703
2704static int iommu_dummy(struct pci_dev *pdev)
2705{
2706        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2707}
2708
2709/* Check if the pdev needs to go through the non-identity map/unmap process. */
2710static int iommu_no_mapping(struct device *dev)
2711{
2712        struct pci_dev *pdev;
2713        int found;
2714
2715        if (unlikely(!dev_is_pci(dev)))
2716                return 1;
2717
2718        pdev = to_pci_dev(dev);
2719        if (iommu_dummy(pdev))
2720                return 1;
2721
2722        if (!iommu_identity_mapping)
2723                return 0;
2724
2725        found = identity_mapping(pdev);
2726        if (found) {
2727                if (iommu_should_identity_map(pdev, 0))
2728                        return 1;
2729                else {
2730                        /*
2731                         * A 32-bit DMA device is removed from si_domain and
2732                         * falls back to non-identity mapping.
2733                         */
2734                        domain_remove_one_dev_info(si_domain, pdev);
2735                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2736                               pci_name(pdev));
2737                        return 0;
2738                }
2739        } else {
2740                /*
2741                 * If a 64-bit DMA device was detached from a VM, the device
2742                 * is put into si_domain for identity mapping.
2743                 */
2744                if (iommu_should_identity_map(pdev, 0)) {
2745                        int ret;
2746                        ret = domain_add_dev_info(si_domain, pdev,
2747                                                  hw_pass_through ?
2748                                                  CONTEXT_TT_PASS_THROUGH :
2749                                                  CONTEXT_TT_MULTI_LEVEL);
2750                        if (!ret) {
2751                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2752                                       pci_name(pdev));
2753                                return 1;
2754                        }
2755                }
2756        }
2757
2758        return 0;
2759}
2760
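            /*
             * Map @size bytes at @paddr for DMA: allocate an IOVA below
             * @dma_mask, install the PTEs with permissions derived from @dir,
             * and flush the IOTLB (caching mode) or the write buffer.  Returns
             * the bus address to hand to the device, or 0 on failure.
             */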
2761static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2762                                     size_t size, int dir, u64 dma_mask)
2763{
2764        struct pci_dev *pdev = to_pci_dev(hwdev);
2765        struct dmar_domain *domain;
2766        phys_addr_t start_paddr;
2767        struct iova *iova;
2768        int prot = 0;
2769        int ret;
2770        struct intel_iommu *iommu;
2771        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2772
2773        BUG_ON(dir == DMA_NONE);
2774
2775        if (iommu_no_mapping(hwdev))
2776                return paddr;
2777
2778        domain = get_valid_domain_for_dev(pdev);
2779        if (!domain)
2780                return 0;
2781
2782        iommu = domain_get_iommu(domain);
2783        size = aligned_nrpages(paddr, size);
2784
2785        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2786        if (!iova)
2787                goto error;
2788
2789        /*
2790         * Check if DMAR supports zero-length reads on write-only
2791         * mappings.
2792         */
2793        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2794                        !cap_zlr(iommu->cap))
2795                prot |= DMA_PTE_READ;
2796        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2797                prot |= DMA_PTE_WRITE;
2798        /*
2799         * paddr to (paddr + size) might span a partial page; we should map the
2800         * whole page.  Note: if two parts of one page are separately mapped, we
2801         * might have two guest addresses mapping to the same host paddr; this
2802         * is not a big problem
2803         */
2804        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2805                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2806        if (ret)
2807                goto error;
2808
2809        /* it's a non-present to present mapping. Only flush if caching mode */
2810        if (cap_caching_mode(iommu->cap))
2811                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2812        else
2813                iommu_flush_write_buffer(iommu);
2814
2815        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2816        start_paddr += paddr & ~PAGE_MASK;
2817        return start_paddr;
2818
2819error:
2820        if (iova)
2821                __free_iova(&domain->iovad, iova);
2822        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2823                pci_name(pdev), size, (unsigned long long)paddr, dir);
2824        return 0;
2825}
2826
2827static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2828                                 unsigned long offset, size_t size,
2829                                 enum dma_data_direction dir,
2830                                 struct dma_attrs *attrs)
2831{
2832        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2833                                  dir, to_pci_dev(dev)->dma_mask);
2834}
2835
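            /*
             * Flush every pending deferred unmap: invalidate the IOTLB on each
             * IOMMU (per-IOVA in caching mode, globally otherwise) and free the
             * batched IOVAs.  Called with async_umap_flush_lock held.
             */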
2836static void flush_unmaps(void)
2837{
2838        int i, j;
2839
2840        timer_on = 0;
2841
2842        /* just flush them all */
2843        for (i = 0; i < g_num_of_iommus; i++) {
2844                struct intel_iommu *iommu = g_iommus[i];
2845                if (!iommu)
2846                        continue;
2847
2848                if (!deferred_flush[i].next)
2849                        continue;
2850
2851                /* In caching mode, global flushes make emulation expensive */
2852                if (!cap_caching_mode(iommu->cap))
2853                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2854                                         DMA_TLB_GLOBAL_FLUSH);
2855                for (j = 0; j < deferred_flush[i].next; j++) {
2856                        unsigned long mask;
2857                        struct iova *iova = deferred_flush[i].iova[j];
2858                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2859
2860                        /* On real hardware multiple invalidations are expensive */
2861                        if (cap_caching_mode(iommu->cap))
2862                                iommu_flush_iotlb_psi(iommu, domain->id,
2863                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2864                        else {
2865                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2866                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2867                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2868                        }
2869                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2870                }
2871                deferred_flush[i].next = 0;
2872        }
2873
2874        list_size = 0;
2875}
2876
2877static void flush_unmaps_timeout(unsigned long data)
2878{
2879        unsigned long flags;
2880
2881        spin_lock_irqsave(&async_umap_flush_lock, flags);
2882        flush_unmaps();
2883        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2884}
2885
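            /*
             * Queue @iova for deferred freeing on the owning IOMMU's batch,
             * flushing immediately once HIGH_WATER_MARK entries are pending,
             * and arm the flush timer if it is not already running.
             */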
2886static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2887{
2888        unsigned long flags;
2889        int next, iommu_id;
2890        struct intel_iommu *iommu;
2891
2892        spin_lock_irqsave(&async_umap_flush_lock, flags);
2893        if (list_size == HIGH_WATER_MARK)
2894                flush_unmaps();
2895
2896        iommu = domain_get_iommu(dom);
2897        iommu_id = iommu->seq_id;
2898
2899        next = deferred_flush[iommu_id].next;
2900        deferred_flush[iommu_id].domain[next] = dom;
2901        deferred_flush[iommu_id].iova[next] = iova;
2902        deferred_flush[iommu_id].next++;
2903
2904        if (!timer_on) {
2905                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2906                timer_on = 1;
2907        }
2908        list_size++;
2909        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2910}
2911
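            /*
             * dma_ops unmap_page callback: look up the IOVA backing @dev_addr,
             * clear the PTEs and page tables for it, then either flush the
             * IOTLB synchronously (intel_iommu_strict) or defer the flush and
             * the IOVA release via add_unmap().
             */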
2912static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2913                             size_t size, enum dma_data_direction dir,
2914                             struct dma_attrs *attrs)
2915{
2916        struct pci_dev *pdev = to_pci_dev(dev);
2917        struct dmar_domain *domain;
2918        unsigned long start_pfn, last_pfn;
2919        struct iova *iova;
2920        struct intel_iommu *iommu;
2921
2922        if (iommu_no_mapping(dev))
2923                return;
2924
2925        domain = find_domain(pdev);
2926        BUG_ON(!domain);
2927
2928        iommu = domain_get_iommu(domain);
2929
2930        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2931        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2932                      (unsigned long long)dev_addr))
2933                return;
2934
2935        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2936        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2937
2938        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2939                 pci_name(pdev), start_pfn, last_pfn);
2940
2941        /* clear the PTEs for the unmapped range */
2942        dma_pte_clear_range(domain, start_pfn, last_pfn);
2943
2944        /* free page tables */
2945        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2946
2947        if (intel_iommu_strict) {
2948                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2949                                      last_pfn - start_pfn + 1, 0);
2950                /* free iova */
2951                __free_iova(&domain->iovad, iova);
2952        } else {
2953                add_unmap(domain, iova);
2954                /*
2955                 * queue up the release of the unmap to save the ~1/6th of the
2956                 * cpu time used up by the iotlb flush operation...
2957                 */
2958        }
2959}
2960
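            /*
             * dma_ops alloc callback: grab zeroed pages (restricting the GFP
             * zone only when no IOMMU translation will be used) and map them
             * with __intel_map_single() against the coherent DMA mask.
             */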
2961static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2962                                  dma_addr_t *dma_handle, gfp_t flags,
2963                                  struct dma_attrs *attrs)
2964{
2965        void *vaddr;
2966        int order;
2967
2968        size = PAGE_ALIGN(size);
2969        order = get_order(size);
2970
2971        if (!iommu_no_mapping(hwdev))
2972                flags &= ~(GFP_DMA | GFP_DMA32);
2973        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2974                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2975                        flags |= GFP_DMA;
2976                else
2977                        flags |= GFP_DMA32;
2978        }
2979
2980        vaddr = (void *)__get_free_pages(flags, order);
2981        if (!vaddr)
2982                return NULL;
2983        memset(vaddr, 0, size);
2984
2985        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2986                                         DMA_BIDIRECTIONAL,
2987                                         hwdev->coherent_dma_mask);
2988        if (*dma_handle)
2989                return vaddr;
2990        free_pages((unsigned long)vaddr, order);
2991        return NULL;
2992}
2993
2994static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2995                                dma_addr_t dma_handle, struct dma_attrs *attrs)
2996{
2997        int order;
2998
2999        size = PAGE_ALIGN(size);
3000        order = get_order(size);
3001
3002        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3003        free_pages((unsigned long)vaddr, order);
3004}
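
    /*
     * Editor's example (hypothetical driver code, not part of this file):
     * the two functions above are reached through the generic DMA API;
     * "ring" and "ring_dma" are illustrative names.
     *
     *     dma_addr_t ring_dma;
     *     void *ring = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
     *                                     &ring_dma, GFP_KERNEL);
     *     if (!ring)
     *             return -ENOMEM;
     *     ...
     *     dma_free_coherent(&pdev->dev, PAGE_SIZE, ring, ring_dma);
     */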
3005
3006static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3007                           int nelems, enum dma_data_direction dir,
3008                           struct dma_attrs *attrs)
3009{
3010        struct pci_dev *pdev = to_pci_dev(hwdev);
3011        struct dmar_domain *domain;
3012        unsigned long start_pfn, last_pfn;
3013        struct iova *iova;
3014        struct intel_iommu *iommu;
3015
3016        if (iommu_no_mapping(hwdev))
3017                return;
3018
3019        domain = find_domain(pdev);
3020        BUG_ON(!domain);
3021
3022        iommu = domain_get_iommu(domain);
3023
3024        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3025        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3026                      (unsigned long long)sglist[0].dma_address))
3027                return;
3028
3029        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3030        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3031
3032        /*  clear the whole page */
3033        dma_pte_clear_range(domain, start_pfn, last_pfn);
3034
3035        /* free page tables */
3036        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3037
3038        if (intel_iommu_strict) {
3039                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3040                                      last_pfn - start_pfn + 1, 0);
3041                /* free iova */
3042                __free_iova(&domain->iovad, iova);
3043        } else {
3044                add_unmap(domain, iova);
3045                /*
3046                 * Queue up release of the unmap so the IOTLB flush can be
3047                 * batched, saving the ~1/6th of CPU it otherwise consumes.
3048                 */
3049        }
3050}
3051
3052static int intel_nontranslate_map_sg(struct device *hwdev,
3053        struct scatterlist *sglist, int nelems, int dir)
3054{
3055        int i;
3056        struct scatterlist *sg;
3057
3058        for_each_sg(sglist, sg, nelems, i) {
3059                BUG_ON(!sg_page(sg));
3060                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3061                sg->dma_length = sg->length;
3062        }
3063        return nelems;
3064}
3065
3066static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3067                        enum dma_data_direction dir, struct dma_attrs *attrs)
3068{
3069        int i;
3070        struct pci_dev *pdev = to_pci_dev(hwdev);
3071        struct dmar_domain *domain;
3072        size_t size = 0;
3073        int prot = 0;
3074        struct iova *iova = NULL;
3075        int ret;
3076        struct scatterlist *sg;
3077        unsigned long start_vpfn;
3078        struct intel_iommu *iommu;
3079
3080        BUG_ON(dir == DMA_NONE);
3081        if (iommu_no_mapping(hwdev))
3082                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3083
3084        domain = get_valid_domain_for_dev(pdev);
3085        if (!domain)
3086                return 0;
3087
3088        iommu = domain_get_iommu(domain);
3089
3090        for_each_sg(sglist, sg, nelems, i)
3091                size += aligned_nrpages(sg->offset, sg->length);
3092
3093        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3094                                pdev->dma_mask);
3095        if (!iova) {
3096                sglist->dma_length = 0;
3097                return 0;
3098        }
3099
3100        /*
3101         * Check if DMAR supports zero-length reads on write-only
3102         * mappings.
3103         */
3104        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3105                        !cap_zlr(iommu->cap))
3106                prot |= DMA_PTE_READ;
3107        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3108                prot |= DMA_PTE_WRITE;
3109
3110        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3111
3112        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3113        if (unlikely(ret)) {
3114                /*  clear the page */
3115                dma_pte_clear_range(domain, start_vpfn,
3116                                    start_vpfn + size - 1);
3117                /* free page tables */
3118                dma_pte_free_pagetable(domain, start_vpfn,
3119                                       start_vpfn + size - 1);
3120                /* free iova */
3121                __free_iova(&domain->iovad, iova);
3122                return 0;
3123        }
3124
3125        /* it's a non-present to present mapping. Only flush if caching mode */
3126        if (cap_caching_mode(iommu->cap))
3127                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3128        else
3129                iommu_flush_write_buffer(iommu);
3130
3131        return nelems;
3132}
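
    /*
     * Editor's example (illustrative): a driver with a scatterlist reaches
     * intel_map_sg()/intel_unmap_sg() through the generic helpers; "sgl"
     * and "nents" are hypothetical names.
     *
     *     int mapped = dma_map_sg(&pdev->dev, sgl, nents, DMA_FROM_DEVICE);
     *     if (!mapped)
     *             return -ENOMEM;
     *     ...device fills the buffers...
     *     dma_unmap_sg(&pdev->dev, sgl, nents, DMA_FROM_DEVICE);
     */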
3133
3134static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3135{
3136        return !dma_addr;
3137}
3138
3139struct dma_map_ops intel_dma_ops = {
3140        .alloc = intel_alloc_coherent,
3141        .free = intel_free_coherent,
3142        .map_sg = intel_map_sg,
3143        .unmap_sg = intel_unmap_sg,
3144        .map_page = intel_map_page,
3145        .unmap_page = intel_unmap_page,
3146        .mapping_error = intel_mapping_error,
3147};
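
    /*
     * Editor's example (illustrative): once intel_iommu_init() below sets
     * dma_ops = &intel_dma_ops, ordinary streaming-DMA calls in drivers
     * dispatch through this table; "page", "offset" and "len" are
     * hypothetical names.
     *
     *     dma_addr_t addr = dma_map_page(&pdev->dev, page, offset, len,
     *                                    DMA_TO_DEVICE);  -> intel_map_page()
     *     if (dma_mapping_error(&pdev->dev, addr))        -> intel_mapping_error()
     *             return -ENOMEM;
     *     ...
     *     dma_unmap_page(&pdev->dev, addr, len,
     *                    DMA_TO_DEVICE);                  -> intel_unmap_page()
     */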
3148
3149static inline int iommu_domain_cache_init(void)
3150{
3151        int ret = 0;
3152
3153        iommu_domain_cache = kmem_cache_create("iommu_domain",
3154                                         sizeof(struct dmar_domain),
3155                                         0,
3156                                         SLAB_HWCACHE_ALIGN,
3157                                         NULL);
3159        if (!iommu_domain_cache) {
3160                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3161                ret = -ENOMEM;
3162        }
3163
3164        return ret;
3165}
3166
3167static inline int iommu_devinfo_cache_init(void)
3168{
3169        int ret = 0;
3170
3171        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3172                                         sizeof(struct device_domain_info),
3173                                         0,
3174                                         SLAB_HWCACHE_ALIGN,
3175                                         NULL);
3176        if (!iommu_devinfo_cache) {
3177                printk(KERN_ERR "Couldn't create devinfo cache\n");
3178                ret = -ENOMEM;
3179        }
3180
3181        return ret;
3182}
3183
3184static inline int iommu_iova_cache_init(void)
3185{
3186        int ret = 0;
3187
3188        iommu_iova_cache = kmem_cache_create("iommu_iova",
3189                                         sizeof(struct iova),
3190                                         0,
3191                                         SLAB_HWCACHE_ALIGN,
3192                                         NULL);
3193        if (!iommu_iova_cache) {
3194                printk(KERN_ERR "Couldn't create iova cache\n");
3195                ret = -ENOMEM;
3196        }
3197
3198        return ret;
3199}
3200
3201static int __init iommu_init_mempool(void)
3202{
3203        int ret;
3204        ret = iommu_iova_cache_init();
3205        if (ret)
3206                return ret;
3207
3208        ret = iommu_domain_cache_init();
3209        if (ret)
3210                goto domain_error;
3211
3212        ret = iommu_devinfo_cache_init();
3213        if (!ret)
3214                return ret;
3215
3216        kmem_cache_destroy(iommu_domain_cache);
3217domain_error:
3218        kmem_cache_destroy(iommu_iova_cache);
3219
3220        return -ENOMEM;
3221}
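
    /*
     * Editor's note (illustrative): the caches created above back the
     * driver's small-object helpers defined earlier in this file, e.g.
     * alloc_domain_mem()/free_domain_mem() and free_devinfo_mem(), which
     * reduce to calls of this shape (the GFP flag here is illustrative):
     *
     *     struct dmar_domain *domain;
     *
     *     domain = kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
     *     ...
     *     kmem_cache_free(iommu_domain_cache, domain);
     */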
3222
3223static void __init iommu_exit_mempool(void)
3224{
3225        kmem_cache_destroy(iommu_devinfo_cache);
3226        kmem_cache_destroy(iommu_domain_cache);
3227        kmem_cache_destroy(iommu_iova_cache);
3228
3229}
3230
3231static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3232{
3233        struct dmar_drhd_unit *drhd;
3234        u32 vtbar;
3235        int rc;
3236
3237        /* We know that this device on this chipset has its own IOMMU.
3238         * If we find it under a different IOMMU, then the BIOS is lying
3239         * to us. Hope that the IOMMU for this device is actually
3240         * disabled, and it needs no translation...
3241         */
3242        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3243        if (rc) {
3244                /* "can't" happen */
3245                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3246                return;
3247        }
3248        vtbar &= 0xffff0000;
3249
3250        /* we know that this iommu should be at offset 0xa000 from vtbar */
3251        drhd = dmar_find_matched_drhd_unit(pdev);
3252        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3253                            TAINT_FIRMWARE_WORKAROUND,
3254                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3255                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3256}
3257DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3258
3259static void __init init_no_remapping_devices(void)
3260{
3261        struct dmar_drhd_unit *drhd;
3262
3263        for_each_drhd_unit(drhd) {
3264                if (!drhd->include_all) {
3265                        int i;
3266                        for (i = 0; i < drhd->devices_cnt; i++)
3267                                if (drhd->devices[i] != NULL)
3268                                        break;
3269                        /* ignore DMAR unit if no pci devices exist */
3270                        if (i == drhd->devices_cnt)
3271                                drhd->ignored = 1;
3272                }
3273        }
3274
3275        for_each_active_drhd_unit(drhd) {
3276                int i;
3277                if (drhd->include_all)
3278                        continue;
3279
3280                for (i = 0; i < drhd->devices_cnt; i++)
3281                        if (drhd->devices[i] &&
3282                            !IS_GFX_DEVICE(drhd->devices[i]))
3283                                break;
3284
3285                if (i < drhd->devices_cnt)
3286                        continue;
3287
3288                /* This IOMMU has *only* gfx devices. Either bypass it or
3289                   set the gfx_mapped flag, as appropriate */
3290                if (dmar_map_gfx) {
3291                        intel_iommu_gfx_mapped = 1;
3292                } else {
3293                        drhd->ignored = 1;
3294                        for (i = 0; i < drhd->devices_cnt; i++) {
3295                                if (!drhd->devices[i])
3296                                        continue;
3297                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3298                        }
3299                }
3300        }
3301}
3302
3303#ifdef CONFIG_SUSPEND
3304static int init_iommu_hw(void)
3305{
3306        struct dmar_drhd_unit *drhd;
3307        struct intel_iommu *iommu = NULL;
3308
3309        for_each_active_iommu(iommu, drhd)
3310                if (iommu->qi)
3311                        dmar_reenable_qi(iommu);
3312
3313        for_each_iommu(iommu, drhd) {
3314                if (drhd->ignored) {
3315                        /*
3316                         * we always have to disable PMRs or DMA may fail on
3317                         * this device
3318                         */
3319                        if (force_on)
3320                                iommu_disable_protect_mem_regions(iommu);
3321                        continue;
3322                }
3323
3324                iommu_flush_write_buffer(iommu);
3325
3326                iommu_set_root_entry(iommu);
3327
3328                iommu->flush.flush_context(iommu, 0, 0, 0,
3329                                           DMA_CCMD_GLOBAL_INVL);
3330                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3331                                         DMA_TLB_GLOBAL_FLUSH);
3332                if (iommu_enable_translation(iommu))
3333                        return 1;
3334                iommu_disable_protect_mem_regions(iommu);
3335        }
3336
3337        return 0;
3338}
3339
3340static void iommu_flush_all(void)
3341{
3342        struct dmar_drhd_unit *drhd;
3343        struct intel_iommu *iommu;
3344
3345        for_each_active_iommu(iommu, drhd) {
3346                iommu->flush.flush_context(iommu, 0, 0, 0,
3347                                           DMA_CCMD_GLOBAL_INVL);
3348                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3349                                         DMA_TLB_GLOBAL_FLUSH);
3350        }
3351}
3352
3353static int iommu_suspend(void)
3354{
3355        struct dmar_drhd_unit *drhd;
3356        struct intel_iommu *iommu = NULL;
3357        unsigned long flag;
3358
3359        for_each_active_iommu(iommu, drhd) {
3360                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3361                                                 GFP_ATOMIC);
3362                if (!iommu->iommu_state)
3363                        goto nomem;
3364        }
3365
3366        iommu_flush_all();
3367
3368        for_each_active_iommu(iommu, drhd) {
3369                iommu_disable_translation(iommu);
3370
3371                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3372
3373                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3374                        readl(iommu->reg + DMAR_FECTL_REG);
3375                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3376                        readl(iommu->reg + DMAR_FEDATA_REG);
3377                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3378                        readl(iommu->reg + DMAR_FEADDR_REG);
3379                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3380                        readl(iommu->reg + DMAR_FEUADDR_REG);
3381
3382                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3383        }
3384        return 0;
3385
3386nomem:
3387        for_each_active_iommu(iommu, drhd)
3388                kfree(iommu->iommu_state);
3389
3390        return -ENOMEM;
3391}
3392
3393static void iommu_resume(void)
3394{
3395        struct dmar_drhd_unit *drhd;
3396        struct intel_iommu *iommu = NULL;
3397        unsigned long flag;
3398
3399        if (init_iommu_hw()) {
3400                if (force_on)
3401                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3402                else
3403                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3404                return;
3405        }
3406
3407        for_each_active_iommu(iommu, drhd) {
3408
3409                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3410
3411                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3412                        iommu->reg + DMAR_FECTL_REG);
3413                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3414                        iommu->reg + DMAR_FEDATA_REG);
3415                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3416                        iommu->reg + DMAR_FEADDR_REG);
3417                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3418                        iommu->reg + DMAR_FEUADDR_REG);
3419
3420                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3421        }
3422
3423        for_each_active_iommu(iommu, drhd)
3424                kfree(iommu->iommu_state);
3425}
3426
3427static struct syscore_ops iommu_syscore_ops = {
3428        .resume         = iommu_resume,
3429        .suspend        = iommu_suspend,
3430};
3431
3432static void __init init_iommu_pm_ops(void)
3433{
3434        register_syscore_ops(&iommu_syscore_ops);
3435}
3436
3437#else
3438static inline void init_iommu_pm_ops(void) {}
3439#endif  /* CONFIG_SUSPEND */
3440
3441LIST_HEAD(dmar_rmrr_units);
3442
3443static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3444{
3445        list_add(&rmrr->list, &dmar_rmrr_units);
3446}
3447
3449int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3450{
3451        struct acpi_dmar_reserved_memory *rmrr;
3452        struct dmar_rmrr_unit *rmrru;
3453
3454        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3455        if (!rmrru)
3456                return -ENOMEM;
3457
3458        rmrru->hdr = header;
3459        rmrr = (struct acpi_dmar_reserved_memory *)header;
3460        rmrru->base_address = rmrr->base_address;
3461        rmrru->end_address = rmrr->end_address;
3462
3463        dmar_register_rmrr_unit(rmrru);
3464        return 0;
3465}
3466
3467static int __init
3468rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3469{
3470        struct acpi_dmar_reserved_memory *rmrr;
3471
3472        rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3473        return dmar_parse_dev_scope((void *)(rmrr + 1),
3474                                    ((void *)rmrr) + rmrr->header.length,
3475                                    &rmrru->devices_cnt, &rmrru->devices,
3476                                    rmrr->segment);
3477}
3478
3479static LIST_HEAD(dmar_atsr_units);
3480
3481int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3482{
3483        struct acpi_dmar_atsr *atsr;
3484        struct dmar_atsr_unit *atsru;
3485
3486        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3487        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3488        if (!atsru)
3489                return -ENOMEM;
3490
3491        atsru->hdr = hdr;
3492        atsru->include_all = atsr->flags & 0x1;
3493
3494        list_add(&atsru->list, &dmar_atsr_units);
3495
3496        return 0;
3497}
3498
3499static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3500{
3501        struct acpi_dmar_atsr *atsr;
3502
3503        if (atsru->include_all)
3504                return 0;
3505
3506        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3507        return dmar_parse_dev_scope((void *)(atsr + 1),
3508                                    (void *)atsr + atsr->header.length,
3509                                    &atsru->devices_cnt, &atsru->devices,
3510                                    atsr->segment);
3511}
3512
3513static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3514{
3515        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3516        kfree(atsru);
3517}
3518
3519static void intel_iommu_free_dmars(void)
3520{
3521        struct dmar_rmrr_unit *rmrru, *rmrr_n;
3522        struct dmar_atsr_unit *atsru, *atsr_n;
3523
3524        list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3525                list_del(&rmrru->list);
3526                dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3527                kfree(rmrru);
3528        }
3529
3530        list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3531                list_del(&atsru->list);
3532                intel_iommu_free_atsr(atsru);
3533        }
3534}
3535
3536int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3537{
3538        int i;
3539        struct pci_bus *bus;
3540        struct acpi_dmar_atsr *atsr;
3541        struct dmar_atsr_unit *atsru;
3542
3543        dev = pci_physfn(dev);
3544
3545        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3546                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3547                if (atsr->segment == pci_domain_nr(dev->bus))
3548                        goto found;
3549        }
3550
3551        return 0;
3552
3553found:
3554        for (bus = dev->bus; bus; bus = bus->parent) {
3555                struct pci_dev *bridge = bus->self;
3556
3557                if (!bridge || !pci_is_pcie(bridge) ||
3558                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3559                        return 0;
3560
3561                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3562                        for (i = 0; i < atsru->devices_cnt; i++)
3563                                if (atsru->devices[i] == bridge)
3564                                        return 1;
3565                        break;
3566                }
3567        }
3568
3569        if (atsru->include_all)
3570                return 1;
3571
3572        return 0;
3573}
3574
3575int __init dmar_parse_rmrr_atsr_dev(void)
3576{
3577        struct dmar_rmrr_unit *rmrr;
3578        struct dmar_atsr_unit *atsr;
3579        int ret = 0;
3580
3581        list_for_each_entry(rmrr, &dmar_rmrr_units, list) {
3582                ret = rmrr_parse_dev(rmrr);
3583                if (ret)
3584                        return ret;
3585        }
3586
3587        list_for_each_entry(atsr, &dmar_atsr_units, list) {
3588                ret = atsr_parse_dev(atsr);
3589                if (ret)
3590                        return ret;
3591        }
3592
3593        return ret;
3594}
3595
3596/*
3597 * Here we only respond to a device being unbound from its driver.
3598 *
3599 * A newly added device is not attached to its DMAR domain here yet. That
3600 * happens when the device is first mapped to an iova.
3601 */
3602static int device_notifier(struct notifier_block *nb,
3603                                  unsigned long action, void *data)
3604{
3605        struct device *dev = data;
3606        struct pci_dev *pdev = to_pci_dev(dev);
3607        struct dmar_domain *domain;
3608
3609        if (iommu_no_mapping(dev))
3610                return 0;
3611
3612        domain = find_domain(pdev);
3613        if (!domain)
3614                return 0;
3615
3616        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3617                domain_remove_one_dev_info(domain, pdev);
3618
3619                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3620                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3621                    list_empty(&domain->devices))
3622                        domain_exit(domain);
3623        }
3624
3625        return 0;
3626}
3627
3628static struct notifier_block device_nb = {
3629        .notifier_call = device_notifier,
3630};
3631
3632int __init intel_iommu_init(void)
3633{
3634        int ret = -ENODEV;
3635        struct dmar_drhd_unit *drhd;
3636        struct intel_iommu *iommu;
3637
3638        /* VT-d is required for a TXT/tboot launch, so enforce that */
3639        force_on = tboot_force_iommu();
3640
3641        if (dmar_table_init()) {
3642                if (force_on)
3643                        panic("tboot: Failed to initialize DMAR table\n");
3644                goto out_free_dmar;
3645        }
3646
3647        /*
3648         * Disable translation if already enabled prior to OS handover.
3649         */
3650        for_each_active_iommu(iommu, drhd)
3651                if (iommu->gcmd & DMA_GCMD_TE)
3652                        iommu_disable_translation(iommu);
3653
3654        if (dmar_dev_scope_init() < 0) {
3655                if (force_on)
3656                        panic("tboot: Failed to initialize DMAR device scope\n");
3657                goto out_free_dmar;
3658        }
3659
3660        if (no_iommu || dmar_disabled)
3661                goto out_free_dmar;
3662
3663        if (iommu_init_mempool()) {
3664                if (force_on)
3665                        panic("tboot: Failed to initialize iommu memory\n");
3666                goto out_free_dmar;
3667        }
3668
3669        if (list_empty(&dmar_rmrr_units))
3670                printk(KERN_INFO "DMAR: No RMRR found\n");
3671
3672        if (list_empty(&dmar_atsr_units))
3673                printk(KERN_INFO "DMAR: No ATSR found\n");
3674
3675        if (dmar_init_reserved_ranges()) {
3676                if (force_on)
3677                        panic("tboot: Failed to reserve iommu ranges\n");
3678                goto out_free_mempool;
3679        }
3680
3681        init_no_remapping_devices();
3682
3683        ret = init_dmars();
3684        if (ret) {
3685                if (force_on)
3686                        panic("tboot: Failed to initialize DMARs\n");
3687                printk(KERN_ERR "IOMMU: dmar init failed\n");
3688                goto out_free_reserved_range;
3689        }
3690        printk(KERN_INFO
3691               "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3692
3693        init_timer(&unmap_timer);
3694#ifdef CONFIG_SWIOTLB
3695        swiotlb = 0;
3696#endif
3697        dma_ops = &intel_dma_ops;
3698
3699        init_iommu_pm_ops();
3700
3701        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3702
3703        bus_register_notifier(&pci_bus_type, &device_nb);
3704
3705        intel_iommu_enabled = 1;
3706
3707        return 0;
3708
3709out_free_reserved_range:
3710        put_iova_domain(&reserved_iova_list);
3711out_free_mempool:
3712        iommu_exit_mempool();
3713out_free_dmar:
3714        intel_iommu_free_dmars();
3715        return ret;
3716}
3717
3718static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3719                                           struct pci_dev *pdev)
3720{
3721        struct pci_dev *tmp, *parent;
3722
3723        if (!iommu || !pdev)
3724                return;
3725
3726        /* dependent device detach */
3727        tmp = pci_find_upstream_pcie_bridge(pdev);
3728        /* Secondary interface's bus number and devfn 0 */
3729        if (tmp) {
3730                parent = pdev->bus->self;
3731                while (parent != tmp) {
3732                        iommu_detach_dev(iommu, parent->bus->number,
3733                                         parent->devfn);
3734                        parent = parent->bus->self;
3735                }
3736                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3737                        iommu_detach_dev(iommu,
3738                                tmp->subordinate->number, 0);
3739                else /* this is a legacy PCI bridge */
3740                        iommu_detach_dev(iommu, tmp->bus->number,
3741                                         tmp->devfn);
3742        }
3743}
3744
3745static void domain_remove_one_dev_info(struct dmar_domain *domain,
3746                                          struct pci_dev *pdev)
3747{
3748        struct device_domain_info *info, *tmp;
3749        struct intel_iommu *iommu;
3750        unsigned long flags;
3751        int found = 0;
3752
3753        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3754                                pdev->devfn);
3755        if (!iommu)
3756                return;
3757
3758        spin_lock_irqsave(&device_domain_lock, flags);
3759        list_for_each_entry_safe(info, tmp, &domain->devices, link) {
3760                if (info->segment == pci_domain_nr(pdev->bus) &&
3761                    info->bus == pdev->bus->number &&
3762                    info->devfn == pdev->devfn) {
3763                        unlink_domain_info(info);
3764                        spin_unlock_irqrestore(&device_domain_lock, flags);
3765
3766                        iommu_disable_dev_iotlb(info);
3767                        iommu_detach_dev(iommu, info->bus, info->devfn);
3768                        iommu_detach_dependent_devices(iommu, pdev);
3769                        free_devinfo_mem(info);
3770
3771                        spin_lock_irqsave(&device_domain_lock, flags);
3772
3773                        if (found)
3774                                break;
3775                        else
3776                                continue;
3777                }
3778
3779                /* if there are no other devices under the same iommu
3780                 * owned by this domain, clear this iommu from iommu_bmp
3781                 * and update the iommu count and coherency
3782                 */
3783                if (iommu == device_to_iommu(info->segment, info->bus,
3784                                            info->devfn))
3785                        found = 1;
3786        }
3787
3788        spin_unlock_irqrestore(&device_domain_lock, flags);
3789
3790        if (found == 0) {
3791                unsigned long tmp_flags;
3792                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3793                clear_bit(iommu->seq_id, domain->iommu_bmp);
3794                domain->iommu_count--;
3795                domain_update_iommu_cap(domain);
3796                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3797
3798                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3799                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3800                        spin_lock_irqsave(&iommu->lock, tmp_flags);
3801                        clear_bit(domain->id, iommu->domain_ids);
3802                        iommu->domains[domain->id] = NULL;
3803                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3804                }
3805        }
3806}
3807
3808static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3809{
3810        struct device_domain_info *info;
3811        struct intel_iommu *iommu;
3812        unsigned long flags1, flags2;
3813
3814        spin_lock_irqsave(&device_domain_lock, flags1);
3815        while (!list_empty(&domain->devices)) {
3816                info = list_entry(domain->devices.next,
3817                        struct device_domain_info, link);
3818                unlink_domain_info(info);
3819                spin_unlock_irqrestore(&device_domain_lock, flags1);
3820
3821                iommu_disable_dev_iotlb(info);
3822                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3823                iommu_detach_dev(iommu, info->bus, info->devfn);
3824                iommu_detach_dependent_devices(iommu, info->dev);
3825
3826                /* clear this iommu in iommu_bmp, update iommu count
3827                 * and capabilities
3828                 */
3829                spin_lock_irqsave(&domain->iommu_lock, flags2);
3830                if (test_and_clear_bit(iommu->seq_id,
3831                                       domain->iommu_bmp)) {
3832                        domain->iommu_count--;
3833                        domain_update_iommu_cap(domain);
3834                }
3835                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3836
3837                free_devinfo_mem(info);
3838                spin_lock_irqsave(&device_domain_lock, flags1);
3839        }
3840        spin_unlock_irqrestore(&device_domain_lock, flags1);
3841}
3842
3843/* domain id for virtual machines; it won't be set in the context entry */
3844static atomic_t vm_domid = ATOMIC_INIT(0);
3845
3846static struct dmar_domain *iommu_alloc_vm_domain(void)
3847{
3848        struct dmar_domain *domain;
3849
3850        domain = alloc_domain_mem();
3851        if (!domain)
3852                return NULL;
3853
3854        domain->id = atomic_inc_return(&vm_domid);
3855        domain->nid = -1;
3856        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3857        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3858
3859        return domain;
3860}
3861
3862static int md_domain_init(struct dmar_domain *domain, int guest_width)
3863{
3864        int adjust_width;
3865
3866        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3867        spin_lock_init(&domain->iommu_lock);
3868
3869        domain_reserve_special_ranges(domain);
3870
3871        /* calculate AGAW */
3872        domain->gaw = guest_width;
3873        adjust_width = guestwidth_to_adjustwidth(guest_width);
3874        domain->agaw = width_to_agaw(adjust_width);
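            /*
             * Editor's note: with the default guest_width of 48 bits this
             * selects a 4-level page table (12 offset bits plus 4 levels
             * of 9 bits each = 48 bits).
             */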
3875
3876        INIT_LIST_HEAD(&domain->devices);
3877
3878        domain->iommu_count = 0;
3879        domain->iommu_coherency = 0;
3880        domain->iommu_snooping = 0;
3881        domain->iommu_superpage = 0;
3882        domain->max_addr = 0;
3883        domain->nid = -1;
3884
3885        /* always allocate the top pgd */
3886        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3887        if (!domain->pgd)
3888                return -ENOMEM;
3889        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3890        return 0;
3891}
3892
3893static void iommu_free_vm_domain(struct dmar_domain *domain)
3894{
3895        unsigned long flags;
3896        struct dmar_drhd_unit *drhd;
3897        struct intel_iommu *iommu;
3898        unsigned long i;
3899        unsigned long ndomains;
3900
3901        for_each_active_iommu(iommu, drhd) {
3902                ndomains = cap_ndoms(iommu->cap);
3903                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3904                        if (iommu->domains[i] == domain) {
3905                                spin_lock_irqsave(&iommu->lock, flags);
3906                                clear_bit(i, iommu->domain_ids);
3907                                iommu->domains[i] = NULL;
3908                                spin_unlock_irqrestore(&iommu->lock, flags);
3909                                break;
3910                        }
3911                }
3912        }
3913}
3914
3915static void vm_domain_exit(struct dmar_domain *domain)
3916{
3917        /* Domain 0 is reserved, so don't process it */
3918        if (!domain)
3919                return;
3920
3921        vm_domain_remove_all_dev_info(domain);
3922        /* destroy iovas */
3923        put_iova_domain(&domain->iovad);
3924
3925        /* clear ptes */
3926        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3927
3928        /* free page tables */
3929        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3930
3931        iommu_free_vm_domain(domain);
3932        free_domain_mem(domain);
3933}
3934
3935static int intel_iommu_domain_init(struct iommu_domain *domain)
3936{
3937        struct dmar_domain *dmar_domain;
3938
3939        dmar_domain = iommu_alloc_vm_domain();
3940        if (!dmar_domain) {
3941                printk(KERN_ERR
3942                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3943                return -ENOMEM;
3944        }
3945        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3946                printk(KERN_ERR
3947                        "intel_iommu_domain_init() failed\n");
3948                vm_domain_exit(dmar_domain);
3949                return -ENOMEM;
3950        }
3951        domain_update_iommu_cap(dmar_domain);
3952        domain->priv = dmar_domain;
3953
3954        domain->geometry.aperture_start = 0;
3955        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3956        domain->geometry.force_aperture = true;
3957
3958        return 0;
3959}
3960
3961static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3962{
3963        struct dmar_domain *dmar_domain = domain->priv;
3964
3965        domain->priv = NULL;
3966        vm_domain_exit(dmar_domain);
3967}
3968
3969static int intel_iommu_attach_device(struct iommu_domain *domain,
3970                                     struct device *dev)
3971{
3972        struct dmar_domain *dmar_domain = domain->priv;
3973        struct pci_dev *pdev = to_pci_dev(dev);
3974        struct intel_iommu *iommu;
3975        int addr_width;
3976
3977        /* normally pdev is not mapped */
3978        if (unlikely(domain_context_mapped(pdev))) {
3979                struct dmar_domain *old_domain;
3980
3981                old_domain = find_domain(pdev);
3982                if (old_domain) {
3983                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3984                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3985                                domain_remove_one_dev_info(old_domain, pdev);
3986                        else
3987                                domain_remove_dev_info(old_domain);
3988                }
3989        }
3990
3991        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3992                                pdev->devfn);
3993        if (!iommu)
3994                return -ENODEV;
3995
3996        /* check if this iommu agaw is sufficient for max mapped address */
3997        addr_width = agaw_to_width(iommu->agaw);
3998        if (addr_width > cap_mgaw(iommu->cap))
3999                addr_width = cap_mgaw(iommu->cap);
4000
4001        if (dmar_domain->max_addr > (1LL << addr_width)) {
4002                printk(KERN_ERR "%s: iommu width (%d) is not "
4003                       "sufficient for the mapped address (%llx)\n",
4004                       __func__, addr_width, dmar_domain->max_addr);
4005                return -EFAULT;
4006        }
4007        dmar_domain->gaw = addr_width;
4008
4009        /*
4010         * Knock out extra levels of page tables if necessary
4011         */
4012        while (iommu->agaw < dmar_domain->agaw) {
4013                struct dma_pte *pte;
4014
4015                pte = dmar_domain->pgd;
4016                if (dma_pte_present(pte)) {
4017                        dmar_domain->pgd = (struct dma_pte *)
4018                                phys_to_virt(dma_pte_addr(pte));
4019                        free_pgtable_page(pte);
4020                }
4021                dmar_domain->agaw--;
4022        }
4023
4024        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4025}
4026
4027static void intel_iommu_detach_device(struct iommu_domain *domain,
4028                                      struct device *dev)
4029{
4030        struct dmar_domain *dmar_domain = domain->priv;
4031        struct pci_dev *pdev = to_pci_dev(dev);
4032
4033        domain_remove_one_dev_info(dmar_domain, pdev);
4034}
4035
4036static int intel_iommu_map(struct iommu_domain *domain,
4037                           unsigned long iova, phys_addr_t hpa,
4038                           size_t size, int iommu_prot)
4039{
4040        struct dmar_domain *dmar_domain = domain->priv;
4041        u64 max_addr;
4042        int prot = 0;
4043        int ret;
4044
4045        if (iommu_prot & IOMMU_READ)
4046                prot |= DMA_PTE_READ;
4047        if (iommu_prot & IOMMU_WRITE)
4048                prot |= DMA_PTE_WRITE;
4049        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4050                prot |= DMA_PTE_SNP;
4051
4052        max_addr = iova + size;
4053        if (dmar_domain->max_addr < max_addr) {
4054                u64 end;
4055
4056                /* check if minimum agaw is sufficient for mapped address */
4057                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4058                if (end < max_addr) {
4059                        printk(KERN_ERR "%s: iommu width (%d) is not "
4060                               "sufficient for the mapped address (%llx)\n",
4061                               __func__, dmar_domain->gaw, max_addr);
4062                        return -EFAULT;
4063                }
4064                dmar_domain->max_addr = max_addr;
4065        }
4066        /* Round up size to next multiple of PAGE_SIZE, if it and
4067           the low bits of hpa would take us onto the next page */
4068        size = aligned_nrpages(hpa, size);
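            /*
             * Editor's example: with hpa = 0x1ffc and size = 8 the mapping
             * spans bytes 0x1ffc-0x2003, i.e. two 4KiB pages, so the
             * rounded size is 2 pages even though the byte count was tiny.
             */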
4069        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4070                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4071        return ret;
4072}
4073
4074static size_t intel_iommu_unmap(struct iommu_domain *domain,
4075                             unsigned long iova, size_t size)
4076{
4077        struct dmar_domain *dmar_domain = domain->priv;
4078        int order;
4079
4080        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4081                            (iova + size - 1) >> VTD_PAGE_SHIFT);
4082
4083        if (dmar_domain->max_addr == iova + size)
4084                dmar_domain->max_addr = iova;
4085
4086        return PAGE_SIZE << order;
4087}
4088
4089static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4090                                            dma_addr_t iova)
4091{
4092        struct dmar_domain *dmar_domain = domain->priv;
4093        struct dma_pte *pte;
4094        u64 phys = 0;
4095
4096        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4097        if (pte)
4098                phys = dma_pte_addr(pte);
4099
4100        return phys;
4101}
4102
4103static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4104                                      unsigned long cap)
4105{
4106        struct dmar_domain *dmar_domain = domain->priv;
4107
4108        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4109                return dmar_domain->iommu_snooping;
4110        if (cap == IOMMU_CAP_INTR_REMAP)
4111                return irq_remapping_enabled;
4112
4113        return 0;
4114}
4115
4116#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4117
4118static int intel_iommu_add_device(struct device *dev)
4119{
4120        struct pci_dev *pdev = to_pci_dev(dev);
4121        struct pci_dev *bridge, *dma_pdev = NULL;
4122        struct iommu_group *group;
4123        int ret;
4124
4125        if (!device_to_iommu(pci_domain_nr(pdev->bus),
4126                             pdev->bus->number, pdev->devfn))
4127                return -ENODEV;
4128
4129        bridge = pci_find_upstream_pcie_bridge(pdev);
4130        if (bridge) {
4131                if (pci_is_pcie(bridge))
4132                        dma_pdev = pci_get_domain_bus_and_slot(
4133                                                pci_domain_nr(pdev->bus),
4134                                                bridge->subordinate->number, 0);
4135                if (!dma_pdev)
4136                        dma_pdev = pci_dev_get(bridge);
4137        } else
4138                dma_pdev = pci_dev_get(pdev);
4139
4140        /* Account for quirked devices */
4141        swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4142
4143        /*
4144         * If it's a multifunction device that does not support our
4145         * required ACS flags, add it to the same group as the lowest
4146         * numbered function that also does not support those flags.
4147         */
4148        if (dma_pdev->multifunction &&
4149            !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4150                u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4151
4152                for (i = 0; i < 8; i++) {
4153                        struct pci_dev *tmp;
4154
4155                        tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4156                        if (!tmp)
4157                                continue;
4158
4159                        if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4160                                swap_pci_ref(&dma_pdev, tmp);
4161                                break;
4162                        }
4163                        pci_dev_put(tmp);
4164                }
4165        }
4166
4167        /*
4168         * Devices on the root bus go through the iommu.  If that's not us,
4169         * find the next upstream device and test ACS up to the root bus.
4170         * Finding the next device may require skipping virtual buses.
4171         */
4172        while (!pci_is_root_bus(dma_pdev->bus)) {
4173                struct pci_bus *bus = dma_pdev->bus;
4174
4175                while (!bus->self) {
4176                        if (!pci_is_root_bus(bus))
4177                                bus = bus->parent;
4178                        else
4179                                goto root_bus;
4180                }
4181
4182                if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4183                        break;
4184
4185                swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4186        }
4187
4188root_bus:
4189        group = iommu_group_get(&dma_pdev->dev);
4190        pci_dev_put(dma_pdev);
4191        if (!group) {
4192                group = iommu_group_alloc();
4193                if (IS_ERR(group))
4194                        return PTR_ERR(group);
4195        }
4196
4197        ret = iommu_group_add_device(group, dev);
4198
4199        iommu_group_put(group);
4200        return ret;
4201}
4202
4203static void intel_iommu_remove_device(struct device *dev)
4204{
4205        iommu_group_remove_device(dev);
4206}
4207
4208static struct iommu_ops intel_iommu_ops = {
4209        .domain_init    = intel_iommu_domain_init,
4210        .domain_destroy = intel_iommu_domain_destroy,
4211        .attach_dev     = intel_iommu_attach_device,
4212        .detach_dev     = intel_iommu_detach_device,
4213        .map            = intel_iommu_map,
4214        .unmap          = intel_iommu_unmap,
4215        .iova_to_phys   = intel_iommu_iova_to_phys,
4216        .domain_has_cap = intel_iommu_domain_has_cap,
4217        .add_device     = intel_iommu_add_device,
4218        .remove_device  = intel_iommu_remove_device,
4219        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4220};
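
    /*
     * Editor's example (hypothetical consumer of the generic IOMMU API,
     * e.g. a VFIO-style user): the ops above are invoked through iommu.c;
     * "dom", "iova" and "paddr" are illustrative names.
     *
     *     struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
     *                                             -> intel_iommu_domain_init()
     *     iommu_attach_device(dom, &pdev->dev);   -> intel_iommu_attach_device()
     *     iommu_map(dom, iova, paddr, SZ_4K,
     *               IOMMU_READ | IOMMU_WRITE);    -> intel_iommu_map()
     *     iommu_unmap(dom, iova, SZ_4K);          -> intel_iommu_unmap()
     *     iommu_detach_device(dom, &pdev->dev);   -> intel_iommu_detach_device()
     *     iommu_domain_free(dom);                 -> intel_iommu_domain_destroy()
     */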
4221
4222static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4223{
4224        /* G4x/GM45 integrated gfx dmar support is totally busted. */
4225        printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4226        dmar_map_gfx = 0;
4227}
4228
4229DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4230DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4231DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4232DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4233DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4234DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4235DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4236
4237static void quirk_iommu_rwbf(struct pci_dev *dev)
4238{
4239        /*
4240         * Mobile 4 Series Chipset neglects to set RWBF capability,
4241         * but needs it. Same seems to hold for the desktop versions.
4242         */
4243        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4244        rwbf_quirk = 1;
4245}
4246
4247DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4248DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4249DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4250DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4251DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4252DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4253DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4254
4255#define GGC 0x52
4256#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4257#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4258#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4259#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4260#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4261#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4262#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4263#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4264
4265static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4266{
4267        unsigned short ggc;
4268
4269        if (pci_read_config_word(dev, GGC, &ggc))
4270                return;
4271
4272        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4273                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4274                dmar_map_gfx = 0;
4275        } else if (dmar_map_gfx) {
4276                /* we have to ensure the gfx device is idle before we flush */
4277                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4278                intel_iommu_strict = 1;
4279        }
4280}
4281DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4282DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4283DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4284DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4285
4286/* On Tylersburg chipsets, some BIOSes have been known to enable the
4287   ISOCH DMAR unit for the Azalia sound device, but not give it any
4288   TLB entries, which causes it to deadlock. Check for that.  We do
4289   this in a function called from init_dmars(), instead of in a PCI
4290   quirk, because we don't want to print the obnoxious "BIOS broken"
4291   message if VT-d is actually disabled.
4292*/
4293static void __init check_tylersburg_isoch(void)
4294{
4295        struct pci_dev *pdev;
4296        uint32_t vtisochctrl;
4297
4298        /* If there's no Azalia in the system anyway, forget it. */
4299        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4300        if (!pdev)
4301                return;
4302        pci_dev_put(pdev);
4303
4304        /* System Management Registers. Might be hidden, in which case
4305           we can't do the sanity check. But that's OK, because the
4306           known-broken BIOSes _don't_ actually hide it, so far. */
4307        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4308        if (!pdev)
4309                return;
4310
4311        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4312                pci_dev_put(pdev);
4313                return;
4314        }
4315
4316        pci_dev_put(pdev);
4317
4318        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4319        if (vtisochctrl & 1)
4320                return;
4321
4322        /* Drop all bits other than the number of TLB entries */
4323        vtisochctrl &= 0x1c;
4324
4325        /* If we have the recommended number of TLB entries (16), fine. */
4326        if (vtisochctrl == 0x10)
4327                return;
4328
4329        /* Zero TLB entries? You get to ride the short bus to school. */
4330        if (!vtisochctrl) {
4331                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4332                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4333                     dmi_get_system_info(DMI_BIOS_VENDOR),
4334                     dmi_get_system_info(DMI_BIOS_VERSION),
4335                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4336                iommu_identity_mapping |= IDENTMAP_AZALIA;
4337                return;
4338        }
4339
4340        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4341               vtisochctrl);
4342}
4343