linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/export.h>
  28#include <linux/slab.h>
  29#include <linux/irq.h>
  30#include <linux/interrupt.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/syscore_ops.h>
  41#include <linux/tboot.h>
  42#include <linux/dmi.h>
  43#include <linux/pci-ats.h>
  44#include <linux/memblock.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#include "irq_remapping.h"
  50#include "pci.h"
  51
  52#define ROOT_SIZE               VTD_PAGE_SIZE
  53#define CONTEXT_SIZE            VTD_PAGE_SIZE
  54
  55#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  56#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  57#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  58
  59#define IOAPIC_RANGE_START      (0xfee00000)
  60#define IOAPIC_RANGE_END        (0xfeefffff)
  61#define IOVA_START_ADDR         (0x1000)
  62
  63#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  64
  65#define MAX_AGAW_WIDTH 64
  66
  67#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  68#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  69
  70/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  71   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  72#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  73                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  74#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  75
  76#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  77#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  78#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  79
  80/* page table handling */
  81#define LEVEL_STRIDE            (9)
  82#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  83
  84/*
   85 * This bitmap is used to advertise the page sizes our hardware supports
  86 * to the IOMMU core, which will then use this information to split
  87 * physically contiguous memory regions it is mapping into page sizes
  88 * that we support.
  89 *
  90 * Traditionally the IOMMU core just handed us the mappings directly,
  91 * after making sure the size is an order of a 4KiB page and that the
  92 * mapping has natural alignment.
  93 *
  94 * To retain this behavior, we currently advertise that we support
  95 * all page sizes that are an order of 4KiB.
  96 *
  97 * If at some point we'd like to utilize the IOMMU core's new behavior,
  98 * we could change this to advertise the real page sizes we support.
  99 */
 100#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
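/*
 * Illustrative note (not from the original source): bit N of this bitmap
 * advertises support for a 2^N byte page size.  ~0xFFFUL clears bits 0-11
 * and sets every bit from 12 upwards, i.e. it claims support for all
 * power-of-two sizes of 4KiB and above, matching the traditional behaviour
 * described in the comment above.
 */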
 101
 102static inline int agaw_to_level(int agaw)
 103{
 104        return agaw + 2;
 105}
 106
 107static inline int agaw_to_width(int agaw)
 108{
 109        return 30 + agaw * LEVEL_STRIDE;
 110}
 111
 112static inline int width_to_agaw(int width)
 113{
 114        return (width - 30) / LEVEL_STRIDE;
 115}
 116
 117static inline unsigned int level_to_offset_bits(int level)
 118{
 119        return (level - 1) * LEVEL_STRIDE;
 120}
 121
 122static inline int pfn_level_offset(unsigned long pfn, int level)
 123{
 124        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 125}
 126
 127static inline unsigned long level_mask(int level)
 128{
 129        return -1UL << level_to_offset_bits(level);
 130}
 131
 132static inline unsigned long level_size(int level)
 133{
 134        return 1UL << level_to_offset_bits(level);
 135}
 136
 137static inline unsigned long align_to_level(unsigned long pfn, int level)
 138{
 139        return (pfn + level_size(level) - 1) & level_mask(level);
 140}
 141
 142static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 143{
 144        return  1 << ((lvl - 1) * LEVEL_STRIDE);
 145}
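/*
 * Worked example for the helpers above (illustrative, not from the original
 * source): a 48-bit address width gives width_to_agaw(48) = (48 - 30) / 9 = 2
 * and agaw_to_level(2) = 4, i.e. a 4-level page table.  A level-1 PTE maps
 * level_size(1) = 1 page (4KiB); a level-2 entry covers lvl_to_nr_pages(2) =
 * 512 pages (2MiB); each further level multiplies the reach by 512.
 */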
 146
  147/* VT-d pages must never be larger than MM pages. Otherwise things
  148   are never going to work. */
 149static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 150{
 151        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 152}
 153
 154static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 155{
 156        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 157}
 158static inline unsigned long page_to_dma_pfn(struct page *pg)
 159{
 160        return mm_to_dma_pfn(page_to_pfn(pg));
 161}
 162static inline unsigned long virt_to_dma_pfn(void *p)
 163{
 164        return page_to_dma_pfn(virt_to_page(p));
 165}
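/*
 * Illustrative note: with 4KiB base pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12,
 * the usual x86 case) the shift used above is zero, so DMA pfns and MM pfns
 * are identical; with larger MM pages each MM pfn expands to several VT-d pfns.
 */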
 166
 167/* global iommu list, set NULL for ignored DMAR units */
 168static struct intel_iommu **g_iommus;
 169
 170static void __init check_tylersburg_isoch(void);
 171static int rwbf_quirk;
 172
 173/*
 174 * set to 1 to panic kernel if can't successfully enable VT-d
 175 * (used when kernel is launched w/ TXT)
 176 */
 177static int force_on = 0;
 178
 179/*
 180 * 0: Present
 181 * 1-11: Reserved
 182 * 12-63: Context Ptr (12 - (haw-1))
 183 * 64-127: Reserved
 184 */
 185struct root_entry {
 186        u64     val;
 187        u64     rsvd1;
 188};
 189#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
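/*
 * Note (illustrative): with VTD_PAGE_SIZE == 4096 and a 16-byte root_entry,
 * ROOT_ENTRY_NR is 256 -- one root entry per PCI bus number.
 */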
 190static inline bool root_present(struct root_entry *root)
 191{
 192        return (root->val & 1);
 193}
 194static inline void set_root_present(struct root_entry *root)
 195{
 196        root->val |= 1;
 197}
 198static inline void set_root_value(struct root_entry *root, unsigned long value)
 199{
 200        root->val |= value & VTD_PAGE_MASK;
 201}
 202
 203static inline struct context_entry *
 204get_context_addr_from_root(struct root_entry *root)
 205{
 206        return (struct context_entry *)
 207                (root_present(root)?phys_to_virt(
 208                root->val & VTD_PAGE_MASK) :
 209                NULL);
 210}
 211
 212/*
 213 * low 64 bits:
 214 * 0: present
 215 * 1: fault processing disable
 216 * 2-3: translation type
 217 * 12-63: address space root
 218 * high 64 bits:
 219 * 0-2: address width
 220 * 3-6: aval
 221 * 8-23: domain id
 222 */
 223struct context_entry {
 224        u64 lo;
 225        u64 hi;
 226};
 227
 228static inline bool context_present(struct context_entry *context)
 229{
 230        return (context->lo & 1);
 231}
 232static inline void context_set_present(struct context_entry *context)
 233{
 234        context->lo |= 1;
 235}
 236
 237static inline void context_set_fault_enable(struct context_entry *context)
 238{
 239        context->lo &= (((u64)-1) << 2) | 1;
 240}
 241
 242static inline void context_set_translation_type(struct context_entry *context,
 243                                                unsigned long value)
 244{
 245        context->lo &= (((u64)-1) << 4) | 3;
 246        context->lo |= (value & 3) << 2;
 247}
 248
 249static inline void context_set_address_root(struct context_entry *context,
 250                                            unsigned long value)
 251{
 252        context->lo |= value & VTD_PAGE_MASK;
 253}
 254
 255static inline void context_set_address_width(struct context_entry *context,
 256                                             unsigned long value)
 257{
 258        context->hi |= value & 7;
 259}
 260
 261static inline void context_set_domain_id(struct context_entry *context,
 262                                         unsigned long value)
 263{
 264        context->hi |= (value & ((1 << 16) - 1)) << 8;
 265}
 266
 267static inline void context_clear_entry(struct context_entry *context)
 268{
 269        context->lo = 0;
 270        context->hi = 0;
 271}
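/*
 * Usage sketch for the helpers above (illustrative only; the real sequence
 * lives in domain_context_mapping_one() further down, "ce" being a
 * struct context_entry *):
 *
 *	context_set_domain_id(ce, domain->id);
 *	context_set_address_root(ce, virt_to_phys(domain->pgd));
 *	context_set_address_width(ce, iommu->agaw);
 *	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ce);
 *	context_set_present(ce);
 */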
 272
 273/*
 274 * 0: readable
 275 * 1: writable
 276 * 2-6: reserved
 277 * 7: super page
 278 * 8-10: available
 279 * 11: snoop behavior
  280 * 12-63: Host physical address
 281 */
 282struct dma_pte {
 283        u64 val;
 284};
 285
 286static inline void dma_clear_pte(struct dma_pte *pte)
 287{
 288        pte->val = 0;
 289}
 290
 291static inline void dma_set_pte_readable(struct dma_pte *pte)
 292{
 293        pte->val |= DMA_PTE_READ;
 294}
 295
 296static inline void dma_set_pte_writable(struct dma_pte *pte)
 297{
 298        pte->val |= DMA_PTE_WRITE;
 299}
 300
 301static inline void dma_set_pte_snp(struct dma_pte *pte)
 302{
 303        pte->val |= DMA_PTE_SNP;
 304}
 305
 306static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 307{
 308        pte->val = (pte->val & ~3) | (prot & 3);
 309}
 310
 311static inline u64 dma_pte_addr(struct dma_pte *pte)
 312{
 313#ifdef CONFIG_64BIT
 314        return pte->val & VTD_PAGE_MASK;
 315#else
 316        /* Must have a full atomic 64-bit read */
 317        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 318#endif
 319}
 320
 321static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 322{
 323        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 324}
 325
 326static inline bool dma_pte_present(struct dma_pte *pte)
 327{
 328        return (pte->val & 3) != 0;
 329}
 330
 331static inline bool dma_pte_superpage(struct dma_pte *pte)
 332{
 333        return (pte->val & (1 << 7));
 334}
 335
 336static inline int first_pte_in_page(struct dma_pte *pte)
 337{
 338        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 339}
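/*
 * Illustrative example (not from the original source, "pte" being a
 * zeroed struct dma_pte *): a leaf entry mapping host pfn 0x12345
 * read/write is built as
 *
 *	dma_set_pte_readable(pte);
 *	dma_set_pte_writable(pte);
 *	dma_set_pte_pfn(pte, 0x12345);
 *
 * giving pte->val == 0x12345003 (bits 0-1 = R/W, bits 12-63 = host address).
 */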
 340
 341/*
  342 * This domain is a static identity mapping domain.
  343 *      1. This domain creates a static 1:1 mapping of all usable memory.
  344 *      2. It maps to each iommu if successful.
  345 *      3. Each iommu maps to this domain if successful.
 346 */
 347static struct dmar_domain *si_domain;
 348static int hw_pass_through = 1;
 349
 350/* devices under the same p2p bridge are owned in one domain */
 351#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 352
  353/* domain represents a virtual machine; more than one device
  354 * across iommus may be owned by one domain, e.g. a kvm guest.
 355 */
 356#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 357
  358/* si_domain contains multiple devices */
 359#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 360
 361/* define the limit of IOMMUs supported in each domain */
 362#ifdef  CONFIG_X86
 363# define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
 364#else
 365# define        IOMMU_UNITS_SUPPORTED   64
 366#endif
 367
 368struct dmar_domain {
 369        int     id;                     /* domain id */
 370        int     nid;                    /* node id */
 371        DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
 372                                        /* bitmap of iommus this domain uses*/
 373
 374        struct list_head devices;       /* all devices' list */
 375        struct iova_domain iovad;       /* iova's that belong to this domain */
 376
 377        struct dma_pte  *pgd;           /* virtual address */
 378        int             gaw;            /* max guest address width */
 379
 380        /* adjusted guest address width, 0 is level 2 30-bit */
 381        int             agaw;
 382
 383        int             flags;          /* flags to find out type of domain */
 384
 385        int             iommu_coherency;/* indicate coherency of iommu access */
 386        int             iommu_snooping; /* indicate snooping control feature*/
 387        int             iommu_count;    /* reference count of iommu */
 388        int             iommu_superpage;/* Level of superpages supported:
 389                                           0 == 4KiB (no superpages), 1 == 2MiB,
 390                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 391        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 392        u64             max_addr;       /* maximum mapped address */
 393};
 394
 395/* PCI domain-device relationship */
 396struct device_domain_info {
 397        struct list_head link;  /* link to domain siblings */
 398        struct list_head global; /* link to global list */
 399        int segment;            /* PCI domain */
 400        u8 bus;                 /* PCI bus number */
 401        u8 devfn;               /* PCI devfn number */
 402        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 403        struct intel_iommu *iommu; /* IOMMU used by this device */
 404        struct dmar_domain *domain; /* pointer to domain */
 405};
 406
 407static void flush_unmaps_timeout(unsigned long data);
 408
 409DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 410
 411#define HIGH_WATER_MARK 250
 412struct deferred_flush_tables {
 413        int next;
 414        struct iova *iova[HIGH_WATER_MARK];
 415        struct dmar_domain *domain[HIGH_WATER_MARK];
 416};
 417
 418static struct deferred_flush_tables *deferred_flush;
 419
  420/* number of IOMMUs; used to size the bitmaps indexing intel_iommus */
 421static int g_num_of_iommus;
 422
 423static DEFINE_SPINLOCK(async_umap_flush_lock);
 424static LIST_HEAD(unmaps_to_do);
 425
 426static int timer_on;
 427static long list_size;
 428
 429static void domain_remove_dev_info(struct dmar_domain *domain);
 430
 431#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 432int dmar_disabled = 0;
 433#else
 434int dmar_disabled = 1;
 435#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 436
 437int intel_iommu_enabled = 0;
 438EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 439
 440static int dmar_map_gfx = 1;
 441static int dmar_forcedac;
 442static int intel_iommu_strict;
 443static int intel_iommu_superpage = 1;
 444
 445int intel_iommu_gfx_mapped;
 446EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 447
 448#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 449static DEFINE_SPINLOCK(device_domain_lock);
 450static LIST_HEAD(device_domain_list);
 451
 452static struct iommu_ops intel_iommu_ops;
 453
 454static int __init intel_iommu_setup(char *str)
 455{
 456        if (!str)
 457                return -EINVAL;
 458        while (*str) {
 459                if (!strncmp(str, "on", 2)) {
 460                        dmar_disabled = 0;
 461                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 462                } else if (!strncmp(str, "off", 3)) {
 463                        dmar_disabled = 1;
 464                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 465                } else if (!strncmp(str, "igfx_off", 8)) {
 466                        dmar_map_gfx = 0;
 467                        printk(KERN_INFO
 468                                "Intel-IOMMU: disable GFX device mapping\n");
 469                } else if (!strncmp(str, "forcedac", 8)) {
 470                        printk(KERN_INFO
 471                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 472                        dmar_forcedac = 1;
 473                } else if (!strncmp(str, "strict", 6)) {
 474                        printk(KERN_INFO
 475                                "Intel-IOMMU: disable batched IOTLB flush\n");
 476                        intel_iommu_strict = 1;
 477                } else if (!strncmp(str, "sp_off", 6)) {
 478                        printk(KERN_INFO
 479                                "Intel-IOMMU: disable supported super page\n");
 480                        intel_iommu_superpage = 0;
 481                }
 482
 483                str += strcspn(str, ",");
 484                while (*str == ',')
 485                        str++;
 486        }
 487        return 0;
 488}
 489__setup("intel_iommu=", intel_iommu_setup);
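/*
 * Example kernel command line usage of the options parsed above
 * (illustrative): "intel_iommu=on,strict,sp_off" enables the IOMMU,
 * disables batched IOTLB flushing and disables superpage support.
 * Options are comma-separated and unrecognised tokens are silently skipped.
 */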
 490
 491static struct kmem_cache *iommu_domain_cache;
 492static struct kmem_cache *iommu_devinfo_cache;
 493static struct kmem_cache *iommu_iova_cache;
 494
 495static inline void *alloc_pgtable_page(int node)
 496{
 497        struct page *page;
 498        void *vaddr = NULL;
 499
 500        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 501        if (page)
 502                vaddr = page_address(page);
 503        return vaddr;
 504}
 505
 506static inline void free_pgtable_page(void *vaddr)
 507{
 508        free_page((unsigned long)vaddr);
 509}
 510
 511static inline void *alloc_domain_mem(void)
 512{
 513        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 514}
 515
 516static void free_domain_mem(void *vaddr)
 517{
 518        kmem_cache_free(iommu_domain_cache, vaddr);
 519}
 520
 521static inline void * alloc_devinfo_mem(void)
 522{
 523        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 524}
 525
 526static inline void free_devinfo_mem(void *vaddr)
 527{
 528        kmem_cache_free(iommu_devinfo_cache, vaddr);
 529}
 530
 531struct iova *alloc_iova_mem(void)
 532{
 533        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 534}
 535
 536void free_iova_mem(struct iova *iova)
 537{
 538        kmem_cache_free(iommu_iova_cache, iova);
 539}
 540
 541
 542static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 543{
 544        unsigned long sagaw;
 545        int agaw = -1;
 546
 547        sagaw = cap_sagaw(iommu->cap);
 548        for (agaw = width_to_agaw(max_gaw);
 549             agaw >= 0; agaw--) {
 550                if (test_bit(agaw, &sagaw))
 551                        break;
 552        }
 553
 554        return agaw;
 555}
 556
 557/*
 558 * Calculate max SAGAW for each iommu.
 559 */
 560int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 561{
 562        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 563}
 564
 565/*
 566 * calculate agaw for each iommu.
 567 * "SAGAW" may be different across iommus, use a default agaw, and
 568 * get a supported less agaw for iommus that don't support the default agaw.
 569 */
 570int iommu_calculate_agaw(struct intel_iommu *iommu)
 571{
 572        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 573}
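/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48,
 * width_to_agaw(48) == 2, so an iommu whose SAGAW field has bit 2 set
 * (4-level tables) gets agaw 2; if only bit 1 were set, the loop in
 * __iommu_calculate_agaw() would fall back to agaw 1 (3-level, 39-bit).
 */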
 574
  575/* This function only returns a single iommu in a domain */
 576static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 577{
 578        int iommu_id;
 579
 580        /* si_domain and vm domain should not get here. */
 581        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 582        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 583
 584        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 585        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 586                return NULL;
 587
 588        return g_iommus[iommu_id];
 589}
 590
 591static void domain_update_iommu_coherency(struct dmar_domain *domain)
 592{
 593        int i;
 594
 595        i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 596
 597        domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
 598
 599        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 600                if (!ecap_coherent(g_iommus[i]->ecap)) {
 601                        domain->iommu_coherency = 0;
 602                        break;
 603                }
 604        }
 605}
 606
 607static void domain_update_iommu_snooping(struct dmar_domain *domain)
 608{
 609        int i;
 610
 611        domain->iommu_snooping = 1;
 612
 613        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 614                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 615                        domain->iommu_snooping = 0;
 616                        break;
 617                }
 618        }
 619}
 620
 621static void domain_update_iommu_superpage(struct dmar_domain *domain)
 622{
 623        struct dmar_drhd_unit *drhd;
 624        struct intel_iommu *iommu = NULL;
 625        int mask = 0xf;
 626
 627        if (!intel_iommu_superpage) {
 628                domain->iommu_superpage = 0;
 629                return;
 630        }
 631
 632        /* set iommu_superpage to the smallest common denominator */
 633        for_each_active_iommu(iommu, drhd) {
 634                mask &= cap_super_page_val(iommu->cap);
 635                if (!mask) {
 636                        break;
 637                }
 638        }
 639        domain->iommu_superpage = fls(mask);
 640}
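/*
 * Worked example (illustrative): if every active iommu reports
 * cap_super_page_val() == 0x1 (2MiB superpages only), mask stays 0x1 and
 * fls(0x1) == 1, so iommu_superpage == 1 (2MiB).  If any iommu reports 0,
 * mask becomes 0 and superpages are disabled for the domain.
 */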
 641
 642/* Some capabilities may be different across iommus */
 643static void domain_update_iommu_cap(struct dmar_domain *domain)
 644{
 645        domain_update_iommu_coherency(domain);
 646        domain_update_iommu_snooping(domain);
 647        domain_update_iommu_superpage(domain);
 648}
 649
 650static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 651{
 652        struct dmar_drhd_unit *drhd = NULL;
 653        int i;
 654
 655        for_each_drhd_unit(drhd) {
 656                if (drhd->ignored)
 657                        continue;
 658                if (segment != drhd->segment)
 659                        continue;
 660
 661                for (i = 0; i < drhd->devices_cnt; i++) {
 662                        if (drhd->devices[i] &&
 663                            drhd->devices[i]->bus->number == bus &&
 664                            drhd->devices[i]->devfn == devfn)
 665                                return drhd->iommu;
 666                        if (drhd->devices[i] &&
 667                            drhd->devices[i]->subordinate &&
 668                            drhd->devices[i]->subordinate->number <= bus &&
 669                            drhd->devices[i]->subordinate->busn_res.end >= bus)
 670                                return drhd->iommu;
 671                }
 672
 673                if (drhd->include_all)
 674                        return drhd->iommu;
 675        }
 676
 677        return NULL;
 678}
 679
 680static void domain_flush_cache(struct dmar_domain *domain,
 681                               void *addr, int size)
 682{
 683        if (!domain->iommu_coherency)
 684                clflush_cache_range(addr, size);
 685}
 686
 687/* Gets context entry for a given bus and devfn */
 688static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 689                u8 bus, u8 devfn)
 690{
 691        struct root_entry *root;
 692        struct context_entry *context;
 693        unsigned long phy_addr;
 694        unsigned long flags;
 695
 696        spin_lock_irqsave(&iommu->lock, flags);
 697        root = &iommu->root_entry[bus];
 698        context = get_context_addr_from_root(root);
 699        if (!context) {
 700                context = (struct context_entry *)
 701                                alloc_pgtable_page(iommu->node);
 702                if (!context) {
 703                        spin_unlock_irqrestore(&iommu->lock, flags);
 704                        return NULL;
 705                }
 706                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 707                phy_addr = virt_to_phys((void *)context);
 708                set_root_value(root, phy_addr);
 709                set_root_present(root);
 710                __iommu_flush_cache(iommu, root, sizeof(*root));
 711        }
 712        spin_unlock_irqrestore(&iommu->lock, flags);
 713        return &context[devfn];
 714}
 715
 716static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 717{
 718        struct root_entry *root;
 719        struct context_entry *context;
 720        int ret;
 721        unsigned long flags;
 722
 723        spin_lock_irqsave(&iommu->lock, flags);
 724        root = &iommu->root_entry[bus];
 725        context = get_context_addr_from_root(root);
 726        if (!context) {
 727                ret = 0;
 728                goto out;
 729        }
 730        ret = context_present(&context[devfn]);
 731out:
 732        spin_unlock_irqrestore(&iommu->lock, flags);
 733        return ret;
 734}
 735
 736static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 737{
 738        struct root_entry *root;
 739        struct context_entry *context;
 740        unsigned long flags;
 741
 742        spin_lock_irqsave(&iommu->lock, flags);
 743        root = &iommu->root_entry[bus];
 744        context = get_context_addr_from_root(root);
 745        if (context) {
 746                context_clear_entry(&context[devfn]);
 747                __iommu_flush_cache(iommu, &context[devfn], \
 748                        sizeof(*context));
 749        }
 750        spin_unlock_irqrestore(&iommu->lock, flags);
 751}
 752
 753static void free_context_table(struct intel_iommu *iommu)
 754{
 755        struct root_entry *root;
 756        int i;
 757        unsigned long flags;
 758        struct context_entry *context;
 759
 760        spin_lock_irqsave(&iommu->lock, flags);
 761        if (!iommu->root_entry) {
 762                goto out;
 763        }
 764        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 765                root = &iommu->root_entry[i];
 766                context = get_context_addr_from_root(root);
 767                if (context)
 768                        free_pgtable_page(context);
 769        }
 770        free_pgtable_page(iommu->root_entry);
 771        iommu->root_entry = NULL;
 772out:
 773        spin_unlock_irqrestore(&iommu->lock, flags);
 774}
 775
 776static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 777                                      unsigned long pfn, int target_level)
 778{
 779        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 780        struct dma_pte *parent, *pte = NULL;
 781        int level = agaw_to_level(domain->agaw);
 782        int offset;
 783
 784        BUG_ON(!domain->pgd);
 785        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 786        parent = domain->pgd;
 787
 788        while (level > 0) {
 789                void *tmp_page;
 790
 791                offset = pfn_level_offset(pfn, level);
 792                pte = &parent[offset];
 793                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 794                        break;
 795                if (level == target_level)
 796                        break;
 797
 798                if (!dma_pte_present(pte)) {
 799                        uint64_t pteval;
 800
 801                        tmp_page = alloc_pgtable_page(domain->nid);
 802
 803                        if (!tmp_page)
 804                                return NULL;
 805
 806                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 807                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 808                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 809                                /* Someone else set it while we were thinking; use theirs. */
 810                                free_pgtable_page(tmp_page);
 811                        } else {
 812                                dma_pte_addr(pte);
 813                                domain_flush_cache(domain, pte, sizeof(*pte));
 814                        }
 815                }
 816                parent = phys_to_virt(dma_pte_addr(pte));
 817                level--;
 818        }
 819
 820        return pte;
 821}
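/*
 * Note on target_level (illustrative): a caller passing 1 gets the leaf
 * (4KiB-level) PTE, with intermediate tables allocated as needed; passing 0
 * walks down without allocating and stops at the first superpage or
 * non-present entry (or the leaf), which suits read-only lookups.
 */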
 822
 823
 824/* return address's pte at specific level */
 825static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 826                                         unsigned long pfn,
 827                                         int level, int *large_page)
 828{
 829        struct dma_pte *parent, *pte = NULL;
 830        int total = agaw_to_level(domain->agaw);
 831        int offset;
 832
 833        parent = domain->pgd;
 834        while (level <= total) {
 835                offset = pfn_level_offset(pfn, total);
 836                pte = &parent[offset];
 837                if (level == total)
 838                        return pte;
 839
 840                if (!dma_pte_present(pte)) {
 841                        *large_page = total;
 842                        break;
 843                }
 844
 845                if (pte->val & DMA_PTE_LARGE_PAGE) {
 846                        *large_page = total;
 847                        return pte;
 848                }
 849
 850                parent = phys_to_virt(dma_pte_addr(pte));
 851                total--;
 852        }
 853        return NULL;
 854}
 855
 856/* clear last level pte, a tlb flush should be followed */
 857static int dma_pte_clear_range(struct dmar_domain *domain,
 858                                unsigned long start_pfn,
 859                                unsigned long last_pfn)
 860{
 861        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 862        unsigned int large_page = 1;
 863        struct dma_pte *first_pte, *pte;
 864        int order;
 865
 866        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 867        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 868        BUG_ON(start_pfn > last_pfn);
 869
 870        /* we don't need lock here; nobody else touches the iova range */
 871        do {
 872                large_page = 1;
 873                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 874                if (!pte) {
 875                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 876                        continue;
 877                }
 878                do {
 879                        dma_clear_pte(pte);
 880                        start_pfn += lvl_to_nr_pages(large_page);
 881                        pte++;
 882                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 883
 884                domain_flush_cache(domain, first_pte,
 885                                   (void *)pte - (void *)first_pte);
 886
 887        } while (start_pfn && start_pfn <= last_pfn);
 888
 889        order = (large_page - 1) * 9;
 890        return order;
 891}
 892
 893/* free page table pages. last level pte should already be cleared */
 894static void dma_pte_free_pagetable(struct dmar_domain *domain,
 895                                   unsigned long start_pfn,
 896                                   unsigned long last_pfn)
 897{
 898        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 899        struct dma_pte *first_pte, *pte;
 900        int total = agaw_to_level(domain->agaw);
 901        int level;
 902        unsigned long tmp;
 903        int large_page = 2;
 904
 905        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 906        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 907        BUG_ON(start_pfn > last_pfn);
 908
 909        /* We don't need lock here; nobody else touches the iova range */
 910        level = 2;
 911        while (level <= total) {
 912                tmp = align_to_level(start_pfn, level);
 913
 914                /* If we can't even clear one PTE at this level, we're done */
 915                if (tmp + level_size(level) - 1 > last_pfn)
 916                        return;
 917
 918                do {
 919                        large_page = level;
 920                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
 921                        if (large_page > level)
 922                                level = large_page + 1;
 923                        if (!pte) {
 924                                tmp = align_to_level(tmp + 1, level + 1);
 925                                continue;
 926                        }
 927                        do {
 928                                if (dma_pte_present(pte)) {
 929                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 930                                        dma_clear_pte(pte);
 931                                }
 932                                pte++;
 933                                tmp += level_size(level);
 934                        } while (!first_pte_in_page(pte) &&
 935                                 tmp + level_size(level) - 1 <= last_pfn);
 936
 937                        domain_flush_cache(domain, first_pte,
 938                                           (void *)pte - (void *)first_pte);
  939
 940                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 941                level++;
 942        }
 943        /* free pgd */
 944        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 945                free_pgtable_page(domain->pgd);
 946                domain->pgd = NULL;
 947        }
 948}
 949
 950/* iommu handling */
 951static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 952{
 953        struct root_entry *root;
 954        unsigned long flags;
 955
 956        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 957        if (!root)
 958                return -ENOMEM;
 959
 960        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 961
 962        spin_lock_irqsave(&iommu->lock, flags);
 963        iommu->root_entry = root;
 964        spin_unlock_irqrestore(&iommu->lock, flags);
 965
 966        return 0;
 967}
 968
 969static void iommu_set_root_entry(struct intel_iommu *iommu)
 970{
 971        void *addr;
 972        u32 sts;
 973        unsigned long flag;
 974
 975        addr = iommu->root_entry;
 976
 977        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 978        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 979
 980        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 981
  982        /* Make sure hardware completes it */
 983        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 984                      readl, (sts & DMA_GSTS_RTPS), sts);
 985
 986        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 987}
 988
 989static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 990{
 991        u32 val;
 992        unsigned long flag;
 993
 994        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 995                return;
 996
 997        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 998        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 999
 1000        /* Make sure hardware completes it */
1001        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1002                      readl, (!(val & DMA_GSTS_WBFS)), val);
1003
1004        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1005}
1006
 1007/* return value determines if we need a write buffer flush */
1008static void __iommu_flush_context(struct intel_iommu *iommu,
1009                                  u16 did, u16 source_id, u8 function_mask,
1010                                  u64 type)
1011{
1012        u64 val = 0;
1013        unsigned long flag;
1014
1015        switch (type) {
1016        case DMA_CCMD_GLOBAL_INVL:
1017                val = DMA_CCMD_GLOBAL_INVL;
1018                break;
1019        case DMA_CCMD_DOMAIN_INVL:
1020                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1021                break;
1022        case DMA_CCMD_DEVICE_INVL:
1023                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1024                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1025                break;
1026        default:
1027                BUG();
1028        }
1029        val |= DMA_CCMD_ICC;
1030
1031        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1032        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1033
 1034        /* Make sure hardware completes it */
1035        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1036                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1037
1038        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1039}
1040
 1041/* return value determines if we need a write buffer flush */
1042static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1043                                u64 addr, unsigned int size_order, u64 type)
1044{
1045        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1046        u64 val = 0, val_iva = 0;
1047        unsigned long flag;
1048
1049        switch (type) {
1050        case DMA_TLB_GLOBAL_FLUSH:
 1051                /* a global flush doesn't need to set IVA_REG */
1052                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1053                break;
1054        case DMA_TLB_DSI_FLUSH:
1055                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1056                break;
1057        case DMA_TLB_PSI_FLUSH:
1058                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1059                /* Note: always flush non-leaf currently */
1060                val_iva = size_order | addr;
1061                break;
1062        default:
1063                BUG();
1064        }
1065        /* Note: set drain read/write */
1066#if 0
1067        /*
 1068         * This is probably just to be extra safe. It looks like we can
 1069         * ignore it without any impact.
1070         */
1071        if (cap_read_drain(iommu->cap))
1072                val |= DMA_TLB_READ_DRAIN;
1073#endif
1074        if (cap_write_drain(iommu->cap))
1075                val |= DMA_TLB_WRITE_DRAIN;
1076
1077        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1078        /* Note: Only uses first TLB reg currently */
1079        if (val_iva)
1080                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1081        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1082
 1083        /* Make sure hardware completes it */
1084        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1085                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1086
1087        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1088
1089        /* check IOTLB invalidation granularity */
1090        if (DMA_TLB_IAIG(val) == 0)
 1091                printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1092        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1093                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1094                        (unsigned long long)DMA_TLB_IIRG(type),
1095                        (unsigned long long)DMA_TLB_IAIG(val));
1096}
1097
1098static struct device_domain_info *iommu_support_dev_iotlb(
1099        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1100{
1101        int found = 0;
1102        unsigned long flags;
1103        struct device_domain_info *info;
1104        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1105
 1106        if (!iommu || !ecap_dev_iotlb_support(iommu->ecap))
1107                return NULL;
1108
1109        if (!iommu->qi)
1110                return NULL;
1111
1112        spin_lock_irqsave(&device_domain_lock, flags);
1113        list_for_each_entry(info, &domain->devices, link)
1114                if (info->bus == bus && info->devfn == devfn) {
1115                        found = 1;
1116                        break;
1117                }
1118        spin_unlock_irqrestore(&device_domain_lock, flags);
1119
1120        if (!found || !info->dev)
1121                return NULL;
1122
1123        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1124                return NULL;
1125
1126        if (!dmar_find_matched_atsr_unit(info->dev))
1127                return NULL;
1128
1129        info->iommu = iommu;
1130
1131        return info;
1132}
1133
1134static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1135{
1136        if (!info)
1137                return;
1138
1139        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1140}
1141
1142static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1143{
1144        if (!info->dev || !pci_ats_enabled(info->dev))
1145                return;
1146
1147        pci_disable_ats(info->dev);
1148}
1149
1150static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1151                                  u64 addr, unsigned mask)
1152{
1153        u16 sid, qdep;
1154        unsigned long flags;
1155        struct device_domain_info *info;
1156
1157        spin_lock_irqsave(&device_domain_lock, flags);
1158        list_for_each_entry(info, &domain->devices, link) {
1159                if (!info->dev || !pci_ats_enabled(info->dev))
1160                        continue;
1161
1162                sid = info->bus << 8 | info->devfn;
1163                qdep = pci_ats_queue_depth(info->dev);
1164                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1165        }
1166        spin_unlock_irqrestore(&device_domain_lock, flags);
1167}
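/*
 * Illustrative note: the source-id built above is the PCI requester id,
 * e.g. a device at bus 0x03, devfn 0x10 (slot 2, function 0) yields
 * sid == 0x0310.
 */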
1168
1169static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1170                                  unsigned long pfn, unsigned int pages, int map)
1171{
1172        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1173        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1174
1175        BUG_ON(pages == 0);
1176
1177        /*
 1178         * Fall back to domain-selective flush if there is no PSI support or
 1179         * the size is too big.
1180         * PSI requires page size to be 2 ^ x, and the base address is naturally
1181         * aligned to the size
1182         */
1183        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1184                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1185                                                DMA_TLB_DSI_FLUSH);
1186        else
1187                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1188                                                DMA_TLB_PSI_FLUSH);
1189
1190        /*
1191         * In caching mode, changes of pages from non-present to present require
1192         * flush. However, device IOTLB doesn't need to be flushed in this case.
1193         */
1194        if (!cap_caching_mode(iommu->cap) || !map)
1195                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1196}
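/*
 * Worked example for the mask computation above (illustrative): flushing
 * pages == 10 rounds up to 16, so mask == ilog2(16) == 4 and the PSI
 * invalidation covers 2^4 = 16 VT-d pages starting at addr.
 */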
1197
1198static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1199{
1200        u32 pmen;
1201        unsigned long flags;
1202
1203        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1204        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1205        pmen &= ~DMA_PMEN_EPM;
1206        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1207
1208        /* wait for the protected region status bit to clear */
1209        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1210                readl, !(pmen & DMA_PMEN_PRS), pmen);
1211
1212        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1213}
1214
1215static int iommu_enable_translation(struct intel_iommu *iommu)
1216{
1217        u32 sts;
1218        unsigned long flags;
1219
1220        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1221        iommu->gcmd |= DMA_GCMD_TE;
1222        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1223
 1224        /* Make sure hardware completes it */
1225        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1226                      readl, (sts & DMA_GSTS_TES), sts);
1227
1228        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1229        return 0;
1230}
1231
1232static int iommu_disable_translation(struct intel_iommu *iommu)
1233{
1234        u32 sts;
1235        unsigned long flag;
1236
1237        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1238        iommu->gcmd &= ~DMA_GCMD_TE;
1239        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1240
 1241        /* Make sure hardware completes it */
1242        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1243                      readl, (!(sts & DMA_GSTS_TES)), sts);
1244
1245        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1246        return 0;
1247}
1248
1249
1250static int iommu_init_domains(struct intel_iommu *iommu)
1251{
1252        unsigned long ndomains;
1253        unsigned long nlongs;
1254
1255        ndomains = cap_ndoms(iommu->cap);
1256        pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1257                        ndomains);
1258        nlongs = BITS_TO_LONGS(ndomains);
1259
1260        spin_lock_init(&iommu->lock);
1261
1262        /* TBD: there might be 64K domains,
1263         * consider other allocation for future chip
1264         */
1265        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1266        if (!iommu->domain_ids) {
1267                printk(KERN_ERR "Allocating domain id array failed\n");
1268                return -ENOMEM;
1269        }
1270        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1271                        GFP_KERNEL);
1272        if (!iommu->domains) {
1273                printk(KERN_ERR "Allocating domain array failed\n");
1274                return -ENOMEM;
1275        }
1276
1277        /*
 1278         * If Caching Mode is set, then invalid translations are tagged
 1279         * with domain id 0. Hence we need to reserve it.
1280         */
1281        if (cap_caching_mode(iommu->cap))
1282                set_bit(0, iommu->domain_ids);
1283        return 0;
1284}
1285
1286
1287static void domain_exit(struct dmar_domain *domain);
1288static void vm_domain_exit(struct dmar_domain *domain);
1289
1290void free_dmar_iommu(struct intel_iommu *iommu)
1291{
1292        struct dmar_domain *domain;
1293        int i;
1294        unsigned long flags;
1295
1296        if ((iommu->domains) && (iommu->domain_ids)) {
1297                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1298                        domain = iommu->domains[i];
1299                        clear_bit(i, iommu->domain_ids);
1300
1301                        spin_lock_irqsave(&domain->iommu_lock, flags);
1302                        if (--domain->iommu_count == 0) {
1303                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1304                                        vm_domain_exit(domain);
1305                                else
1306                                        domain_exit(domain);
1307                        }
1308                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1309                }
1310        }
1311
1312        if (iommu->gcmd & DMA_GCMD_TE)
1313                iommu_disable_translation(iommu);
1314
1315        if (iommu->irq) {
1316                irq_set_handler_data(iommu->irq, NULL);
1317                /* This will mask the irq */
1318                free_irq(iommu->irq, iommu);
1319                destroy_irq(iommu->irq);
1320        }
1321
1322        kfree(iommu->domains);
1323        kfree(iommu->domain_ids);
1324
1325        g_iommus[iommu->seq_id] = NULL;
1326
1327        /* if all iommus are freed, free g_iommus */
1328        for (i = 0; i < g_num_of_iommus; i++) {
1329                if (g_iommus[i])
1330                        break;
1331        }
1332
1333        if (i == g_num_of_iommus)
1334                kfree(g_iommus);
1335
1336        /* free context mapping */
1337        free_context_table(iommu);
1338}
1339
1340static struct dmar_domain *alloc_domain(void)
1341{
1342        struct dmar_domain *domain;
1343
1344        domain = alloc_domain_mem();
1345        if (!domain)
1346                return NULL;
1347
1348        domain->nid = -1;
1349        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1350        domain->flags = 0;
1351
1352        return domain;
1353}
1354
1355static int iommu_attach_domain(struct dmar_domain *domain,
1356                               struct intel_iommu *iommu)
1357{
1358        int num;
1359        unsigned long ndomains;
1360        unsigned long flags;
1361
1362        ndomains = cap_ndoms(iommu->cap);
1363
1364        spin_lock_irqsave(&iommu->lock, flags);
1365
1366        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1367        if (num >= ndomains) {
1368                spin_unlock_irqrestore(&iommu->lock, flags);
1369                printk(KERN_ERR "IOMMU: no free domain ids\n");
1370                return -ENOMEM;
1371        }
1372
1373        domain->id = num;
1374        set_bit(num, iommu->domain_ids);
1375        set_bit(iommu->seq_id, domain->iommu_bmp);
1376        iommu->domains[num] = domain;
1377        spin_unlock_irqrestore(&iommu->lock, flags);
1378
1379        return 0;
1380}
1381
1382static void iommu_detach_domain(struct dmar_domain *domain,
1383                                struct intel_iommu *iommu)
1384{
1385        unsigned long flags;
1386        int num, ndomains;
1387        int found = 0;
1388
1389        spin_lock_irqsave(&iommu->lock, flags);
1390        ndomains = cap_ndoms(iommu->cap);
1391        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1392                if (iommu->domains[num] == domain) {
1393                        found = 1;
1394                        break;
1395                }
1396        }
1397
1398        if (found) {
1399                clear_bit(num, iommu->domain_ids);
1400                clear_bit(iommu->seq_id, domain->iommu_bmp);
1401                iommu->domains[num] = NULL;
1402        }
1403        spin_unlock_irqrestore(&iommu->lock, flags);
1404}
1405
1406static struct iova_domain reserved_iova_list;
1407static struct lock_class_key reserved_rbtree_key;
1408
1409static int dmar_init_reserved_ranges(void)
1410{
1411        struct pci_dev *pdev = NULL;
1412        struct iova *iova;
1413        int i;
1414
1415        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1416
1417        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1418                &reserved_rbtree_key);
1419
1420        /* IOAPIC ranges shouldn't be accessed by DMA */
1421        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1422                IOVA_PFN(IOAPIC_RANGE_END));
1423        if (!iova) {
1424                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1425                return -ENODEV;
1426        }
1427
1428        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1429        for_each_pci_dev(pdev) {
1430                struct resource *r;
1431
1432                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1433                        r = &pdev->resource[i];
1434                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1435                                continue;
1436                        iova = reserve_iova(&reserved_iova_list,
1437                                            IOVA_PFN(r->start),
1438                                            IOVA_PFN(r->end));
1439                        if (!iova) {
1440                                printk(KERN_ERR "Reserve iova failed\n");
1441                                return -ENODEV;
1442                        }
1443                }
1444        }
1445        return 0;
1446}
1447
1448static void domain_reserve_special_ranges(struct dmar_domain *domain)
1449{
1450        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1451}
1452
1453static inline int guestwidth_to_adjustwidth(int gaw)
1454{
1455        int agaw;
1456        int r = (gaw - 12) % 9;
1457
1458        if (r == 0)
1459                agaw = gaw;
1460        else
1461                agaw = gaw + 9 - r;
1462        if (agaw > 64)
1463                agaw = 64;
1464        return agaw;
1465}
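/*
 * Worked examples (illustrative): gaw 48 gives (48 - 12) % 9 == 0, so the
 * adjusted width stays 48; gaw 36 gives remainder 6 and is rounded up to
 * 36 + 9 - 6 = 39, the next width expressible with whole 9-bit levels.
 */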
1466
1467static int domain_init(struct dmar_domain *domain, int guest_width)
1468{
1469        struct intel_iommu *iommu;
1470        int adjust_width, agaw;
1471        unsigned long sagaw;
1472
1473        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1474        spin_lock_init(&domain->iommu_lock);
1475
1476        domain_reserve_special_ranges(domain);
1477
1478        /* calculate AGAW */
1479        iommu = domain_get_iommu(domain);
1480        if (guest_width > cap_mgaw(iommu->cap))
1481                guest_width = cap_mgaw(iommu->cap);
1482        domain->gaw = guest_width;
1483        adjust_width = guestwidth_to_adjustwidth(guest_width);
1484        agaw = width_to_agaw(adjust_width);
1485        sagaw = cap_sagaw(iommu->cap);
1486        if (!test_bit(agaw, &sagaw)) {
1487                /* hardware doesn't support it, choose a bigger one */
1488                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1489                agaw = find_next_bit(&sagaw, 5, agaw);
1490                if (agaw >= 5)
1491                        return -ENODEV;
1492        }
1493        domain->agaw = agaw;
1494        INIT_LIST_HEAD(&domain->devices);
1495
1496        if (ecap_coherent(iommu->ecap))
1497                domain->iommu_coherency = 1;
1498        else
1499                domain->iommu_coherency = 0;
1500
1501        if (ecap_sc_support(iommu->ecap))
1502                domain->iommu_snooping = 1;
1503        else
1504                domain->iommu_snooping = 0;
1505
1506        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1507        domain->iommu_count = 1;
1508        domain->nid = iommu->node;
1509
1510        /* always allocate the top pgd */
1511        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1512        if (!domain->pgd)
1513                return -ENOMEM;
1514        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1515        return 0;
1516}
1517
1518static void domain_exit(struct dmar_domain *domain)
1519{
1520        struct dmar_drhd_unit *drhd;
1521        struct intel_iommu *iommu;
1522
 1523        /* Domain 0 is reserved, so don't process it */
1524        if (!domain)
1525                return;
1526
1527        /* Flush any lazy unmaps that may reference this domain */
1528        if (!intel_iommu_strict)
1529                flush_unmaps_timeout(0);
1530
1531        domain_remove_dev_info(domain);
1532        /* destroy iovas */
1533        put_iova_domain(&domain->iovad);
1534
1535        /* clear ptes */
1536        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1537
1538        /* free page tables */
1539        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1540
1541        for_each_active_iommu(iommu, drhd)
1542                if (test_bit(iommu->seq_id, domain->iommu_bmp))
1543                        iommu_detach_domain(domain, iommu);
1544
1545        free_domain_mem(domain);
1546}
1547
1548static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1549                                 u8 bus, u8 devfn, int translation)
1550{
1551        struct context_entry *context;
1552        unsigned long flags;
1553        struct intel_iommu *iommu;
1554        struct dma_pte *pgd;
1555        unsigned long num;
1556        unsigned long ndomains;
1557        int id;
1558        int agaw;
1559        struct device_domain_info *info = NULL;
1560
1561        pr_debug("Set context mapping for %02x:%02x.%d\n",
1562                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1563
1564        BUG_ON(!domain->pgd);
1565        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1566               translation != CONTEXT_TT_MULTI_LEVEL);
1567
1568        iommu = device_to_iommu(segment, bus, devfn);
1569        if (!iommu)
1570                return -ENODEV;
1571
1572        context = device_to_context_entry(iommu, bus, devfn);
1573        if (!context)
1574                return -ENOMEM;
1575        spin_lock_irqsave(&iommu->lock, flags);
1576        if (context_present(context)) {
1577                spin_unlock_irqrestore(&iommu->lock, flags);
1578                return 0;
1579        }
1580
1581        id = domain->id;
1582        pgd = domain->pgd;
1583
1584        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1585            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1586                int found = 0;
1587
1588                /* find an available domain id for this device in iommu */
1589                ndomains = cap_ndoms(iommu->cap);
1590                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1591                        if (iommu->domains[num] == domain) {
1592                                id = num;
1593                                found = 1;
1594                                break;
1595                        }
1596                }
1597
1598                if (found == 0) {
1599                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1600                        if (num >= ndomains) {
1601                                spin_unlock_irqrestore(&iommu->lock, flags);
1602                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1603                                return -EFAULT;
1604                        }
1605
1606                        set_bit(num, iommu->domain_ids);
1607                        iommu->domains[num] = domain;
1608                        id = num;
1609                }
1610
1611                /* Skip top levels of page tables for an
1612                 * iommu whose agaw is smaller than the default.
1613                 * Unnecessary for PT mode.
1614                 */
1615                if (translation != CONTEXT_TT_PASS_THROUGH) {
1616                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1617                                pgd = phys_to_virt(dma_pte_addr(pgd));
1618                                if (!dma_pte_present(pgd)) {
1619                                        spin_unlock_irqrestore(&iommu->lock, flags);
1620                                        return -ENOMEM;
1621                                }
1622                        }
1623                }
1624        }
1625
1626        context_set_domain_id(context, id);
1627
1628        if (translation != CONTEXT_TT_PASS_THROUGH) {
1629                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1630                translation = info ? CONTEXT_TT_DEV_IOTLB :
1631                                     CONTEXT_TT_MULTI_LEVEL;
1632        }
1633        /*
1634         * In pass through mode, AW must be programmed to indicate the largest
1635         * AGAW value supported by hardware. And ASR is ignored by hardware.
1636         */
1637        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1638                context_set_address_width(context, iommu->msagaw);
1639        else {
1640                context_set_address_root(context, virt_to_phys(pgd));
1641                context_set_address_width(context, iommu->agaw);
1642        }
1643
1644        context_set_translation_type(context, translation);
1645        context_set_fault_enable(context);
1646        context_set_present(context);
1647        domain_flush_cache(domain, context, sizeof(*context));
1648
1649        /*
1650         * It's a non-present to present mapping. If hardware doesn't cache
1651         * non-present entries, we only need to flush the write-buffer. If it
1652         * _does_ cache non-present entries, then it does so in the special
1653         * domain #0, which we have to flush:
1654         */
1655        if (cap_caching_mode(iommu->cap)) {
1656                iommu->flush.flush_context(iommu, 0,
1657                                           (((u16)bus) << 8) | devfn,
1658                                           DMA_CCMD_MASK_NOBIT,
1659                                           DMA_CCMD_DEVICE_INVL);
1660                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1661        } else {
1662                iommu_flush_write_buffer(iommu);
1663        }
1664        iommu_enable_dev_iotlb(info);
1665        spin_unlock_irqrestore(&iommu->lock, flags);
1666
1667        spin_lock_irqsave(&domain->iommu_lock, flags);
1668        if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1669                domain->iommu_count++;
1670                if (domain->iommu_count == 1)
1671                        domain->nid = iommu->node;
1672                domain_update_iommu_cap(domain);
1673        }
1674        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1675        return 0;
1676}
1677
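/*
 * Map the device itself and, if it sits behind a PCIe-to-PCI bridge, every
 * bridge on the path as well: conventional PCI devices behind such a bridge
 * all appear with the bridge's source-id, so the bridge needs a context
 * entry for this domain too.
 */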
1678static int
1679domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1680                        int translation)
1681{
1682        int ret;
1683        struct pci_dev *tmp, *parent;
1684
1685        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1686                                         pdev->bus->number, pdev->devfn,
1687                                         translation);
1688        if (ret)
1689                return ret;
1690
1691        /* dependent device mapping */
1692        tmp = pci_find_upstream_pcie_bridge(pdev);
1693        if (!tmp)
1694                return 0;
1695        /* Secondary interface's bus number and devfn 0 */
1696        parent = pdev->bus->self;
1697        while (parent != tmp) {
1698                ret = domain_context_mapping_one(domain,
1699                                                 pci_domain_nr(parent->bus),
1700                                                 parent->bus->number,
1701                                                 parent->devfn, translation);
1702                if (ret)
1703                        return ret;
1704                parent = parent->bus->self;
1705        }
1706        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1707                return domain_context_mapping_one(domain,
1708                                        pci_domain_nr(tmp->subordinate),
1709                                        tmp->subordinate->number, 0,
1710                                        translation);
1711        else /* this is a legacy PCI bridge */
1712                return domain_context_mapping_one(domain,
1713                                                  pci_domain_nr(tmp->bus),
1714                                                  tmp->bus->number,
1715                                                  tmp->devfn,
1716                                                  translation);
1717}
1718
1719static int domain_context_mapped(struct pci_dev *pdev)
1720{
1721        int ret;
1722        struct pci_dev *tmp, *parent;
1723        struct intel_iommu *iommu;
1724
1725        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1726                                pdev->devfn);
1727        if (!iommu)
1728                return -ENODEV;
1729
1730        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1731        if (!ret)
1732                return ret;
1733        /* dependent device mapping */
1734        tmp = pci_find_upstream_pcie_bridge(pdev);
1735        if (!tmp)
1736                return ret;
1737        /* Secondary interface's bus number and devfn 0 */
1738        parent = pdev->bus->self;
1739        while (parent != tmp) {
1740                ret = device_context_mapped(iommu, parent->bus->number,
1741                                            parent->devfn);
1742                if (!ret)
1743                        return ret;
1744                parent = parent->bus->self;
1745        }
1746        if (pci_is_pcie(tmp))
1747                return device_context_mapped(iommu, tmp->subordinate->number,
1748                                             0);
1749        else
1750                return device_context_mapped(iommu, tmp->bus->number,
1751                                             tmp->devfn);
1752}
1753
1754/* Returns a number of VTD pages, but aligned to MM page size */
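/*
 * Illustrative example (4KiB pages): host_addr = 0x1234, size = 0x2000 gives
 * PAGE_ALIGN(0x234 + 0x2000) >> VTD_PAGE_SHIFT = 3, i.e. an 8KiB buffer that
 * starts mid-page straddles three VT-d pages.
 */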
1755static inline unsigned long aligned_nrpages(unsigned long host_addr,
1756                                            size_t size)
1757{
1758        host_addr &= ~PAGE_MASK;
1759        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1760}
1761
1762/* Return largest possible superpage level for a given mapping */
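/*
 * Illustrative example: with 2MiB superpages supported (iommu_superpage >= 1),
 * iov_pfn and phy_pfn both 512-pfn aligned and pages >= 512, the loop below
 * returns level 2; any set low-order bit in either pfn forces level 1.
 */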
1763static inline int hardware_largepage_caps(struct dmar_domain *domain,
1764                                          unsigned long iov_pfn,
1765                                          unsigned long phy_pfn,
1766                                          unsigned long pages)
1767{
1768        int support, level = 1;
1769        unsigned long pfnmerge;
1770
1771        support = domain->iommu_superpage;
1772
1773        /* To use a large page, the virtual *and* physical addresses
1774           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1775           of them will mean we have to use smaller pages. So just
1776           merge them and check both at once. */
1777        pfnmerge = iov_pfn | phy_pfn;
1778
1779        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1780                pages >>= VTD_STRIDE_SHIFT;
1781                if (!pages)
1782                        break;
1783                pfnmerge >>= VTD_STRIDE_SHIFT;
1784                level++;
1785                support--;
1786        }
1787        return level;
1788}
1789
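/*
 * Core mapping routine: fill in PTEs for nr_pages VT-d pages starting at
 * iov_pfn, taking the physical addresses either from the scatterlist 'sg'
 * (when non-NULL) or from the contiguous range starting at phys_pfn.
 * Superpages are used where hardware_largepage_caps() says alignment and
 * length allow it, and the CPU cache is flushed over each page-table page
 * that gets filled.
 */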
1790static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1791                            struct scatterlist *sg, unsigned long phys_pfn,
1792                            unsigned long nr_pages, int prot)
1793{
1794        struct dma_pte *first_pte = NULL, *pte = NULL;
1795        phys_addr_t uninitialized_var(pteval);
1796        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1797        unsigned long sg_res;
1798        unsigned int largepage_lvl = 0;
1799        unsigned long lvl_pages = 0;
1800
1801        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1802
1803        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1804                return -EINVAL;
1805
1806        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1807
1808        if (sg)
1809                sg_res = 0;
1810        else {
1811                sg_res = nr_pages + 1;
1812                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1813        }
1814
1815        while (nr_pages > 0) {
1816                uint64_t tmp;
1817
1818                if (!sg_res) {
1819                        sg_res = aligned_nrpages(sg->offset, sg->length);
1820                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1821                        sg->dma_length = sg->length;
1822                        pteval = page_to_phys(sg_page(sg)) | prot;
1823                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1824                }
1825
1826                if (!pte) {
1827                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1828
1829                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1830                        if (!pte)
1831                                return -ENOMEM;
1832                        /* It is a large page */
1833                        if (largepage_lvl > 1) {
1834                                pteval |= DMA_PTE_LARGE_PAGE;
1835                                /* Ensure that old small page tables are removed to make room
1836                                   for superpage, if they exist. */
1837                                dma_pte_clear_range(domain, iov_pfn,
1838                                                    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1839                                dma_pte_free_pagetable(domain, iov_pfn,
1840                                                       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1841                        } else {
1842                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1843                        }
1844
1845                }
1846                /* We don't need a lock here; nobody else
1847                 * touches the iova range
1848                 */
1849                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1850                if (tmp) {
1851                        static int dumps = 5;
1852                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1853                               iov_pfn, tmp, (unsigned long long)pteval);
1854                        if (dumps) {
1855                                dumps--;
1856                                debug_dma_dump_mappings(NULL);
1857                        }
1858                        WARN_ON(1);
1859                }
1860
1861                lvl_pages = lvl_to_nr_pages(largepage_lvl);
1862
1863                BUG_ON(nr_pages < lvl_pages);
1864                BUG_ON(sg_res < lvl_pages);
1865
1866                nr_pages -= lvl_pages;
1867                iov_pfn += lvl_pages;
1868                phys_pfn += lvl_pages;
1869                pteval += lvl_pages * VTD_PAGE_SIZE;
1870                sg_res -= lvl_pages;
1871
1872                /* If the next PTE would be the first in a new page, then we
1873                   need to flush the cache on the entries we've just written.
1874                   And then we'll need to recalculate 'pte', so clear it and
1875                   let it get set again in the if (!pte) block above.
1876
1877                   If we're done (!nr_pages) we need to flush the cache too.
1878
1879                   Also if we've been setting superpages, we may need to
1880                   recalculate 'pte' and switch back to smaller pages for the
1881                   end of the mapping, if the trailing size is not enough to
1882                   use another superpage (i.e. sg_res < lvl_pages). */
1883                pte++;
1884                if (!nr_pages || first_pte_in_page(pte) ||
1885                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1886                        domain_flush_cache(domain, first_pte,
1887                                           (void *)pte - (void *)first_pte);
1888                        pte = NULL;
1889                }
1890
1891                if (!sg_res && nr_pages)
1892                        sg = sg_next(sg);
1893        }
1894        return 0;
1895}
1896
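/*
 * Thin wrappers around __domain_mapping().  Illustrative example: mapping one
 * page read/write at IOVA pfn 0x1000 backed by physical pfn 0x2000 would be
 *
 *      domain_pfn_mapping(domain, 0x1000, 0x2000, 1,
 *                         DMA_PTE_READ | DMA_PTE_WRITE);
 */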
1897static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1898                                    struct scatterlist *sg, unsigned long nr_pages,
1899                                    int prot)
1900{
1901        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1902}
1903
1904static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1905                                     unsigned long phys_pfn, unsigned long nr_pages,
1906                                     int prot)
1907{
1908        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1909}
1910
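/*
 * Tear down the context entry for (bus, devfn) and invalidate the context and
 * IOTLB caches globally so the hardware stops using it.
 */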
1911static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1912{
1913        if (!iommu)
1914                return;
1915
1916        clear_context_table(iommu, bus, devfn);
1917        iommu->flush.flush_context(iommu, 0, 0, 0,
1918                                           DMA_CCMD_GLOBAL_INVL);
1919        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1920}
1921
1922static inline void unlink_domain_info(struct device_domain_info *info)
1923{
1924        assert_spin_locked(&device_domain_lock);
1925        list_del(&info->link);
1926        list_del(&info->global);
1927        if (info->dev)
1928                info->dev->dev.archdata.iommu = NULL;
1929}
1930
1931static void domain_remove_dev_info(struct dmar_domain *domain)
1932{
1933        struct device_domain_info *info;
1934        unsigned long flags;
1935        struct intel_iommu *iommu;
1936
1937        spin_lock_irqsave(&device_domain_lock, flags);
1938        while (!list_empty(&domain->devices)) {
1939                info = list_entry(domain->devices.next,
1940                        struct device_domain_info, link);
1941                unlink_domain_info(info);
1942                spin_unlock_irqrestore(&device_domain_lock, flags);
1943
1944                iommu_disable_dev_iotlb(info);
1945                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1946                iommu_detach_dev(iommu, info->bus, info->devfn);
1947                free_devinfo_mem(info);
1948
1949                spin_lock_irqsave(&device_domain_lock, flags);
1950        }
1951        spin_unlock_irqrestore(&device_domain_lock, flags);
1952}
1953
1954/*
1955 * find_domain
1956 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1957 */
1958static struct dmar_domain *
1959find_domain(struct pci_dev *pdev)
1960{
1961        struct device_domain_info *info;
1962
1963        /* No lock here, assumes no domain exit in normal case */
1964        info = pdev->dev.archdata.iommu;
1965        if (info)
1966                return info->domain;
1967        return NULL;
1968}
1969
1970/* domain is initialized */
1971static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1972{
1973        struct dmar_domain *domain, *found = NULL;
1974        struct intel_iommu *iommu;
1975        struct dmar_drhd_unit *drhd;
1976        struct device_domain_info *info, *tmp;
1977        struct pci_dev *dev_tmp;
1978        unsigned long flags;
1979        int bus = 0, devfn = 0;
1980        int segment;
1981        int ret;
1982
1983        domain = find_domain(pdev);
1984        if (domain)
1985                return domain;
1986
1987        segment = pci_domain_nr(pdev->bus);
1988
1989        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1990        if (dev_tmp) {
1991                if (pci_is_pcie(dev_tmp)) {
1992                        bus = dev_tmp->subordinate->number;
1993                        devfn = 0;
1994                } else {
1995                        bus = dev_tmp->bus->number;
1996                        devfn = dev_tmp->devfn;
1997                }
1998                spin_lock_irqsave(&device_domain_lock, flags);
1999                list_for_each_entry(info, &device_domain_list, global) {
2000                        if (info->segment == segment &&
2001                            info->bus == bus && info->devfn == devfn) {
2002                                found = info->domain;
2003                                break;
2004                        }
2005                }
2006                spin_unlock_irqrestore(&device_domain_lock, flags);
2007                /* pcie-pci bridge already has a domain, use it */
2008                if (found) {
2009                        domain = found;
2010                        goto found_domain;
2011                }
2012        }
2013
2014        domain = alloc_domain();
2015        if (!domain)
2016                goto error;
2017
2018        /* Allocate new domain for the device */
2019        drhd = dmar_find_matched_drhd_unit(pdev);
2020        if (!drhd) {
2021                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2022                        pci_name(pdev));
2023                free_domain_mem(domain);
2024                return NULL;
2025        }
2026        iommu = drhd->iommu;
2027
2028        ret = iommu_attach_domain(domain, iommu);
2029        if (ret) {
2030                free_domain_mem(domain);
2031                goto error;
2032        }
2033
2034        if (domain_init(domain, gaw)) {
2035                domain_exit(domain);
2036                goto error;
2037        }
2038
2039        /* register pcie-to-pci device */
2040        if (dev_tmp) {
2041                info = alloc_devinfo_mem();
2042                if (!info) {
2043                        domain_exit(domain);
2044                        goto error;
2045                }
2046                info->segment = segment;
2047                info->bus = bus;
2048                info->devfn = devfn;
2049                info->dev = NULL;
2050                info->domain = domain;
2051                /* This domain is shared by devices under p2p bridge */
2052                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2053
2054                /* pcie-to-pci bridge already has a domain, use it */
2055                found = NULL;
2056                spin_lock_irqsave(&device_domain_lock, flags);
2057                list_for_each_entry(tmp, &device_domain_list, global) {
2058                        if (tmp->segment == segment &&
2059                            tmp->bus == bus && tmp->devfn == devfn) {
2060                                found = tmp->domain;
2061                                break;
2062                        }
2063                }
2064                if (found) {
2065                        spin_unlock_irqrestore(&device_domain_lock, flags);
2066                        free_devinfo_mem(info);
2067                        domain_exit(domain);
2068                        domain = found;
2069                } else {
2070                        list_add(&info->link, &domain->devices);
2071                        list_add(&info->global, &device_domain_list);
2072                        spin_unlock_irqrestore(&device_domain_lock, flags);
2073                }
2074        }
2075
2076found_domain:
2077        info = alloc_devinfo_mem();
2078        if (!info)
2079                goto error;
2080        info->segment = segment;
2081        info->bus = pdev->bus->number;
2082        info->devfn = pdev->devfn;
2083        info->dev = pdev;
2084        info->domain = domain;
2085        spin_lock_irqsave(&device_domain_lock, flags);
2086        /* somebody else raced us and attached a domain first */
2087        found = find_domain(pdev);
2088        if (found != NULL) {
2089                spin_unlock_irqrestore(&device_domain_lock, flags);
2090                if (found != domain) {
2091                        domain_exit(domain);
2092                        domain = found;
2093                }
2094                free_devinfo_mem(info);
2095                return domain;
2096        }
2097        list_add(&info->link, &domain->devices);
2098        list_add(&info->global, &device_domain_list);
2099        pdev->dev.archdata.iommu = info;
2100        spin_unlock_irqrestore(&device_domain_lock, flags);
2101        return domain;
2102error:
2103        /* recheck it here, maybe others set it */
2104        return find_domain(pdev);
2105}
2106
2107static int iommu_identity_mapping;
2108#define IDENTMAP_ALL            1
2109#define IDENTMAP_GFX            2
2110#define IDENTMAP_AZALIA         4
2111
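/*
 * Reserve the iova range covering [start, end] in the domain and install a
 * 1:1 (virtual pfn == physical pfn) mapping for it; used for RMRRs and for
 * the static identity (si) domain.
 */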
2112static int iommu_domain_identity_map(struct dmar_domain *domain,
2113                                     unsigned long long start,
2114                                     unsigned long long end)
2115{
2116        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2117        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2118
2119        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2120                          dma_to_mm_pfn(last_vpfn))) {
2121                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2122                return -ENOMEM;
2123        }
2124
2125        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2126                 start, end, domain->id);
2127        /*
2128         * RMRR range might have overlap with physical memory range,
2129         * clear it first
2130         */
2131        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2132
2133        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2134                                  last_vpfn - first_vpfn + 1,
2135                                  DMA_PTE_READ|DMA_PTE_WRITE);
2136}
2137
2138static int iommu_prepare_identity_map(struct pci_dev *pdev,
2139                                      unsigned long long start,
2140                                      unsigned long long end)
2141{
2142        struct dmar_domain *domain;
2143        int ret;
2144
2145        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2146        if (!domain)
2147                return -ENOMEM;
2148
2149        /* For _hardware_ passthrough, don't bother. But for software
2150           passthrough, we do it anyway -- it may indicate a memory
2151           range which is reserved in E820 and so didn't get set
2152           up in si_domain to start with. */
2153        if (domain == si_domain && hw_pass_through) {
2154                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2155                       pci_name(pdev), start, end);
2156                return 0;
2157        }
2158
2159        printk(KERN_INFO
2160               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2161               pci_name(pdev), start, end);
2162
2163        if (end < start) {
2164                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2165                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2166                        dmi_get_system_info(DMI_BIOS_VENDOR),
2167                        dmi_get_system_info(DMI_BIOS_VERSION),
2168                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2169                ret = -EIO;
2170                goto error;
2171        }
2172
2173        if (end >> agaw_to_width(domain->agaw)) {
2174                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2175                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2176                     agaw_to_width(domain->agaw),
2177                     dmi_get_system_info(DMI_BIOS_VENDOR),
2178                     dmi_get_system_info(DMI_BIOS_VERSION),
2179                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2180                ret = -EIO;
2181                goto error;
2182        }
2183
2184        ret = iommu_domain_identity_map(domain, start, end);
2185        if (ret)
2186                goto error;
2187
2188        /* context entry init */
2189        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2190        if (ret)
2191                goto error;
2192
2193        return 0;
2194
2195 error:
2196        domain_exit(domain);
2197        return ret;
2198}
2199
2200static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2201        struct pci_dev *pdev)
2202{
2203        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2204                return 0;
2205        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2206                rmrr->end_address);
2207}
2208
2209#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2210static inline void iommu_prepare_isa(void)
2211{
2212        struct pci_dev *pdev;
2213        int ret;
2214
2215        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2216        if (!pdev)
2217                return;
2218
2219        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2220        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2221
2222        if (ret)
2223                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2224                       "floppy might not work\n");
2225
2226}
2227#else
2228static inline void iommu_prepare_isa(void)
2229{
2230        return;
2231}
2232#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2233
2234static int md_domain_init(struct dmar_domain *domain, int guest_width);
2235
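/*
 * Build the static identity (si) domain: attach it to every active IOMMU and,
 * unless hardware pass-through is in use, install 1:1 mappings for every
 * usable RAM range known to memblock.
 */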
2236static int __init si_domain_init(int hw)
2237{
2238        struct dmar_drhd_unit *drhd;
2239        struct intel_iommu *iommu;
2240        int nid, ret = 0;
2241
2242        si_domain = alloc_domain();
2243        if (!si_domain)
2244                return -EFAULT;
2245
2246        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2247
2248        for_each_active_iommu(iommu, drhd) {
2249                ret = iommu_attach_domain(si_domain, iommu);
2250                if (ret) {
2251                        domain_exit(si_domain);
2252                        return -EFAULT;
2253                }
2254        }
2255
2256        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2257                domain_exit(si_domain);
2258                return -EFAULT;
2259        }
2260
2261        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2262
2263        if (hw)
2264                return 0;
2265
2266        for_each_online_node(nid) {
2267                unsigned long start_pfn, end_pfn;
2268                int i;
2269
2270                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2271                        ret = iommu_domain_identity_map(si_domain,
2272                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2273                        if (ret)
2274                                return ret;
2275                }
2276        }
2277
2278        return 0;
2279}
2280
2281static void domain_remove_one_dev_info(struct dmar_domain *domain,
2282                                          struct pci_dev *pdev);
2283static int identity_mapping(struct pci_dev *pdev)
2284{
2285        struct device_domain_info *info;
2286
2287        if (likely(!iommu_identity_mapping))
2288                return 0;
2289
2290        info = pdev->dev.archdata.iommu;
2291        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2292                return (info->domain == si_domain);
2293
2294        return 0;
2295}
2296
2297static int domain_add_dev_info(struct dmar_domain *domain,
2298                               struct pci_dev *pdev,
2299                               int translation)
2300{
2301        struct device_domain_info *info;
2302        unsigned long flags;
2303        int ret;
2304
2305        info = alloc_devinfo_mem();
2306        if (!info)
2307                return -ENOMEM;
2308
2309        info->segment = pci_domain_nr(pdev->bus);
2310        info->bus = pdev->bus->number;
2311        info->devfn = pdev->devfn;
2312        info->dev = pdev;
2313        info->domain = domain;
2314
2315        spin_lock_irqsave(&device_domain_lock, flags);
2316        list_add(&info->link, &domain->devices);
2317        list_add(&info->global, &device_domain_list);
2318        pdev->dev.archdata.iommu = info;
2319        spin_unlock_irqrestore(&device_domain_lock, flags);
2320
2321        ret = domain_context_mapping(domain, pdev, translation);
2322        if (ret) {
2323                spin_lock_irqsave(&device_domain_lock, flags);
2324                unlink_domain_info(info);
2325                spin_unlock_irqrestore(&device_domain_lock, flags);
2326                free_devinfo_mem(info);
2327                return ret;
2328        }
2329
2330        return 0;
2331}
2332
2333static bool device_has_rmrr(struct pci_dev *dev)
2334{
2335        struct dmar_rmrr_unit *rmrr;
2336        int i;
2337
2338        for_each_rmrr_units(rmrr) {
2339                for (i = 0; i < rmrr->devices_cnt; i++) {
2340                        /*
2341                         * Return TRUE if this RMRR contains the device that
2342                         * is passed in.
2343                         */
2344                        if (rmrr->devices[i] == dev)
2345                                return true;
2346                }
2347        }
2348        return false;
2349}
2350
2351static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2352{
2353
2354        /*
2355         * We want to prevent any device associated with an RMRR from
2356         * getting placed into the SI Domain. This is done because
2357         * problems exist when devices are moved in and out of domains
2358         * and their respective RMRR info is lost. We exempt USB devices
2359         * from this process due to their usage of RMRRs that are known
2360         * to not be needed after BIOS hand-off to OS.
2361         */
2362        if (device_has_rmrr(pdev) &&
2363            (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2364                return 0;
2365
2366        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2367                return 1;
2368
2369        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2370                return 1;
2371
2372        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2373                return 0;
2374
2375        /*
2376         * We want to start off with all devices in the 1:1 domain, and
2377         * take them out later if we find they can't access all of memory.
2378         *
2379         * However, we can't do this for PCI devices behind bridges,
2380         * because all PCI devices behind the same bridge will end up
2381         * with the same source-id on their transactions.
2382         *
2383         * Practically speaking, we can't change things around for these
2384         * devices at run-time, because we can't be sure there'll be no
2385         * DMA transactions in flight for any of their siblings.
2386         * 
2387         * So PCI devices (unless they're on the root bus) as well as
2388         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2389         * the 1:1 domain, just in _case_ one of their siblings turns out
2390         * not to be able to map all of memory.
2391         */
2392        if (!pci_is_pcie(pdev)) {
2393                if (!pci_is_root_bus(pdev->bus))
2394                        return 0;
2395                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2396                        return 0;
2397        } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2398                return 0;
2399
2400        /* 
2401         * At boot time, we don't yet know if devices will be 64-bit capable.
2402         * Assume that they will -- if they turn out not to be, then we can 
2403         * take them out of the 1:1 domain later.
2404         */
2405        if (!startup) {
2406                /*
2407                 * If the device's dma_mask is less than the system's memory
2408                 * size then this is not a candidate for identity mapping.
2409                 */
2410                u64 dma_mask = pdev->dma_mask;
2411
2412                if (pdev->dev.coherent_dma_mask &&
2413                    pdev->dev.coherent_dma_mask < dma_mask)
2414                        dma_mask = pdev->dev.coherent_dma_mask;
2415
2416                return dma_mask >= dma_get_required_mask(&pdev->dev);
2417        }
2418
2419        return 1;
2420}
2421
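/*
 * Walk all PCI devices and add those that qualify (see
 * iommu_should_identity_map()) to the static identity domain, using
 * pass-through context entries when the hardware supports them.
 */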
2422static int __init iommu_prepare_static_identity_mapping(int hw)
2423{
2424        struct pci_dev *pdev = NULL;
2425        int ret;
2426
2427        ret = si_domain_init(hw);
2428        if (ret)
2429                return -EFAULT;
2430
2431        for_each_pci_dev(pdev) {
2432                if (iommu_should_identity_map(pdev, 1)) {
2433                        ret = domain_add_dev_info(si_domain, pdev,
2434                                             hw ? CONTEXT_TT_PASS_THROUGH :
2435                                                  CONTEXT_TT_MULTI_LEVEL);
2436                        if (ret) {
2437                                /* device not associated with an iommu */
2438                                if (ret == -ENODEV)
2439                                        continue;
2440                                return ret;
2441                        }
2442                        pr_info("IOMMU: %s identity mapping for device %s\n",
2443                                hw ? "hardware" : "software", pci_name(pdev));
2444                }
2445        }
2446
2447        return 0;
2448}
2449
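/*
 * One-time boot initialization: count the IOMMUs, allocate the global iommu
 * array and deferred-flush tables, set up root/context tables, choose queued
 * vs. register-based invalidation, create the identity/RMRR/ISA mappings and
 * finally enable translation on each unit.
 */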
2450static int __init init_dmars(void)
2451{
2452        struct dmar_drhd_unit *drhd;
2453        struct dmar_rmrr_unit *rmrr;
2454        struct pci_dev *pdev;
2455        struct intel_iommu *iommu;
2456        int i, ret;
2457
2458        /*
2459         * for each drhd
2460         *    allocate root
2461         *    initialize and program root entry to not present
2462         * endfor
2463         */
2464        for_each_drhd_unit(drhd) {
2465                /*
2466                 * lock not needed as this is only incremented in the
2467                 * single-threaded kernel __init code path; all other
2468                 * accesses are read-only
2469                 */
2470                if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2471                        g_num_of_iommus++;
2472                        continue;
2473                }
2474                printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2475                          IOMMU_UNITS_SUPPORTED);
2476        }
2477
2478        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2479                        GFP_KERNEL);
2480        if (!g_iommus) {
2481                printk(KERN_ERR "Allocating global iommu array failed\n");
2482                ret = -ENOMEM;
2483                goto error;
2484        }
2485
2486        deferred_flush = kzalloc(g_num_of_iommus *
2487                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2488        if (!deferred_flush) {
2489                ret = -ENOMEM;
2490                goto error;
2491        }
2492
2493        for_each_drhd_unit(drhd) {
2494                if (drhd->ignored)
2495                        continue;
2496
2497                iommu = drhd->iommu;
2498                g_iommus[iommu->seq_id] = iommu;
2499
2500                ret = iommu_init_domains(iommu);
2501                if (ret)
2502                        goto error;
2503
2504                /*
2505                 * TBD:
2506                 * we could share the same root & context tables
2507                 * among all IOMMU's. Need to Split it later.
2508                 */
2509                ret = iommu_alloc_root_entry(iommu);
2510                if (ret) {
2511                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2512                        goto error;
2513                }
2514                if (!ecap_pass_through(iommu->ecap))
2515                        hw_pass_through = 0;
2516        }
2517
2518        /*
2519         * Start from a sane iommu hardware state.
2520         */
2521        for_each_drhd_unit(drhd) {
2522                if (drhd->ignored)
2523                        continue;
2524
2525                iommu = drhd->iommu;
2526
2527                /*
2528                 * If the queued invalidation is already initialized by us
2529                 * (for example, while enabling interrupt-remapping) then
2530                 * things are already rolling from a sane state.
2531                 */
2532                if (iommu->qi)
2533                        continue;
2534
2535                /*
2536                 * Clear any previous faults.
2537                 */
2538                dmar_fault(-1, iommu);
2539                /*
2540                 * Disable queued invalidation if supported and already enabled
2541                 * before OS handover.
2542                 */
2543                dmar_disable_qi(iommu);
2544        }
2545
2546        for_each_drhd_unit(drhd) {
2547                if (drhd->ignored)
2548                        continue;
2549
2550                iommu = drhd->iommu;
2551
2552                if (dmar_enable_qi(iommu)) {
2553                        /*
2554                         * Queued Invalidate not enabled, use Register Based
2555                         * Invalidate
2556                         */
2557                        iommu->flush.flush_context = __iommu_flush_context;
2558                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2559                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2560                               "invalidation\n",
2561                                iommu->seq_id,
2562                               (unsigned long long)drhd->reg_base_addr);
2563                } else {
2564                        iommu->flush.flush_context = qi_flush_context;
2565                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2566                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2567                               "invalidation\n",
2568                                iommu->seq_id,
2569                               (unsigned long long)drhd->reg_base_addr);
2570                }
2571        }
2572
2573        if (iommu_pass_through)
2574                iommu_identity_mapping |= IDENTMAP_ALL;
2575
2576#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2577        iommu_identity_mapping |= IDENTMAP_GFX;
2578#endif
2579
2580        check_tylersburg_isoch();
2581
2582        /*
2583         * If pass through is not set or not enabled, set up context entries
2584         * for identity mappings for rmrr, gfx, and isa, and possibly fall back
2585         * to static identity mapping if iommu_identity_mapping is set.
2586         */
2587        if (iommu_identity_mapping) {
2588                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2589                if (ret) {
2590                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2591                        goto error;
2592                }
2593        }
2594        /*
2595         * For each rmrr
2596         *   for each dev attached to rmrr
2597         *   do
2598         *     locate drhd for dev, alloc domain for dev
2599         *     allocate free domain
2600         *     allocate page table entries for rmrr
2601         *     if context not allocated for bus
2602         *           allocate and init context
2603         *           set present in root table for this bus
2604         *     init context with domain, translation etc
2605         *    endfor
2606         * endfor
2607         */
2608        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2609        for_each_rmrr_units(rmrr) {
2610                for (i = 0; i < rmrr->devices_cnt; i++) {
2611                        pdev = rmrr->devices[i];
2612                        /*
2613                         * some BIOSes list non-existent devices in the
2614                         * DMAR table.
2615                         */
2616                        if (!pdev)
2617                                continue;
2618                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2619                        if (ret)
2620                                printk(KERN_ERR
2621                                       "IOMMU: mapping reserved region failed\n");
2622                }
2623        }
2624
2625        iommu_prepare_isa();
2626
2627        /*
2628         * for each drhd
2629         *   enable fault log
2630         *   global invalidate context cache
2631         *   global invalidate iotlb
2632         *   enable translation
2633         */
2634        for_each_drhd_unit(drhd) {
2635                if (drhd->ignored) {
2636                        /*
2637                         * we always have to disable PMRs or DMA may fail on
2638                         * this device
2639                         */
2640                        if (force_on)
2641                                iommu_disable_protect_mem_regions(drhd->iommu);
2642                        continue;
2643                }
2644                iommu = drhd->iommu;
2645
2646                iommu_flush_write_buffer(iommu);
2647
2648                ret = dmar_set_interrupt(iommu);
2649                if (ret)
2650                        goto error;
2651
2652                iommu_set_root_entry(iommu);
2653
2654                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2655                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2656
2657                ret = iommu_enable_translation(iommu);
2658                if (ret)
2659                        goto error;
2660
2661                iommu_disable_protect_mem_regions(iommu);
2662        }
2663
2664        return 0;
2665error:
2666        for_each_drhd_unit(drhd) {
2667                if (drhd->ignored)
2668                        continue;
2669                iommu = drhd->iommu;
2670                free_iommu(iommu);
2671        }
2672        kfree(g_iommus);
2673        return ret;
2674}
2675
2676/* This takes a number of _MM_ pages, not VTD pages */
2677static struct iova *intel_alloc_iova(struct device *dev,
2678                                     struct dmar_domain *domain,
2679                                     unsigned long nrpages, uint64_t dma_mask)
2680{
2681        struct pci_dev *pdev = to_pci_dev(dev);
2682        struct iova *iova = NULL;
2683
2684        /* Restrict dma_mask to the width that the iommu can handle */
2685        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2686
2687        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2688                /*
2689                 * First try to allocate an io virtual address in
2690                 * DMA_BIT_MASK(32) and if that fails then try allocating
2691                 * from higher range
2692                 */
2693                iova = alloc_iova(&domain->iovad, nrpages,
2694                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2695                if (iova)
2696                        return iova;
2697        }
2698        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2699        if (unlikely(!iova)) {
2700                printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2701                       nrpages, pci_name(pdev));
2702                return NULL;
2703        }
2704
2705        return iova;
2706}
2707
2708static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2709{
2710        struct dmar_domain *domain;
2711        int ret;
2712
2713        domain = get_domain_for_dev(pdev,
2714                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2715        if (!domain) {
2716                printk(KERN_ERR
2717                        "Allocating domain for %s failed", pci_name(pdev));
2718                return NULL;
2719        }
2720
2721        /* make sure context mapping is ok */
2722        if (unlikely(!domain_context_mapped(pdev))) {
2723                ret = domain_context_mapping(domain, pdev,
2724                                             CONTEXT_TT_MULTI_LEVEL);
2725                if (ret) {
2726                        printk(KERN_ERR
2727                                "Domain context map for %s failed",
2728                                pci_name(pdev));
2729                        return NULL;
2730                }
2731        }
2732
2733        return domain;
2734}
2735
2736static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2737{
2738        struct device_domain_info *info;
2739
2740        /* No lock here, assumes no domain exit in normal case */
2741        info = dev->dev.archdata.iommu;
2742        if (likely(info))
2743                return info->domain;
2744
2745        return __get_valid_domain_for_dev(dev);
2746}
2747
2748static int iommu_dummy(struct pci_dev *pdev)
2749{
2750        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2751}
2752
2753/* Check if the pdev needs to go through non-identity map and unmap process.*/
2754static int iommu_no_mapping(struct device *dev)
2755{
2756        struct pci_dev *pdev;
2757        int found;
2758
2759        if (unlikely(dev->bus != &pci_bus_type))
2760                return 1;
2761
2762        pdev = to_pci_dev(dev);
2763        if (iommu_dummy(pdev))
2764                return 1;
2765
2766        if (!iommu_identity_mapping)
2767                return 0;
2768
2769        found = identity_mapping(pdev);
2770        if (found) {
2771                if (iommu_should_identity_map(pdev, 0))
2772                        return 1;
2773                else {
2774                        /*
2775                         * The 32 bit DMA device is removed from si_domain
2776                         * and falls back to non-identity mapping.
2777                         */
2778                        domain_remove_one_dev_info(si_domain, pdev);
2779                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2780                               pci_name(pdev));
2781                        return 0;
2782                }
2783        } else {
2784                /*
2785                         * If a 64 bit DMA device is detached from a VM, the
2786                         * device is put into si_domain for identity mapping.
2787                 */
2788                if (iommu_should_identity_map(pdev, 0)) {
2789                        int ret;
2790                        ret = domain_add_dev_info(si_domain, pdev,
2791                                                  hw_pass_through ?
2792                                                  CONTEXT_TT_PASS_THROUGH :
2793                                                  CONTEXT_TT_MULTI_LEVEL);
2794                        if (!ret) {
2795                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2796                                       pci_name(pdev));
2797                                return 1;
2798                        }
2799                }
2800        }
2801
2802        return 0;
2803}
2804
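/*
 * DMA-API backend for a single buffer: find (or create) the device's domain,
 * allocate an IOVA range big enough for the buffer, map it with the
 * protection implied by 'dir', flush as required and return the bus address,
 * or 0 on failure.  Identity-mapped devices simply get paddr back.
 */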
2805static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2806                                     size_t size, int dir, u64 dma_mask)
2807{
2808        struct pci_dev *pdev = to_pci_dev(hwdev);
2809        struct dmar_domain *domain;
2810        phys_addr_t start_paddr;
2811        struct iova *iova;
2812        int prot = 0;
2813        int ret;
2814        struct intel_iommu *iommu;
2815        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2816
2817        BUG_ON(dir == DMA_NONE);
2818
2819        if (iommu_no_mapping(hwdev))
2820                return paddr;
2821
2822        domain = get_valid_domain_for_dev(pdev);
2823        if (!domain)
2824                return 0;
2825
2826        iommu = domain_get_iommu(domain);
2827        size = aligned_nrpages(paddr, size);
2828
2829        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2830        if (!iova)
2831                goto error;
2832
2833        /*
2834         * Check if DMAR supports zero-length reads on write only
2835         * mappings.
2836         */
2837        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2838                        !cap_zlr(iommu->cap))
2839                prot |= DMA_PTE_READ;
2840        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2841                prot |= DMA_PTE_WRITE;
2842        /*
2843         * paddr ~ (paddr + size) might span only part of a page; we should map
2844         * the whole page.  Note: if two parts of one page are mapped separately,
2845         * we might have two guest addresses mapping to the same host paddr, but
2846         * this is not a big problem
2847         */
2848        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2849                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2850        if (ret)
2851                goto error;
2852
2853        /* it's a non-present to present mapping. Only flush if caching mode */
2854        if (cap_caching_mode(iommu->cap))
2855                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2856        else
2857                iommu_flush_write_buffer(iommu);
2858
2859        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2860        start_paddr += paddr & ~PAGE_MASK;
2861        return start_paddr;
2862
2863error:
2864        if (iova)
2865                __free_iova(&domain->iovad, iova);
2866        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2867                pci_name(pdev), size, (unsigned long long)paddr, dir);
2868        return 0;
2869}
2870
2871static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2872                                 unsigned long offset, size_t size,
2873                                 enum dma_data_direction dir,
2874                                 struct dma_attrs *attrs)
2875{
2876        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2877                                  dir, to_pci_dev(dev)->dma_mask);
2878}
2879
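/*
 * Drain the per-IOMMU deferred-unmap queues: flush the IOTLB globally once
 * per IOMMU (per entry in caching mode, where global flushes are costly to
 * emulate) and only then hand the queued IOVAs back to the allocator.
 */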
2880static void flush_unmaps(void)
2881{
2882        int i, j;
2883
2884        timer_on = 0;
2885
2886        /* just flush them all */
2887        for (i = 0; i < g_num_of_iommus; i++) {
2888                struct intel_iommu *iommu = g_iommus[i];
2889                if (!iommu)
2890                        continue;
2891
2892                if (!deferred_flush[i].next)
2893                        continue;
2894
2895                /* In caching mode, global flushes make emulation expensive */
2896                if (!cap_caching_mode(iommu->cap))
2897                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2898                                         DMA_TLB_GLOBAL_FLUSH);
2899                for (j = 0; j < deferred_flush[i].next; j++) {
2900                        unsigned long mask;
2901                        struct iova *iova = deferred_flush[i].iova[j];
2902                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2903
2904                        /* On real hardware multiple invalidations are expensive */
2905                        if (cap_caching_mode(iommu->cap))
2906                                iommu_flush_iotlb_psi(iommu, domain->id,
2907                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2908                        else {
2909                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2910                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2911                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2912                        }
2913                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2914                }
2915                deferred_flush[i].next = 0;
2916        }
2917
2918        list_size = 0;
2919}
2920
2921static void flush_unmaps_timeout(unsigned long data)
2922{
2923        unsigned long flags;
2924
2925        spin_lock_irqsave(&async_umap_flush_lock, flags);
2926        flush_unmaps();
2927        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2928}
2929
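/*
 * Queue an IOVA for lazy release; the queue is drained by flush_unmaps()
 * either from the 10ms timer or once HIGH_WATER_MARK entries have piled up.
 */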
2930static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2931{
2932        unsigned long flags;
2933        int next, iommu_id;
2934        struct intel_iommu *iommu;
2935
2936        spin_lock_irqsave(&async_umap_flush_lock, flags);
2937        if (list_size == HIGH_WATER_MARK)
2938                flush_unmaps();
2939
2940        iommu = domain_get_iommu(dom);
2941        iommu_id = iommu->seq_id;
2942
2943        next = deferred_flush[iommu_id].next;
2944        deferred_flush[iommu_id].domain[next] = dom;
2945        deferred_flush[iommu_id].iova[next] = iova;
2946        deferred_flush[iommu_id].next++;
2947
2948        if (!timer_on) {
2949                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2950                timer_on = 1;
2951        }
2952        list_size++;
2953        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2954}
2955
2956static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2957                             size_t size, enum dma_data_direction dir,
2958                             struct dma_attrs *attrs)
2959{
2960        struct pci_dev *pdev = to_pci_dev(dev);
2961        struct dmar_domain *domain;
2962        unsigned long start_pfn, last_pfn;
2963        struct iova *iova;
2964        struct intel_iommu *iommu;
2965
2966        if (iommu_no_mapping(dev))
2967                return;
2968
2969        domain = find_domain(pdev);
2970        BUG_ON(!domain);
2971
2972        iommu = domain_get_iommu(domain);
2973
2974        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2975        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2976                      (unsigned long long)dev_addr))
2977                return;
2978
2979        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2980        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2981
2982        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2983                 pci_name(pdev), start_pfn, last_pfn);
2984
2985        /*  clear the whole page */
2986        dma_pte_clear_range(domain, start_pfn, last_pfn);
2987
2988        /* free page tables */
2989        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2990
2991        if (intel_iommu_strict) {
2992                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2993                                      last_pfn - start_pfn + 1, 0);
2994                /* free iova */
2995                __free_iova(&domain->iovad, iova);
2996        } else {
2997                add_unmap(domain, iova);
2998                /*
2999                 * queue up the release of the unmap to save the 1/6th of the
3000                 * cpu used up by the iotlb flush operation...
3001                 */
3002        }
3003}
3004
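/*
 * dma_alloc_coherent() backend: grab zeroed pages (falling back to GFP_DMA /
 * GFP_DMA32 only for devices that bypass the IOMMU and have a small coherent
 * mask) and map them bidirectionally through __intel_map_single().
 */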
3005static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3006                                  dma_addr_t *dma_handle, gfp_t flags,
3007                                  struct dma_attrs *attrs)
3008{
3009        void *vaddr;
3010        int order;
3011
3012        size = PAGE_ALIGN(size);
3013        order = get_order(size);
3014
3015        if (!iommu_no_mapping(hwdev))
3016                flags &= ~(GFP_DMA | GFP_DMA32);
3017        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3018                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3019                        flags |= GFP_DMA;
3020                else
3021                        flags |= GFP_DMA32;
3022        }
3023
3024        vaddr = (void *)__get_free_pages(flags, order);
3025        if (!vaddr)
3026                return NULL;
3027        memset(vaddr, 0, size);
3028
3029        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3030                                         DMA_BIDIRECTIONAL,
3031                                         hwdev->coherent_dma_mask);
3032        if (*dma_handle)
3033                return vaddr;
3034        free_pages((unsigned long)vaddr, order);
3035        return NULL;
3036}
3037
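    /* dma_map_ops .free callback: undo intel_alloc_coherent(). */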
3038static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3039                                dma_addr_t dma_handle, struct dma_attrs *attrs)
3040{
3041        int order;
3042
3043        size = PAGE_ALIGN(size);
3044        order = get_order(size);
3045
3046        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3047        free_pages((unsigned long)vaddr, order);
3048}
3049
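    /*
     * dma_map_ops .unmap_sg callback.  The scatterlist was mapped into a
     * single contiguous iova range, so it is torn down the same way as a
     * single page mapping (strict or deferred flush).
     */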
3050static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3051                           int nelems, enum dma_data_direction dir,
3052                           struct dma_attrs *attrs)
3053{
3054        struct pci_dev *pdev = to_pci_dev(hwdev);
3055        struct dmar_domain *domain;
3056        unsigned long start_pfn, last_pfn;
3057        struct iova *iova;
3058        struct intel_iommu *iommu;
3059
3060        if (iommu_no_mapping(hwdev))
3061                return;
3062
3063        domain = find_domain(pdev);
3064        BUG_ON(!domain);
3065
3066        iommu = domain_get_iommu(domain);
3067
3068        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3069        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3070                      (unsigned long long)sglist[0].dma_address))
3071                return;
3072
3073        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3074        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3075
3076        /* clear the whole range */
3077        dma_pte_clear_range(domain, start_pfn, last_pfn);
3078
3079        /* free page tables */
3080        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3081
3082        if (intel_iommu_strict) {
3083                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3084                                      last_pfn - start_pfn + 1, 0);
3085                /* free iova */
3086                __free_iova(&domain->iovad, iova);
3087        } else {
3088                add_unmap(domain, iova);
3089                /*
3090                 * Queue up the release of the unmap to save the ~1/6th of
3091                 * the CPU time otherwise spent on the iotlb flush.
3092                 */
3093        }
3094}
3095
3096static int intel_nontranslate_map_sg(struct device *hwdev,
3097        struct scatterlist *sglist, int nelems, int dir)
3098{
3099        int i;
3100        struct scatterlist *sg;
3101
3102        for_each_sg(sglist, sg, nelems, i) {
3103                BUG_ON(!sg_page(sg));
3104                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3105                sg->dma_length = sg->length;
3106        }
3107        return nelems;
3108}
3109
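    /*
     * dma_map_ops .map_sg callback: allocate one iova range covering the
     * whole scatterlist and map every segment into it.  Returns the
     * number of segments mapped, or 0 on failure.
     */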
3110static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3111                        enum dma_data_direction dir, struct dma_attrs *attrs)
3112{
3113        int i;
3114        struct pci_dev *pdev = to_pci_dev(hwdev);
3115        struct dmar_domain *domain;
3116        size_t size = 0;
3117        int prot = 0;
3118        struct iova *iova = NULL;
3119        int ret;
3120        struct scatterlist *sg;
3121        unsigned long start_vpfn;
3122        struct intel_iommu *iommu;
3123
3124        BUG_ON(dir == DMA_NONE);
3125        if (iommu_no_mapping(hwdev))
3126                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3127
3128        domain = get_valid_domain_for_dev(pdev);
3129        if (!domain)
3130                return 0;
3131
3132        iommu = domain_get_iommu(domain);
3133
3134        for_each_sg(sglist, sg, nelems, i)
3135                size += aligned_nrpages(sg->offset, sg->length);
3136
3137        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3138                                pdev->dma_mask);
3139        if (!iova) {
3140                sglist->dma_length = 0;
3141                return 0;
3142        }
3143
3144        /*
3145         * Check if DMAR supports zero-length reads on write-only
3146         * mappings.
3147         */
3148        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3149                        !cap_zlr(iommu->cap))
3150                prot |= DMA_PTE_READ;
3151        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3152                prot |= DMA_PTE_WRITE;
3153
3154        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3155
3156        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3157        if (unlikely(ret)) {
3158                /* clear any partially created mapping */
3159                dma_pte_clear_range(domain, start_vpfn,
3160                                    start_vpfn + size - 1);
3161                /* free page tables */
3162                dma_pte_free_pagetable(domain, start_vpfn,
3163                                       start_vpfn + size - 1);
3164                /* free iova */
3165                __free_iova(&domain->iovad, iova);
3166                return 0;
3167        }
3168
3169        /* it's a non-present to present mapping. Only flush if caching mode */
3170        if (cap_caching_mode(iommu->cap))
3171                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3172        else
3173                iommu_flush_write_buffer(iommu);
3174
3175        return nelems;
3176}
3177
3178static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3179{
3180        return !dma_addr;
3181}
3182
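    /* DMA API operations installed by intel_iommu_init() when translation is used. */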
3183struct dma_map_ops intel_dma_ops = {
3184        .alloc = intel_alloc_coherent,
3185        .free = intel_free_coherent,
3186        .map_sg = intel_map_sg,
3187        .unmap_sg = intel_unmap_sg,
3188        .map_page = intel_map_page,
3189        .unmap_page = intel_unmap_page,
3190        .mapping_error = intel_mapping_error,
3191};
3192
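    /*
     * Slab caches for dmar_domain, device_domain_info and iova objects,
     * created by iommu_init_mempool() below.
     */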
3193static inline int iommu_domain_cache_init(void)
3194{
3195        int ret = 0;
3196
3197        iommu_domain_cache = kmem_cache_create("iommu_domain",
3198                                         sizeof(struct dmar_domain),
3199                                         0,
3200                                         SLAB_HWCACHE_ALIGN,
3202                                         NULL);
3203        if (!iommu_domain_cache) {
3204                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3205                ret = -ENOMEM;
3206        }
3207
3208        return ret;
3209}
3210
3211static inline int iommu_devinfo_cache_init(void)
3212{
3213        int ret = 0;
3214
3215        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3216                                         sizeof(struct device_domain_info),
3217                                         0,
3218                                         SLAB_HWCACHE_ALIGN,
3219                                         NULL);
3220        if (!iommu_devinfo_cache) {
3221                printk(KERN_ERR "Couldn't create devinfo cache\n");
3222                ret = -ENOMEM;
3223        }
3224
3225        return ret;
3226}
3227
3228static inline int iommu_iova_cache_init(void)
3229{
3230        int ret = 0;
3231
3232        iommu_iova_cache = kmem_cache_create("iommu_iova",
3233                                         sizeof(struct iova),
3234                                         0,
3235                                         SLAB_HWCACHE_ALIGN,
3236                                         NULL);
3237        if (!iommu_iova_cache) {
3238                printk(KERN_ERR "Couldn't create iova cache\n");
3239                ret = -ENOMEM;
3240        }
3241
3242        return ret;
3243}
3244
3245static int __init iommu_init_mempool(void)
3246{
3247        int ret;
3248        ret = iommu_iova_cache_init();
3249        if (ret)
3250                return ret;
3251
3252        ret = iommu_domain_cache_init();
3253        if (ret)
3254                goto domain_error;
3255
3256        ret = iommu_devinfo_cache_init();
3257        if (!ret)
3258                return ret;
3259
3260        kmem_cache_destroy(iommu_domain_cache);
3261domain_error:
3262        kmem_cache_destroy(iommu_iova_cache);
3263
3264        return -ENOMEM;
3265}
3266
3267static void __init iommu_exit_mempool(void)
3268{
3269        kmem_cache_destroy(iommu_devinfo_cache);
3270        kmem_cache_destroy(iommu_domain_cache);
3271        kmem_cache_destroy(iommu_iova_cache);
3272
3273}
3274
3275static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3276{
3277        struct dmar_drhd_unit *drhd;
3278        u32 vtbar;
3279        int rc;
3280
3281        /* We know that this device on this chipset has its own IOMMU.
3282         * If we find it under a different IOMMU, then the BIOS is lying
3283         * to us. Hope that the IOMMU for this device is actually
3284         * disabled, and it needs no translation...
3285         */
3286        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3287        if (rc) {
3288                /* "can't" happen */
3289                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3290                return;
3291        }
3292        vtbar &= 0xffff0000;
3293
3294        /* we know that this iommu should be at offset 0xa000 from vtbar */
3295        drhd = dmar_find_matched_drhd_unit(pdev);
3296        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3297                            TAINT_FIRMWARE_WORKAROUND,
3298                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3299                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3300}
3301DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3302
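    /*
     * Mark DRHD units that cover no PCI devices as ignored.  Units that
     * cover only graphics devices are also ignored when gfx mapping is
     * disabled; otherwise intel_iommu_gfx_mapped is set.
     */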
3303static void __init init_no_remapping_devices(void)
3304{
3305        struct dmar_drhd_unit *drhd;
3306
3307        for_each_drhd_unit(drhd) {
3308                if (!drhd->include_all) {
3309                        int i;
3310                        for (i = 0; i < drhd->devices_cnt; i++)
3311                                if (drhd->devices[i] != NULL)
3312                                        break;
3313                        /* ignore DMAR unit if no pci devices exist */
3314                        if (i == drhd->devices_cnt)
3315                                drhd->ignored = 1;
3316                }
3317        }
3318
3319        for_each_drhd_unit(drhd) {
3320                int i;
3321                if (drhd->ignored || drhd->include_all)
3322                        continue;
3323
3324                for (i = 0; i < drhd->devices_cnt; i++)
3325                        if (drhd->devices[i] &&
3326                            !IS_GFX_DEVICE(drhd->devices[i]))
3327                                break;
3328
3329                if (i < drhd->devices_cnt)
3330                        continue;
3331
3332                /* This IOMMU has *only* gfx devices. Either bypass it or
3333                   set the gfx_mapped flag, as appropriate */
3334                if (dmar_map_gfx) {
3335                        intel_iommu_gfx_mapped = 1;
3336                } else {
3337                        drhd->ignored = 1;
3338                        for (i = 0; i < drhd->devices_cnt; i++) {
3339                                if (!drhd->devices[i])
3340                                        continue;
3341                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3342                        }
3343                }
3344        }
3345}
3346
3347#ifdef CONFIG_SUSPEND
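    /* Reprogram every IOMMU (root entry, context/IOTLB flush, translation enable) on resume. */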
3348static int init_iommu_hw(void)
3349{
3350        struct dmar_drhd_unit *drhd;
3351        struct intel_iommu *iommu = NULL;
3352
3353        for_each_active_iommu(iommu, drhd)
3354                if (iommu->qi)
3355                        dmar_reenable_qi(iommu);
3356
3357        for_each_iommu(iommu, drhd) {
3358                if (drhd->ignored) {
3359                        /*
3360                         * we always have to disable PMRs or DMA may fail on
3361                         * this device
3362                         */
3363                        if (force_on)
3364                                iommu_disable_protect_mem_regions(iommu);
3365                        continue;
3366                }
3367
3368                iommu_flush_write_buffer(iommu);
3369
3370                iommu_set_root_entry(iommu);
3371
3372                iommu->flush.flush_context(iommu, 0, 0, 0,
3373                                           DMA_CCMD_GLOBAL_INVL);
3374                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3375                                         DMA_TLB_GLOBAL_FLUSH);
3376                if (iommu_enable_translation(iommu))
3377                        return 1;
3378                iommu_disable_protect_mem_regions(iommu);
3379        }
3380
3381        return 0;
3382}
3383
3384static void iommu_flush_all(void)
3385{
3386        struct dmar_drhd_unit *drhd;
3387        struct intel_iommu *iommu;
3388
3389        for_each_active_iommu(iommu, drhd) {
3390                iommu->flush.flush_context(iommu, 0, 0, 0,
3391                                           DMA_CCMD_GLOBAL_INVL);
3392                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3393                                         DMA_TLB_GLOBAL_FLUSH);
3394        }
3395}
3396
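    /*
     * syscore suspend: flush and disable every active IOMMU and save its
     * fault-event registers so iommu_resume() can restore them.
     */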
3397static int iommu_suspend(void)
3398{
3399        struct dmar_drhd_unit *drhd;
3400        struct intel_iommu *iommu = NULL;
3401        unsigned long flag;
3402
3403        for_each_active_iommu(iommu, drhd) {
3404                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3405                                                 GFP_ATOMIC);
3406                if (!iommu->iommu_state)
3407                        goto nomem;
3408        }
3409
3410        iommu_flush_all();
3411
3412        for_each_active_iommu(iommu, drhd) {
3413                iommu_disable_translation(iommu);
3414
3415                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3416
3417                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3418                        readl(iommu->reg + DMAR_FECTL_REG);
3419                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3420                        readl(iommu->reg + DMAR_FEDATA_REG);
3421                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3422                        readl(iommu->reg + DMAR_FEADDR_REG);
3423                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3424                        readl(iommu->reg + DMAR_FEUADDR_REG);
3425
3426                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3427        }
3428        return 0;
3429
3430nomem:
3431        for_each_active_iommu(iommu, drhd)
3432                kfree(iommu->iommu_state);
3433
3434        return -ENOMEM;
3435}
3436
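    /*
     * syscore resume: re-enable translation via init_iommu_hw() and
     * restore the fault-event registers saved by iommu_suspend().
     */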
3437static void iommu_resume(void)
3438{
3439        struct dmar_drhd_unit *drhd;
3440        struct intel_iommu *iommu = NULL;
3441        unsigned long flag;
3442
3443        if (init_iommu_hw()) {
3444                if (force_on)
3445                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3446                else
3447                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3448                return;
3449        }
3450
3451        for_each_active_iommu(iommu, drhd) {
3452
3453                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3454
3455                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3456                        iommu->reg + DMAR_FECTL_REG);
3457                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3458                        iommu->reg + DMAR_FEDATA_REG);
3459                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3460                        iommu->reg + DMAR_FEADDR_REG);
3461                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3462                        iommu->reg + DMAR_FEUADDR_REG);
3463
3464                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3465        }
3466
3467        for_each_active_iommu(iommu, drhd)
3468                kfree(iommu->iommu_state);
3469}
3470
3471static struct syscore_ops iommu_syscore_ops = {
3472        .resume         = iommu_resume,
3473        .suspend        = iommu_suspend,
3474};
3475
3476static void __init init_iommu_pm_ops(void)
3477{
3478        register_syscore_ops(&iommu_syscore_ops);
3479}
3480
3481#else
3482static inline void init_iommu_pm_ops(void) {}
3483#endif  /* CONFIG_SUSPEND */
3484
3485LIST_HEAD(dmar_rmrr_units);
3486
3487static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3488{
3489        list_add(&rmrr->list, &dmar_rmrr_units);
3490}
3491
3492
3493int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3494{
3495        struct acpi_dmar_reserved_memory *rmrr;
3496        struct dmar_rmrr_unit *rmrru;
3497
3498        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3499        if (!rmrru)
3500                return -ENOMEM;
3501
3502        rmrru->hdr = header;
3503        rmrr = (struct acpi_dmar_reserved_memory *)header;
3504        rmrru->base_address = rmrr->base_address;
3505        rmrru->end_address = rmrr->end_address;
3506
3507        dmar_register_rmrr_unit(rmrru);
3508        return 0;
3509}
3510
3511static int __init
3512rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3513{
3514        struct acpi_dmar_reserved_memory *rmrr;
3515        int ret;
3516
3517        rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3518        ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3519                ((void *)rmrr) + rmrr->header.length,
3520                &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3521
3522        if (ret || (rmrru->devices_cnt == 0)) {
3523                list_del(&rmrru->list);
3524                kfree(rmrru);
3525        }
3526        return ret;
3527}
3528
3529static LIST_HEAD(dmar_atsr_units);
3530
3531int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3532{
3533        struct acpi_dmar_atsr *atsr;
3534        struct dmar_atsr_unit *atsru;
3535
3536        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3537        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3538        if (!atsru)
3539                return -ENOMEM;
3540
3541        atsru->hdr = hdr;
3542        atsru->include_all = atsr->flags & 0x1;
3543
3544        list_add(&atsru->list, &dmar_atsr_units);
3545
3546        return 0;
3547}
3548
3549static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3550{
3551        int rc;
3552        struct acpi_dmar_atsr *atsr;
3553
3554        if (atsru->include_all)
3555                return 0;
3556
3557        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3558        rc = dmar_parse_dev_scope((void *)(atsr + 1),
3559                                (void *)atsr + atsr->header.length,
3560                                &atsru->devices_cnt, &atsru->devices,
3561                                atsr->segment);
3562        if (rc || !atsru->devices_cnt) {
3563                list_del(&atsru->list);
3564                kfree(atsru);
3565        }
3566
3567        return rc;
3568}
3569
3570int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3571{
3572        int i;
3573        struct pci_bus *bus;
3574        struct acpi_dmar_atsr *atsr;
3575        struct dmar_atsr_unit *atsru;
3576
3577        dev = pci_physfn(dev);
3578
3579        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3580                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3581                if (atsr->segment == pci_domain_nr(dev->bus))
3582                        goto found;
3583        }
3584
3585        return 0;
3586
3587found:
3588        for (bus = dev->bus; bus; bus = bus->parent) {
3589                struct pci_dev *bridge = bus->self;
3590
3591                if (!bridge || !pci_is_pcie(bridge) ||
3592                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3593                        return 0;
3594
3595                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3596                        for (i = 0; i < atsru->devices_cnt; i++)
3597                                if (atsru->devices[i] == bridge)
3598                                        return 1;
3599                        break;
3600                }
3601        }
3602
3603        if (atsru->include_all)
3604                return 1;
3605
3606        return 0;
3607}
3608
3609int __init dmar_parse_rmrr_atsr_dev(void)
3610{
3611        struct dmar_rmrr_unit *rmrr, *rmrr_n;
3612        struct dmar_atsr_unit *atsr, *atsr_n;
3613        int ret = 0;
3614
3615        list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3616                ret = rmrr_parse_dev(rmrr);
3617                if (ret)
3618                        return ret;
3619        }
3620
3621        list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3622                ret = atsr_parse_dev(atsr);
3623                if (ret)
3624                        return ret;
3625        }
3626
3627        return ret;
3628}
3629
3630/*
3631 * Here we only respond to a device being unbound from its driver.
3632 *
3633 * A newly added device is not attached to its DMAR domain here yet; that
3634 * happens when the device is first mapped to an iova.
3635 */
3636static int device_notifier(struct notifier_block *nb,
3637                                  unsigned long action, void *data)
3638{
3639        struct device *dev = data;
3640        struct pci_dev *pdev = to_pci_dev(dev);
3641        struct dmar_domain *domain;
3642
3643        if (iommu_no_mapping(dev))
3644                return 0;
3645
3646        domain = find_domain(pdev);
3647        if (!domain)
3648                return 0;
3649
3650        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3651                domain_remove_one_dev_info(domain, pdev);
3652
3653                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3654                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3655                    list_empty(&domain->devices))
3656                        domain_exit(domain);
3657        }
3658
3659        return 0;
3660}
3661
3662static struct notifier_block device_nb = {
3663        .notifier_call = device_notifier,
3664};
3665
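    /*
     * Main VT-d initialization: parse the DMAR table, set up domains via
     * init_dmars() and install intel_dma_ops and intel_iommu_ops.
     */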
3666int __init intel_iommu_init(void)
3667{
3668        int ret = 0;
3669        struct dmar_drhd_unit *drhd;
3670
3671        /* VT-d is required for a TXT/tboot launch, so enforce that */
3672        force_on = tboot_force_iommu();
3673
3674        if (dmar_table_init()) {
3675                if (force_on)
3676                        panic("tboot: Failed to initialize DMAR table\n");
3677                return  -ENODEV;
3678        }
3679
3680        /*
3681         * Disable translation if already enabled prior to OS handover.
3682         */
3683        for_each_drhd_unit(drhd) {
3684                struct intel_iommu *iommu;
3685
3686                if (drhd->ignored)
3687                        continue;
3688
3689                iommu = drhd->iommu;
3690                if (iommu->gcmd & DMA_GCMD_TE)
3691                        iommu_disable_translation(iommu);
3692        }
3693
3694        if (dmar_dev_scope_init() < 0) {
3695                if (force_on)
3696                        panic("tboot: Failed to initialize DMAR device scope\n");
3697                return  -ENODEV;
3698        }
3699
3700        if (no_iommu || dmar_disabled)
3701                return -ENODEV;
3702
3703        if (iommu_init_mempool()) {
3704                if (force_on)
3705                        panic("tboot: Failed to initialize iommu memory\n");
3706                return  -ENODEV;
3707        }
3708
3709        if (list_empty(&dmar_rmrr_units))
3710                printk(KERN_INFO "DMAR: No RMRR found\n");
3711
3712        if (list_empty(&dmar_atsr_units))
3713                printk(KERN_INFO "DMAR: No ATSR found\n");
3714
3715        if (dmar_init_reserved_ranges()) {
3716                if (force_on)
3717                        panic("tboot: Failed to reserve iommu ranges\n");
3718                return  -ENODEV;
3719        }
3720
3721        init_no_remapping_devices();
3722
3723        ret = init_dmars();
3724        if (ret) {
3725                if (force_on)
3726                        panic("tboot: Failed to initialize DMARs\n");
3727                printk(KERN_ERR "IOMMU: dmar init failed\n");
3728                put_iova_domain(&reserved_iova_list);
3729                iommu_exit_mempool();
3730                return ret;
3731        }
3732        printk(KERN_INFO
3733        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3734
3735        init_timer(&unmap_timer);
3736#ifdef CONFIG_SWIOTLB
3737        swiotlb = 0;
3738#endif
3739        dma_ops = &intel_dma_ops;
3740
3741        init_iommu_pm_ops();
3742
3743        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3744
3745        bus_register_notifier(&pci_bus_type, &device_nb);
3746
3747        intel_iommu_enabled = 1;
3748
3749        return 0;
3750}
3751
3752static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3753                                           struct pci_dev *pdev)
3754{
3755        struct pci_dev *tmp, *parent;
3756
3757        if (!iommu || !pdev)
3758                return;
3759
3760        /* dependent device detach */
3761        tmp = pci_find_upstream_pcie_bridge(pdev);
3762        /* Secondary interface's bus number and devfn 0 */
3763        if (tmp) {
3764                parent = pdev->bus->self;
3765                while (parent != tmp) {
3766                        iommu_detach_dev(iommu, parent->bus->number,
3767                                         parent->devfn);
3768                        parent = parent->bus->self;
3769                }
3770                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3771                        iommu_detach_dev(iommu,
3772                                tmp->subordinate->number, 0);
3773                else /* this is a legacy PCI bridge */
3774                        iommu_detach_dev(iommu, tmp->bus->number,
3775                                         tmp->devfn);
3776        }
3777}
3778
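    /*
     * Detach one device from @domain.  If no other device behind the same
     * IOMMU remains in the domain, clear that IOMMU from iommu_bmp and,
     * for non-VM/non-SI domains, release the domain id.
     */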
3779static void domain_remove_one_dev_info(struct dmar_domain *domain,
3780                                          struct pci_dev *pdev)
3781{
3782        struct device_domain_info *info;
3783        struct intel_iommu *iommu;
3784        unsigned long flags;
3785        int found = 0;
3786        struct list_head *entry, *tmp;
3787
3788        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3789                                pdev->devfn);
3790        if (!iommu)
3791                return;
3792
3793        spin_lock_irqsave(&device_domain_lock, flags);
3794        list_for_each_safe(entry, tmp, &domain->devices) {
3795                info = list_entry(entry, struct device_domain_info, link);
3796                if (info->segment == pci_domain_nr(pdev->bus) &&
3797                    info->bus == pdev->bus->number &&
3798                    info->devfn == pdev->devfn) {
3799                        unlink_domain_info(info);
3800                        spin_unlock_irqrestore(&device_domain_lock, flags);
3801
3802                        iommu_disable_dev_iotlb(info);
3803                        iommu_detach_dev(iommu, info->bus, info->devfn);
3804                        iommu_detach_dependent_devices(iommu, pdev);
3805                        free_devinfo_mem(info);
3806
3807                        spin_lock_irqsave(&device_domain_lock, flags);
3808
3809                        if (found)
3810                                break;
3811                        else
3812                                continue;
3813                }
3814
3815                /* if there are no other devices under the same iommu
3816                 * owned by this domain, clear this iommu in iommu_bmp,
3817                 * update the iommu count and coherency
3818                 */
3819                if (iommu == device_to_iommu(info->segment, info->bus,
3820                                            info->devfn))
3821                        found = 1;
3822        }
3823
3824        spin_unlock_irqrestore(&device_domain_lock, flags);
3825
3826        if (found == 0) {
3827                unsigned long tmp_flags;
3828                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3829                clear_bit(iommu->seq_id, domain->iommu_bmp);
3830                domain->iommu_count--;
3831                domain_update_iommu_cap(domain);
3832                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3833
3834                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3835                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3836                        spin_lock_irqsave(&iommu->lock, tmp_flags);
3837                        clear_bit(domain->id, iommu->domain_ids);
3838                        iommu->domains[domain->id] = NULL;
3839                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3840                }
3841        }
3842}
3843
3844static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3845{
3846        struct device_domain_info *info;
3847        struct intel_iommu *iommu;
3848        unsigned long flags1, flags2;
3849
3850        spin_lock_irqsave(&device_domain_lock, flags1);
3851        while (!list_empty(&domain->devices)) {
3852                info = list_entry(domain->devices.next,
3853                        struct device_domain_info, link);
3854                unlink_domain_info(info);
3855                spin_unlock_irqrestore(&device_domain_lock, flags1);
3856
3857                iommu_disable_dev_iotlb(info);
3858                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3859                iommu_detach_dev(iommu, info->bus, info->devfn);
3860                iommu_detach_dependent_devices(iommu, info->dev);
3861
3862                /* clear this iommu in iommu_bmp, update iommu count
3863                 * and capabilities
3864                 */
3865                spin_lock_irqsave(&domain->iommu_lock, flags2);
3866                if (test_and_clear_bit(iommu->seq_id,
3867                                       domain->iommu_bmp)) {
3868                        domain->iommu_count--;
3869                        domain_update_iommu_cap(domain);
3870                }
3871                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3872
3873                free_devinfo_mem(info);
3874                spin_lock_irqsave(&device_domain_lock, flags1);
3875        }
3876        spin_unlock_irqrestore(&device_domain_lock, flags1);
3877}
3878
3879/* domain id for virtual machine domains; never programmed into a context entry */
3880static unsigned long vm_domid;
3881
3882static struct dmar_domain *iommu_alloc_vm_domain(void)
3883{
3884        struct dmar_domain *domain;
3885
3886        domain = alloc_domain_mem();
3887        if (!domain)
3888                return NULL;
3889
3890        domain->id = vm_domid++;
3891        domain->nid = -1;
3892        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3893        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3894
3895        return domain;
3896}
3897
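    /*
     * Initialize a domain allocated by iommu_alloc_vm_domain(): set up the
     * iova allocator, derive agaw from @guest_width and allocate the top
     * level page directory.
     */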
3898static int md_domain_init(struct dmar_domain *domain, int guest_width)
3899{
3900        int adjust_width;
3901
3902        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3903        spin_lock_init(&domain->iommu_lock);
3904
3905        domain_reserve_special_ranges(domain);
3906
3907        /* calculate AGAW */
3908        domain->gaw = guest_width;
3909        adjust_width = guestwidth_to_adjustwidth(guest_width);
3910        domain->agaw = width_to_agaw(adjust_width);
3911
3912        INIT_LIST_HEAD(&domain->devices);
3913
3914        domain->iommu_count = 0;
3915        domain->iommu_coherency = 0;
3916        domain->iommu_snooping = 0;
3917        domain->iommu_superpage = 0;
3918        domain->max_addr = 0;
3919        domain->nid = -1;
3920
3921        /* always allocate the top pgd */
3922        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3923        if (!domain->pgd)
3924                return -ENOMEM;
3925        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3926        return 0;
3927}
3928
3929static void iommu_free_vm_domain(struct dmar_domain *domain)
3930{
3931        unsigned long flags;
3932        struct dmar_drhd_unit *drhd;
3933        struct intel_iommu *iommu;
3934        unsigned long i;
3935        unsigned long ndomains;
3936
3937        for_each_drhd_unit(drhd) {
3938                if (drhd->ignored)
3939                        continue;
3940                iommu = drhd->iommu;
3941
3942                ndomains = cap_ndoms(iommu->cap);
3943                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3944                        if (iommu->domains[i] == domain) {
3945                                spin_lock_irqsave(&iommu->lock, flags);
3946                                clear_bit(i, iommu->domain_ids);
3947                                iommu->domains[i] = NULL;
3948                                spin_unlock_irqrestore(&iommu->lock, flags);
3949                                break;
3950                        }
3951                }
3952        }
3953}
3954
3955static void vm_domain_exit(struct dmar_domain *domain)
3956{
3957        /* Domain 0 is reserved, so don't process it */
3958        if (!domain)
3959                return;
3960
3961        vm_domain_remove_all_dev_info(domain);
3962        /* destroy iovas */
3963        put_iova_domain(&domain->iovad);
3964
3965        /* clear ptes */
3966        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3967
3968        /* free page tables */
3969        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3970
3971        iommu_free_vm_domain(domain);
3972        free_domain_mem(domain);
3973}
3974
3975static int intel_iommu_domain_init(struct iommu_domain *domain)
3976{
3977        struct dmar_domain *dmar_domain;
3978
3979        dmar_domain = iommu_alloc_vm_domain();
3980        if (!dmar_domain) {
3981                printk(KERN_ERR
3982                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3983                return -ENOMEM;
3984        }
3985        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3986                printk(KERN_ERR
3987                        "intel_iommu_domain_init() failed\n");
3988                vm_domain_exit(dmar_domain);
3989                return -ENOMEM;
3990        }
3991        domain_update_iommu_cap(dmar_domain);
3992        domain->priv = dmar_domain;
3993
3994        domain->geometry.aperture_start = 0;
3995        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3996        domain->geometry.force_aperture = true;
3997
3998        return 0;
3999}
4000
4001static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4002{
4003        struct dmar_domain *dmar_domain = domain->priv;
4004
4005        domain->priv = NULL;
4006        vm_domain_exit(dmar_domain);
4007}
4008
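    /*
     * IOMMU API attach: detach the device from any previous domain, clamp
     * the domain's address width to what this IOMMU supports, then add
     * the device to @domain.
     */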
4009static int intel_iommu_attach_device(struct iommu_domain *domain,
4010                                     struct device *dev)
4011{
4012        struct dmar_domain *dmar_domain = domain->priv;
4013        struct pci_dev *pdev = to_pci_dev(dev);
4014        struct intel_iommu *iommu;
4015        int addr_width;
4016
4017        /* normally pdev is not mapped */
4018        if (unlikely(domain_context_mapped(pdev))) {
4019                struct dmar_domain *old_domain;
4020
4021                old_domain = find_domain(pdev);
4022                if (old_domain) {
4023                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4024                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4025                                domain_remove_one_dev_info(old_domain, pdev);
4026                        else
4027                                domain_remove_dev_info(old_domain);
4028                }
4029        }
4030
4031        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4032                                pdev->devfn);
4033        if (!iommu)
4034                return -ENODEV;
4035
4036        /* check if this iommu agaw is sufficient for max mapped address */
4037        addr_width = agaw_to_width(iommu->agaw);
4038        if (addr_width > cap_mgaw(iommu->cap))
4039                addr_width = cap_mgaw(iommu->cap);
4040
4041        if (dmar_domain->max_addr > (1LL << addr_width)) {
4042                printk(KERN_ERR "%s: iommu width (%d) is not "
4043                       "sufficient for the mapped address (%llx)\n",
4044                       __func__, addr_width, dmar_domain->max_addr);
4045                return -EFAULT;
4046        }
4047        dmar_domain->gaw = addr_width;
4048
4049        /*
4050         * Knock out extra levels of page tables if necessary
4051         */
4052        while (iommu->agaw < dmar_domain->agaw) {
4053                struct dma_pte *pte;
4054
4055                pte = dmar_domain->pgd;
4056                if (dma_pte_present(pte)) {
4057                        dmar_domain->pgd = (struct dma_pte *)
4058                                phys_to_virt(dma_pte_addr(pte));
4059                        free_pgtable_page(pte);
4060                }
4061                dmar_domain->agaw--;
4062        }
4063
4064        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4065}
4066
4067static void intel_iommu_detach_device(struct iommu_domain *domain,
4068                                      struct device *dev)
4069{
4070        struct dmar_domain *dmar_domain = domain->priv;
4071        struct pci_dev *pdev = to_pci_dev(dev);
4072
4073        domain_remove_one_dev_info(dmar_domain, pdev);
4074}
4075
4076static int intel_iommu_map(struct iommu_domain *domain,
4077                           unsigned long iova, phys_addr_t hpa,
4078                           size_t size, int iommu_prot)
4079{
4080        struct dmar_domain *dmar_domain = domain->priv;
4081        u64 max_addr;
4082        int prot = 0;
4083        int ret;
4084
4085        if (iommu_prot & IOMMU_READ)
4086                prot |= DMA_PTE_READ;
4087        if (iommu_prot & IOMMU_WRITE)
4088                prot |= DMA_PTE_WRITE;
4089        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4090                prot |= DMA_PTE_SNP;
4091
4092        max_addr = iova + size;
4093        if (dmar_domain->max_addr < max_addr) {
4094                u64 end;
4095
4096                /* check if minimum agaw is sufficient for mapped address */
4097                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4098                if (end < max_addr) {
4099                        printk(KERN_ERR "%s: iommu width (%d) is not "
4100                               "sufficient for the mapped address (%llx)\n",
4101                               __func__, dmar_domain->gaw, max_addr);
4102                        return -EFAULT;
4103                }
4104                dmar_domain->max_addr = max_addr;
4105        }
4106        /* Round up size to next multiple of PAGE_SIZE, if it and
4107           the low bits of hpa would take us onto the next page */
4108        size = aligned_nrpages(hpa, size);
4109        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4110                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4111        return ret;
4112}
4113
4114static size_t intel_iommu_unmap(struct iommu_domain *domain,
4115                             unsigned long iova, size_t size)
4116{
4117        struct dmar_domain *dmar_domain = domain->priv;
4118        int order;
4119
4120        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4121                            (iova + size - 1) >> VTD_PAGE_SHIFT);
4122
4123        if (dmar_domain->max_addr == iova + size)
4124                dmar_domain->max_addr = iova;
4125
4126        return PAGE_SIZE << order;
4127}
4128
4129static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4130                                            dma_addr_t iova)
4131{
4132        struct dmar_domain *dmar_domain = domain->priv;
4133        struct dma_pte *pte;
4134        u64 phys = 0;
4135
4136        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4137        if (pte)
4138                phys = dma_pte_addr(pte);
4139
4140        return phys;
4141}
4142
4143static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4144                                      unsigned long cap)
4145{
4146        struct dmar_domain *dmar_domain = domain->priv;
4147
4148        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4149                return dmar_domain->iommu_snooping;
4150        if (cap == IOMMU_CAP_INTR_REMAP)
4151                return irq_remapping_enabled;
4152
4153        return 0;
4154}
4155
4156#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4157
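    /*
     * IOMMU API add_device: find the upstream device that actually provides
     * DMA isolation (bridge aliases, DMA quirks, ACS) and place @dev in
     * that device's iommu_group, allocating a new group if needed.
     */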
4158static int intel_iommu_add_device(struct device *dev)
4159{
4160        struct pci_dev *pdev = to_pci_dev(dev);
4161        struct pci_dev *bridge, *dma_pdev = NULL;
4162        struct iommu_group *group;
4163        int ret;
4164
4165        if (!device_to_iommu(pci_domain_nr(pdev->bus),
4166                             pdev->bus->number, pdev->devfn))
4167                return -ENODEV;
4168
4169        bridge = pci_find_upstream_pcie_bridge(pdev);
4170        if (bridge) {
4171                if (pci_is_pcie(bridge))
4172                        dma_pdev = pci_get_domain_bus_and_slot(
4173                                                pci_domain_nr(pdev->bus),
4174                                                bridge->subordinate->number, 0);
4175                if (!dma_pdev)
4176                        dma_pdev = pci_dev_get(bridge);
4177        } else
4178                dma_pdev = pci_dev_get(pdev);
4179
4180        /* Account for quirked devices */
4181        swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4182
4183        /*
4184         * If it's a multifunction device that does not support our
4185         * required ACS flags, add it to the same group as the lowest
4186         * numbered function that also does not support the required ACS flags.
4187         */
4188        if (dma_pdev->multifunction &&
4189            !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4190                u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4191
4192                for (i = 0; i < 8; i++) {
4193                        struct pci_dev *tmp;
4194
4195                        tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4196                        if (!tmp)
4197                                continue;
4198
4199                        if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4200                                swap_pci_ref(&dma_pdev, tmp);
4201                                break;
4202                        }
4203                        pci_dev_put(tmp);
4204                }
4205        }
4206
4207        /*
4208         * Devices on the root bus go through the iommu.  If that's not us,
4209         * find the next upstream device and test ACS up to the root bus.
4210         * Finding the next device may require skipping virtual buses.
4211         */
4212        while (!pci_is_root_bus(dma_pdev->bus)) {
4213                struct pci_bus *bus = dma_pdev->bus;
4214
4215                while (!bus->self) {
4216                        if (!pci_is_root_bus(bus))
4217                                bus = bus->parent;
4218                        else
4219                                goto root_bus;
4220                }
4221
4222                if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4223                        break;
4224
4225                swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4226        }
4227
4228root_bus:
4229        group = iommu_group_get(&dma_pdev->dev);
4230        pci_dev_put(dma_pdev);
4231        if (!group) {
4232                group = iommu_group_alloc();
4233                if (IS_ERR(group))
4234                        return PTR_ERR(group);
4235        }
4236
4237        ret = iommu_group_add_device(group, dev);
4238
4239        iommu_group_put(group);
4240        return ret;
4241}
4242
4243static void intel_iommu_remove_device(struct device *dev)
4244{
4245        iommu_group_remove_device(dev);
4246}
4247
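    /* IOMMU API operations registered for the PCI bus by intel_iommu_init(). */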
4248static struct iommu_ops intel_iommu_ops = {
4249        .domain_init    = intel_iommu_domain_init,
4250        .domain_destroy = intel_iommu_domain_destroy,
4251        .attach_dev     = intel_iommu_attach_device,
4252        .detach_dev     = intel_iommu_detach_device,
4253        .map            = intel_iommu_map,
4254        .unmap          = intel_iommu_unmap,
4255        .iova_to_phys   = intel_iommu_iova_to_phys,
4256        .domain_has_cap = intel_iommu_domain_has_cap,
4257        .add_device     = intel_iommu_add_device,
4258        .remove_device  = intel_iommu_remove_device,
4259        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4260};
4261
4262static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4263{
4264        /* G4x/GM45 integrated gfx dmar support is totally busted. */
4265        printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4266        dmar_map_gfx = 0;
4267}
4268
4269DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4270DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4271DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4272DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4273DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4274DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4275DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4276
4277static void quirk_iommu_rwbf(struct pci_dev *dev)
4278{
4279        /*
4280         * Mobile 4 Series Chipset neglects to set RWBF capability,
4281         * but needs it. Same seems to hold for the desktop versions.
4282         */
4283        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4284        rwbf_quirk = 1;
4285}
4286
4287DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4288DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4289DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4290DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4291DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4292DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4293DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4294
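    /* GGC register bits consulted by quirk_calpella_no_shadow_gtt() below. */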
4295#define GGC 0x52
4296#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4297#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4298#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4299#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4300#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4301#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4302#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4303#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4304
4305static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4306{
4307        unsigned short ggc;
4308
4309        if (pci_read_config_word(dev, GGC, &ggc))
4310                return;
4311
4312        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4313                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4314                dmar_map_gfx = 0;
4315        } else if (dmar_map_gfx) {
4316                /* we have to ensure the gfx device is idle before we flush */
4317                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4318                intel_iommu_strict = 1;
4319        }
4320}
4321DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4322DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4323DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4324DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4325
4326/* On Tylersburg chipsets, some BIOSes have been known to enable the
4327   ISOCH DMAR unit for the Azalia sound device, but not give it any
4328   TLB entries, which causes it to deadlock. Check for that.  We do
4329   this in a function called from init_dmars(), instead of in a PCI
4330   quirk, because we don't want to print the obnoxious "BIOS broken"
4331   message if VT-d is actually disabled.
4332*/
4333static void __init check_tylersburg_isoch(void)
4334{
4335        struct pci_dev *pdev;
4336        uint32_t vtisochctrl;
4337
4338        /* If there's no Azalia in the system anyway, forget it. */
4339        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4340        if (!pdev)
4341                return;
4342        pci_dev_put(pdev);
4343
4344        /* System Management Registers. Might be hidden, in which case
4345           we can't do the sanity check. But that's OK, because the
4346           known-broken BIOSes _don't_ actually hide it, so far. */
4347        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4348        if (!pdev)
4349                return;
4350
4351        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4352                pci_dev_put(pdev);
4353                return;
4354        }
4355
4356        pci_dev_put(pdev);
4357
4358        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4359        if (vtisochctrl & 1)
4360                return;
4361
4362        /* Drop all bits other than the number of TLB entries */
4363        vtisochctrl &= 0x1c;
4364
4365        /* If we have the recommended number of TLB entries (16), fine. */
4366        if (vtisochctrl == 0x10)
4367                return;
4368
4369        /* Zero TLB entries? You get to ride the short bus to school. */
4370        if (!vtisochctrl) {
4371                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4372                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4373                     dmi_get_system_info(DMI_BIOS_VENDOR),
4374                     dmi_get_system_info(DMI_BIOS_VERSION),
4375                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4376                iommu_identity_mapping |= IDENTMAP_AZALIA;
4377                return;
4378        }
4379
4380        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4381               vtisochctrl);
4382}
4383