linux/drivers/pci/intel-iommu.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/slab.h>
  28#include <linux/irq.h>
  29#include <linux/interrupt.h>
  30#include <linux/spinlock.h>
  31#include <linux/pci.h>
  32#include <linux/dmar.h>
  33#include <linux/dma-mapping.h>
  34#include <linux/mempool.h>
  35#include <linux/timer.h>
  36#include <linux/iova.h>
  37#include <linux/iommu.h>
  38#include <linux/intel-iommu.h>
  39#include <linux/syscore_ops.h>
  40#include <linux/tboot.h>
  41#include <linux/dmi.h>
  42#include <asm/cacheflush.h>
  43#include <asm/iommu.h>
  44#include "pci.h"
  45
  46#define ROOT_SIZE               VTD_PAGE_SIZE
  47#define CONTEXT_SIZE            VTD_PAGE_SIZE
  48
  49#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  50#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  51#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  52
  53#define IOAPIC_RANGE_START      (0xfee00000)
  54#define IOAPIC_RANGE_END        (0xfeefffff)
  55#define IOVA_START_ADDR         (0x1000)
  56
  57#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  58
  59#define MAX_AGAW_WIDTH 64
  60
  61#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  62#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  63
  64/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  65   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  66#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  67                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  68#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  69
  70#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  71#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  72#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  73
  74/* page table handling */
  75#define LEVEL_STRIDE            (9)
  76#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  77
  78static inline int agaw_to_level(int agaw)
  79{
  80        return agaw + 2;
  81}
  82
  83static inline int agaw_to_width(int agaw)
  84{
  85        return 30 + agaw * LEVEL_STRIDE;
  86}
  87
  88static inline int width_to_agaw(int width)
  89{
  90        return (width - 30) / LEVEL_STRIDE;
  91}
  92
  93static inline unsigned int level_to_offset_bits(int level)
  94{
  95        return (level - 1) * LEVEL_STRIDE;
  96}
  97
  98static inline int pfn_level_offset(unsigned long pfn, int level)
  99{
 100        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 101}
 102
 103static inline unsigned long level_mask(int level)
 104{
 105        return -1UL << level_to_offset_bits(level);
 106}
 107
 108static inline unsigned long level_size(int level)
 109{
 110        return 1UL << level_to_offset_bits(level);
 111}
 112
 113static inline unsigned long align_to_level(unsigned long pfn, int level)
 114{
 115        return (pfn + level_size(level) - 1) & level_mask(level);
 116}
 117
 118/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 119   are never going to work. */
 120static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 121{
 122        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 123}
 124
 125static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 126{
 127        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 128}
 129static inline unsigned long page_to_dma_pfn(struct page *pg)
 130{
 131        return mm_to_dma_pfn(page_to_pfn(pg));
 132}
 133static inline unsigned long virt_to_dma_pfn(void *p)
 134{
 135        return page_to_dma_pfn(virt_to_page(p));
 136}
 137
 138/* global iommu list, set NULL for ignored DMAR units */
 139static struct intel_iommu **g_iommus;
 140
 141static void __init check_tylersburg_isoch(void);
 142static int rwbf_quirk;
 143
 144/*
 145 * 0: Present
 146 * 1-11: Reserved
 147 * 12-63: Context Ptr (12 - (haw-1))
 148 * 64-127: Reserved
 149 */
 150struct root_entry {
 151        u64     val;
 152        u64     rsvd1;
 153};
 154#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 155static inline bool root_present(struct root_entry *root)
 156{
 157        return (root->val & 1);
 158}
 159static inline void set_root_present(struct root_entry *root)
 160{
 161        root->val |= 1;
 162}
 163static inline void set_root_value(struct root_entry *root, unsigned long value)
 164{
 165        root->val |= value & VTD_PAGE_MASK;
 166}
 167
 168static inline struct context_entry *
 169get_context_addr_from_root(struct root_entry *root)
 170{
 171        return (struct context_entry *)
 172                (root_present(root)?phys_to_virt(
 173                root->val & VTD_PAGE_MASK) :
 174                NULL);
 175}
 176
 177/*
 178 * low 64 bits:
 179 * 0: present
 180 * 1: fault processing disable
 181 * 2-3: translation type
 182 * 12-63: address space root
 183 * high 64 bits:
 184 * 0-2: address width
 185 * 3-6: aval
 186 * 8-23: domain id
 187 */
 188struct context_entry {
 189        u64 lo;
 190        u64 hi;
 191};
 192
 193static inline bool context_present(struct context_entry *context)
 194{
 195        return (context->lo & 1);
 196}
 197static inline void context_set_present(struct context_entry *context)
 198{
 199        context->lo |= 1;
 200}
 201
 202static inline void context_set_fault_enable(struct context_entry *context)
 203{
 204        context->lo &= (((u64)-1) << 2) | 1;
 205}
 206
 207static inline void context_set_translation_type(struct context_entry *context,
 208                                                unsigned long value)
 209{
 210        context->lo &= (((u64)-1) << 4) | 3;
 211        context->lo |= (value & 3) << 2;
 212}
 213
 214static inline void context_set_address_root(struct context_entry *context,
 215                                            unsigned long value)
 216{
 217        context->lo |= value & VTD_PAGE_MASK;
 218}
 219
 220static inline void context_set_address_width(struct context_entry *context,
 221                                             unsigned long value)
 222{
 223        context->hi |= value & 7;
 224}
 225
 226static inline void context_set_domain_id(struct context_entry *context,
 227                                         unsigned long value)
 228{
 229        context->hi |= (value & ((1 << 16) - 1)) << 8;
 230}
 231
 232static inline void context_clear_entry(struct context_entry *context)
 233{
 234        context->lo = 0;
 235        context->hi = 0;
 236}
 237
 238/*
 239 * 0: readable
 240 * 1: writable
 241 * 2-6: reserved
 242 * 7: super page
 243 * 8-10: available
 244 * 11: snoop behavior
 245 * 12-63: Host physcial address
 246 */
 247struct dma_pte {
 248        u64 val;
 249};
 250
 251static inline void dma_clear_pte(struct dma_pte *pte)
 252{
 253        pte->val = 0;
 254}
 255
 256static inline void dma_set_pte_readable(struct dma_pte *pte)
 257{
 258        pte->val |= DMA_PTE_READ;
 259}
 260
 261static inline void dma_set_pte_writable(struct dma_pte *pte)
 262{
 263        pte->val |= DMA_PTE_WRITE;
 264}
 265
 266static inline void dma_set_pte_snp(struct dma_pte *pte)
 267{
 268        pte->val |= DMA_PTE_SNP;
 269}
 270
 271static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 272{
 273        pte->val = (pte->val & ~3) | (prot & 3);
 274}
 275
 276static inline u64 dma_pte_addr(struct dma_pte *pte)
 277{
 278#ifdef CONFIG_64BIT
 279        return pte->val & VTD_PAGE_MASK;
 280#else
 281        /* Must have a full atomic 64-bit read */
 282        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 283#endif
 284}
 285
 286static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 287{
 288        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 289}
 290
 291static inline bool dma_pte_present(struct dma_pte *pte)
 292{
 293        return (pte->val & 3) != 0;
 294}
 295
 296static inline int first_pte_in_page(struct dma_pte *pte)
 297{
 298        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 299}
 300
 301/*
 302 * This domain is a statically identity mapping domain.
 303 *      1. This domain creats a static 1:1 mapping to all usable memory.
 304 *      2. It maps to each iommu if successful.
 305 *      3. Each iommu mapps to this domain if successful.
 306 */
 307static struct dmar_domain *si_domain;
 308static int hw_pass_through = 1;
 309
 310/* devices under the same p2p bridge are owned in one domain */
 311#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 312
 313/* domain represents a virtual machine, more than one devices
 314 * across iommus may be owned in one domain, e.g. kvm guest.
 315 */
 316#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 317
 318/* si_domain contains mulitple devices */
 319#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 320
 321struct dmar_domain {
 322        int     id;                     /* domain id */
 323        int     nid;                    /* node id */
 324        unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
 325
 326        struct list_head devices;       /* all devices' list */
 327        struct iova_domain iovad;       /* iova's that belong to this domain */
 328
 329        struct dma_pte  *pgd;           /* virtual address */
 330        int             gaw;            /* max guest address width */
 331
 332        /* adjusted guest address width, 0 is level 2 30-bit */
 333        int             agaw;
 334
 335        int             flags;          /* flags to find out type of domain */
 336
 337        int             iommu_coherency;/* indicate coherency of iommu access */
 338        int             iommu_snooping; /* indicate snooping control feature*/
 339        int             iommu_count;    /* reference count of iommu */
 340        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 341        u64             max_addr;       /* maximum mapped address */
 342};
 343
 344/* PCI domain-device relationship */
 345struct device_domain_info {
 346        struct list_head link;  /* link to domain siblings */
 347        struct list_head global; /* link to global list */
 348        int segment;            /* PCI domain */
 349        u8 bus;                 /* PCI bus number */
 350        u8 devfn;               /* PCI devfn number */
 351        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 352        struct intel_iommu *iommu; /* IOMMU used by this device */
 353        struct dmar_domain *domain; /* pointer to domain */
 354};
 355
 356static void flush_unmaps_timeout(unsigned long data);
 357
 358DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 359
 360#define HIGH_WATER_MARK 250
 361struct deferred_flush_tables {
 362        int next;
 363        struct iova *iova[HIGH_WATER_MARK];
 364        struct dmar_domain *domain[HIGH_WATER_MARK];
 365};
 366
 367static struct deferred_flush_tables *deferred_flush;
 368
 369/* bitmap for indexing intel_iommus */
 370static int g_num_of_iommus;
 371
 372static DEFINE_SPINLOCK(async_umap_flush_lock);
 373static LIST_HEAD(unmaps_to_do);
 374
 375static int timer_on;
 376static long list_size;
 377
 378static void domain_remove_dev_info(struct dmar_domain *domain);
 379
 380#ifdef CONFIG_DMAR_DEFAULT_ON
 381int dmar_disabled = 0;
 382#else
 383int dmar_disabled = 1;
 384#endif /*CONFIG_DMAR_DEFAULT_ON*/
 385
 386static int dmar_map_gfx = 1;
 387static int dmar_forcedac;
 388static int intel_iommu_strict;
 389
 390#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 391static DEFINE_SPINLOCK(device_domain_lock);
 392static LIST_HEAD(device_domain_list);
 393
 394static struct iommu_ops intel_iommu_ops;
 395
 396static int __init intel_iommu_setup(char *str)
 397{
 398        if (!str)
 399                return -EINVAL;
 400        while (*str) {
 401                if (!strncmp(str, "on", 2)) {
 402                        dmar_disabled = 0;
 403                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 404                } else if (!strncmp(str, "off", 3)) {
 405                        dmar_disabled = 1;
 406                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 407                } else if (!strncmp(str, "igfx_off", 8)) {
 408                        dmar_map_gfx = 0;
 409                        printk(KERN_INFO
 410                                "Intel-IOMMU: disable GFX device mapping\n");
 411                } else if (!strncmp(str, "forcedac", 8)) {
 412                        printk(KERN_INFO
 413                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 414                        dmar_forcedac = 1;
 415                } else if (!strncmp(str, "strict", 6)) {
 416                        printk(KERN_INFO
 417                                "Intel-IOMMU: disable batched IOTLB flush\n");
 418                        intel_iommu_strict = 1;
 419                }
 420
 421                str += strcspn(str, ",");
 422                while (*str == ',')
 423                        str++;
 424        }
 425        return 0;
 426}
 427__setup("intel_iommu=", intel_iommu_setup);
 428
 429static struct kmem_cache *iommu_domain_cache;
 430static struct kmem_cache *iommu_devinfo_cache;
 431static struct kmem_cache *iommu_iova_cache;
 432
 433static inline void *alloc_pgtable_page(int node)
 434{
 435        struct page *page;
 436        void *vaddr = NULL;
 437
 438        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 439        if (page)
 440                vaddr = page_address(page);
 441        return vaddr;
 442}
 443
 444static inline void free_pgtable_page(void *vaddr)
 445{
 446        free_page((unsigned long)vaddr);
 447}
 448
 449static inline void *alloc_domain_mem(void)
 450{
 451        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 452}
 453
 454static void free_domain_mem(void *vaddr)
 455{
 456        kmem_cache_free(iommu_domain_cache, vaddr);
 457}
 458
 459static inline void * alloc_devinfo_mem(void)
 460{
 461        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 462}
 463
 464static inline void free_devinfo_mem(void *vaddr)
 465{
 466        kmem_cache_free(iommu_devinfo_cache, vaddr);
 467}
 468
 469struct iova *alloc_iova_mem(void)
 470{
 471        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 472}
 473
 474void free_iova_mem(struct iova *iova)
 475{
 476        kmem_cache_free(iommu_iova_cache, iova);
 477}
 478
 479
 480static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 481{
 482        unsigned long sagaw;
 483        int agaw = -1;
 484
 485        sagaw = cap_sagaw(iommu->cap);
 486        for (agaw = width_to_agaw(max_gaw);
 487             agaw >= 0; agaw--) {
 488                if (test_bit(agaw, &sagaw))
 489                        break;
 490        }
 491
 492        return agaw;
 493}
 494
 495/*
 496 * Calculate max SAGAW for each iommu.
 497 */
 498int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 499{
 500        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 501}
 502
 503/*
 504 * calculate agaw for each iommu.
 505 * "SAGAW" may be different across iommus, use a default agaw, and
 506 * get a supported less agaw for iommus that don't support the default agaw.
 507 */
 508int iommu_calculate_agaw(struct intel_iommu *iommu)
 509{
 510        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 511}
 512
 513/* This functionin only returns single iommu in a domain */
 514static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 515{
 516        int iommu_id;
 517
 518        /* si_domain and vm domain should not get here. */
 519        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 520        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 521
 522        iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
 523        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 524                return NULL;
 525
 526        return g_iommus[iommu_id];
 527}
 528
 529static void domain_update_iommu_coherency(struct dmar_domain *domain)
 530{
 531        int i;
 532
 533        domain->iommu_coherency = 1;
 534
 535        for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
 536                if (!ecap_coherent(g_iommus[i]->ecap)) {
 537                        domain->iommu_coherency = 0;
 538                        break;
 539                }
 540        }
 541}
 542
 543static void domain_update_iommu_snooping(struct dmar_domain *domain)
 544{
 545        int i;
 546
 547        domain->iommu_snooping = 1;
 548
 549        for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
 550                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 551                        domain->iommu_snooping = 0;
 552                        break;
 553                }
 554        }
 555}
 556
 557/* Some capabilities may be different across iommus */
 558static void domain_update_iommu_cap(struct dmar_domain *domain)
 559{
 560        domain_update_iommu_coherency(domain);
 561        domain_update_iommu_snooping(domain);
 562}
 563
 564static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 565{
 566        struct dmar_drhd_unit *drhd = NULL;
 567        int i;
 568
 569        for_each_drhd_unit(drhd) {
 570                if (drhd->ignored)
 571                        continue;
 572                if (segment != drhd->segment)
 573                        continue;
 574
 575                for (i = 0; i < drhd->devices_cnt; i++) {
 576                        if (drhd->devices[i] &&
 577                            drhd->devices[i]->bus->number == bus &&
 578                            drhd->devices[i]->devfn == devfn)
 579                                return drhd->iommu;
 580                        if (drhd->devices[i] &&
 581                            drhd->devices[i]->subordinate &&
 582                            drhd->devices[i]->subordinate->number <= bus &&
 583                            drhd->devices[i]->subordinate->subordinate >= bus)
 584                                return drhd->iommu;
 585                }
 586
 587                if (drhd->include_all)
 588                        return drhd->iommu;
 589        }
 590
 591        return NULL;
 592}
 593
 594static void domain_flush_cache(struct dmar_domain *domain,
 595                               void *addr, int size)
 596{
 597        if (!domain->iommu_coherency)
 598                clflush_cache_range(addr, size);
 599}
 600
 601/* Gets context entry for a given bus and devfn */
 602static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 603                u8 bus, u8 devfn)
 604{
 605        struct root_entry *root;
 606        struct context_entry *context;
 607        unsigned long phy_addr;
 608        unsigned long flags;
 609
 610        spin_lock_irqsave(&iommu->lock, flags);
 611        root = &iommu->root_entry[bus];
 612        context = get_context_addr_from_root(root);
 613        if (!context) {
 614                context = (struct context_entry *)
 615                                alloc_pgtable_page(iommu->node);
 616                if (!context) {
 617                        spin_unlock_irqrestore(&iommu->lock, flags);
 618                        return NULL;
 619                }
 620                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 621                phy_addr = virt_to_phys((void *)context);
 622                set_root_value(root, phy_addr);
 623                set_root_present(root);
 624                __iommu_flush_cache(iommu, root, sizeof(*root));
 625        }
 626        spin_unlock_irqrestore(&iommu->lock, flags);
 627        return &context[devfn];
 628}
 629
 630static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 631{
 632        struct root_entry *root;
 633        struct context_entry *context;
 634        int ret;
 635        unsigned long flags;
 636
 637        spin_lock_irqsave(&iommu->lock, flags);
 638        root = &iommu->root_entry[bus];
 639        context = get_context_addr_from_root(root);
 640        if (!context) {
 641                ret = 0;
 642                goto out;
 643        }
 644        ret = context_present(&context[devfn]);
 645out:
 646        spin_unlock_irqrestore(&iommu->lock, flags);
 647        return ret;
 648}
 649
 650static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 651{
 652        struct root_entry *root;
 653        struct context_entry *context;
 654        unsigned long flags;
 655
 656        spin_lock_irqsave(&iommu->lock, flags);
 657        root = &iommu->root_entry[bus];
 658        context = get_context_addr_from_root(root);
 659        if (context) {
 660                context_clear_entry(&context[devfn]);
 661                __iommu_flush_cache(iommu, &context[devfn], \
 662                        sizeof(*context));
 663        }
 664        spin_unlock_irqrestore(&iommu->lock, flags);
 665}
 666
 667static void free_context_table(struct intel_iommu *iommu)
 668{
 669        struct root_entry *root;
 670        int i;
 671        unsigned long flags;
 672        struct context_entry *context;
 673
 674        spin_lock_irqsave(&iommu->lock, flags);
 675        if (!iommu->root_entry) {
 676                goto out;
 677        }
 678        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 679                root = &iommu->root_entry[i];
 680                context = get_context_addr_from_root(root);
 681                if (context)
 682                        free_pgtable_page(context);
 683        }
 684        free_pgtable_page(iommu->root_entry);
 685        iommu->root_entry = NULL;
 686out:
 687        spin_unlock_irqrestore(&iommu->lock, flags);
 688}
 689
 690static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 691                                      unsigned long pfn)
 692{
 693        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 694        struct dma_pte *parent, *pte = NULL;
 695        int level = agaw_to_level(domain->agaw);
 696        int offset;
 697
 698        BUG_ON(!domain->pgd);
 699        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 700        parent = domain->pgd;
 701
 702        while (level > 0) {
 703                void *tmp_page;
 704
 705                offset = pfn_level_offset(pfn, level);
 706                pte = &parent[offset];
 707                if (level == 1)
 708                        break;
 709
 710                if (!dma_pte_present(pte)) {
 711                        uint64_t pteval;
 712
 713                        tmp_page = alloc_pgtable_page(domain->nid);
 714
 715                        if (!tmp_page)
 716                                return NULL;
 717
 718                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 719                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 720                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 721                                /* Someone else set it while we were thinking; use theirs. */
 722                                free_pgtable_page(tmp_page);
 723                        } else {
 724                                dma_pte_addr(pte);
 725                                domain_flush_cache(domain, pte, sizeof(*pte));
 726                        }
 727                }
 728                parent = phys_to_virt(dma_pte_addr(pte));
 729                level--;
 730        }
 731
 732        return pte;
 733}
 734
 735/* return address's pte at specific level */
 736static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 737                                         unsigned long pfn,
 738                                         int level)
 739{
 740        struct dma_pte *parent, *pte = NULL;
 741        int total = agaw_to_level(domain->agaw);
 742        int offset;
 743
 744        parent = domain->pgd;
 745        while (level <= total) {
 746                offset = pfn_level_offset(pfn, total);
 747                pte = &parent[offset];
 748                if (level == total)
 749                        return pte;
 750
 751                if (!dma_pte_present(pte))
 752                        break;
 753                parent = phys_to_virt(dma_pte_addr(pte));
 754                total--;
 755        }
 756        return NULL;
 757}
 758
 759/* clear last level pte, a tlb flush should be followed */
 760static void dma_pte_clear_range(struct dmar_domain *domain,
 761                                unsigned long start_pfn,
 762                                unsigned long last_pfn)
 763{
 764        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 765        struct dma_pte *first_pte, *pte;
 766
 767        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 768        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 769        BUG_ON(start_pfn > last_pfn);
 770
 771        /* we don't need lock here; nobody else touches the iova range */
 772        do {
 773                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
 774                if (!pte) {
 775                        start_pfn = align_to_level(start_pfn + 1, 2);
 776                        continue;
 777                }
 778                do { 
 779                        dma_clear_pte(pte);
 780                        start_pfn++;
 781                        pte++;
 782                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 783
 784                domain_flush_cache(domain, first_pte,
 785                                   (void *)pte - (void *)first_pte);
 786
 787        } while (start_pfn && start_pfn <= last_pfn);
 788}
 789
 790/* free page table pages. last level pte should already be cleared */
 791static void dma_pte_free_pagetable(struct dmar_domain *domain,
 792                                   unsigned long start_pfn,
 793                                   unsigned long last_pfn)
 794{
 795        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 796        struct dma_pte *first_pte, *pte;
 797        int total = agaw_to_level(domain->agaw);
 798        int level;
 799        unsigned long tmp;
 800
 801        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 802        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 803        BUG_ON(start_pfn > last_pfn);
 804
 805        /* We don't need lock here; nobody else touches the iova range */
 806        level = 2;
 807        while (level <= total) {
 808                tmp = align_to_level(start_pfn, level);
 809
 810                /* If we can't even clear one PTE at this level, we're done */
 811                if (tmp + level_size(level) - 1 > last_pfn)
 812                        return;
 813
 814                do {
 815                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
 816                        if (!pte) {
 817                                tmp = align_to_level(tmp + 1, level + 1);
 818                                continue;
 819                        }
 820                        do {
 821                                if (dma_pte_present(pte)) {
 822                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 823                                        dma_clear_pte(pte);
 824                                }
 825                                pte++;
 826                                tmp += level_size(level);
 827                        } while (!first_pte_in_page(pte) &&
 828                                 tmp + level_size(level) - 1 <= last_pfn);
 829
 830                        domain_flush_cache(domain, first_pte,
 831                                           (void *)pte - (void *)first_pte);
 832                        
 833                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 834                level++;
 835        }
 836        /* free pgd */
 837        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 838                free_pgtable_page(domain->pgd);
 839                domain->pgd = NULL;
 840        }
 841}
 842
 843/* iommu handling */
 844static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 845{
 846        struct root_entry *root;
 847        unsigned long flags;
 848
 849        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 850        if (!root)
 851                return -ENOMEM;
 852
 853        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 854
 855        spin_lock_irqsave(&iommu->lock, flags);
 856        iommu->root_entry = root;
 857        spin_unlock_irqrestore(&iommu->lock, flags);
 858
 859        return 0;
 860}
 861
 862static void iommu_set_root_entry(struct intel_iommu *iommu)
 863{
 864        void *addr;
 865        u32 sts;
 866        unsigned long flag;
 867
 868        addr = iommu->root_entry;
 869
 870        spin_lock_irqsave(&iommu->register_lock, flag);
 871        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 872
 873        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 874
 875        /* Make sure hardware complete it */
 876        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 877                      readl, (sts & DMA_GSTS_RTPS), sts);
 878
 879        spin_unlock_irqrestore(&iommu->register_lock, flag);
 880}
 881
 882static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 883{
 884        u32 val;
 885        unsigned long flag;
 886
 887        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 888                return;
 889
 890        spin_lock_irqsave(&iommu->register_lock, flag);
 891        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 892
 893        /* Make sure hardware complete it */
 894        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 895                      readl, (!(val & DMA_GSTS_WBFS)), val);
 896
 897        spin_unlock_irqrestore(&iommu->register_lock, flag);
 898}
 899
 900/* return value determine if we need a write buffer flush */
 901static void __iommu_flush_context(struct intel_iommu *iommu,
 902                                  u16 did, u16 source_id, u8 function_mask,
 903                                  u64 type)
 904{
 905        u64 val = 0;
 906        unsigned long flag;
 907
 908        switch (type) {
 909        case DMA_CCMD_GLOBAL_INVL:
 910                val = DMA_CCMD_GLOBAL_INVL;
 911                break;
 912        case DMA_CCMD_DOMAIN_INVL:
 913                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
 914                break;
 915        case DMA_CCMD_DEVICE_INVL:
 916                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
 917                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
 918                break;
 919        default:
 920                BUG();
 921        }
 922        val |= DMA_CCMD_ICC;
 923
 924        spin_lock_irqsave(&iommu->register_lock, flag);
 925        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
 926
 927        /* Make sure hardware complete it */
 928        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
 929                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
 930
 931        spin_unlock_irqrestore(&iommu->register_lock, flag);
 932}
 933
 934/* return value determine if we need a write buffer flush */
 935static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 936                                u64 addr, unsigned int size_order, u64 type)
 937{
 938        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
 939        u64 val = 0, val_iva = 0;
 940        unsigned long flag;
 941
 942        switch (type) {
 943        case DMA_TLB_GLOBAL_FLUSH:
 944                /* global flush doesn't need set IVA_REG */
 945                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
 946                break;
 947        case DMA_TLB_DSI_FLUSH:
 948                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
 949                break;
 950        case DMA_TLB_PSI_FLUSH:
 951                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
 952                /* Note: always flush non-leaf currently */
 953                val_iva = size_order | addr;
 954                break;
 955        default:
 956                BUG();
 957        }
 958        /* Note: set drain read/write */
 959#if 0
 960        /*
 961         * This is probably to be super secure.. Looks like we can
 962         * ignore it without any impact.
 963         */
 964        if (cap_read_drain(iommu->cap))
 965                val |= DMA_TLB_READ_DRAIN;
 966#endif
 967        if (cap_write_drain(iommu->cap))
 968                val |= DMA_TLB_WRITE_DRAIN;
 969
 970        spin_lock_irqsave(&iommu->register_lock, flag);
 971        /* Note: Only uses first TLB reg currently */
 972        if (val_iva)
 973                dmar_writeq(iommu->reg + tlb_offset, val_iva);
 974        dmar_writeq(iommu->reg + tlb_offset + 8, val);
 975
 976        /* Make sure hardware complete it */
 977        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
 978                dmar_readq, (!(val & DMA_TLB_IVT)), val);
 979
 980        spin_unlock_irqrestore(&iommu->register_lock, flag);
 981
 982        /* check IOTLB invalidation granularity */
 983        if (DMA_TLB_IAIG(val) == 0)
 984                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
 985        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
 986                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
 987                        (unsigned long long)DMA_TLB_IIRG(type),
 988                        (unsigned long long)DMA_TLB_IAIG(val));
 989}
 990
 991static struct device_domain_info *iommu_support_dev_iotlb(
 992        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
 993{
 994        int found = 0;
 995        unsigned long flags;
 996        struct device_domain_info *info;
 997        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
 998
 999        if (!ecap_dev_iotlb_support(iommu->ecap))
1000                return NULL;
1001
1002        if (!iommu->qi)
1003                return NULL;
1004
1005        spin_lock_irqsave(&device_domain_lock, flags);
1006        list_for_each_entry(info, &domain->devices, link)
1007                if (info->bus == bus && info->devfn == devfn) {
1008                        found = 1;
1009                        break;
1010                }
1011        spin_unlock_irqrestore(&device_domain_lock, flags);
1012
1013        if (!found || !info->dev)
1014                return NULL;
1015
1016        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1017                return NULL;
1018
1019        if (!dmar_find_matched_atsr_unit(info->dev))
1020                return NULL;
1021
1022        info->iommu = iommu;
1023
1024        return info;
1025}
1026
1027static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1028{
1029        if (!info)
1030                return;
1031
1032        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1033}
1034
1035static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1036{
1037        if (!info->dev || !pci_ats_enabled(info->dev))
1038                return;
1039
1040        pci_disable_ats(info->dev);
1041}
1042
1043static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1044                                  u64 addr, unsigned mask)
1045{
1046        u16 sid, qdep;
1047        unsigned long flags;
1048        struct device_domain_info *info;
1049
1050        spin_lock_irqsave(&device_domain_lock, flags);
1051        list_for_each_entry(info, &domain->devices, link) {
1052                if (!info->dev || !pci_ats_enabled(info->dev))
1053                        continue;
1054
1055                sid = info->bus << 8 | info->devfn;
1056                qdep = pci_ats_queue_depth(info->dev);
1057                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1058        }
1059        spin_unlock_irqrestore(&device_domain_lock, flags);
1060}
1061
1062static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1063                                  unsigned long pfn, unsigned int pages, int map)
1064{
1065        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1066        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1067
1068        BUG_ON(pages == 0);
1069
1070        /*
1071         * Fallback to domain selective flush if no PSI support or the size is
1072         * too big.
1073         * PSI requires page size to be 2 ^ x, and the base address is naturally
1074         * aligned to the size
1075         */
1076        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1077                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1078                                                DMA_TLB_DSI_FLUSH);
1079        else
1080                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1081                                                DMA_TLB_PSI_FLUSH);
1082
1083        /*
1084         * In caching mode, changes of pages from non-present to present require
1085         * flush. However, device IOTLB doesn't need to be flushed in this case.
1086         */
1087        if (!cap_caching_mode(iommu->cap) || !map)
1088                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1089}
1090
1091static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1092{
1093        u32 pmen;
1094        unsigned long flags;
1095
1096        spin_lock_irqsave(&iommu->register_lock, flags);
1097        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1098        pmen &= ~DMA_PMEN_EPM;
1099        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1100
1101        /* wait for the protected region status bit to clear */
1102        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1103                readl, !(pmen & DMA_PMEN_PRS), pmen);
1104
1105        spin_unlock_irqrestore(&iommu->register_lock, flags);
1106}
1107
1108static int iommu_enable_translation(struct intel_iommu *iommu)
1109{
1110        u32 sts;
1111        unsigned long flags;
1112
1113        spin_lock_irqsave(&iommu->register_lock, flags);
1114        iommu->gcmd |= DMA_GCMD_TE;
1115        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1116
1117        /* Make sure hardware complete it */
1118        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1119                      readl, (sts & DMA_GSTS_TES), sts);
1120
1121        spin_unlock_irqrestore(&iommu->register_lock, flags);
1122        return 0;
1123}
1124
1125static int iommu_disable_translation(struct intel_iommu *iommu)
1126{
1127        u32 sts;
1128        unsigned long flag;
1129
1130        spin_lock_irqsave(&iommu->register_lock, flag);
1131        iommu->gcmd &= ~DMA_GCMD_TE;
1132        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1133
1134        /* Make sure hardware complete it */
1135        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1136                      readl, (!(sts & DMA_GSTS_TES)), sts);
1137
1138        spin_unlock_irqrestore(&iommu->register_lock, flag);
1139        return 0;
1140}
1141
1142
1143static int iommu_init_domains(struct intel_iommu *iommu)
1144{
1145        unsigned long ndomains;
1146        unsigned long nlongs;
1147
1148        ndomains = cap_ndoms(iommu->cap);
1149        pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1150                        ndomains);
1151        nlongs = BITS_TO_LONGS(ndomains);
1152
1153        spin_lock_init(&iommu->lock);
1154
1155        /* TBD: there might be 64K domains,
1156         * consider other allocation for future chip
1157         */
1158        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1159        if (!iommu->domain_ids) {
1160                printk(KERN_ERR "Allocating domain id array failed\n");
1161                return -ENOMEM;
1162        }
1163        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1164                        GFP_KERNEL);
1165        if (!iommu->domains) {
1166                printk(KERN_ERR "Allocating domain array failed\n");
1167                return -ENOMEM;
1168        }
1169
1170        /*
1171         * if Caching mode is set, then invalid translations are tagged
1172         * with domainid 0. Hence we need to pre-allocate it.
1173         */
1174        if (cap_caching_mode(iommu->cap))
1175                set_bit(0, iommu->domain_ids);
1176        return 0;
1177}
1178
1179
1180static void domain_exit(struct dmar_domain *domain);
1181static void vm_domain_exit(struct dmar_domain *domain);
1182
1183void free_dmar_iommu(struct intel_iommu *iommu)
1184{
1185        struct dmar_domain *domain;
1186        int i;
1187        unsigned long flags;
1188
1189        if ((iommu->domains) && (iommu->domain_ids)) {
1190                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1191                        domain = iommu->domains[i];
1192                        clear_bit(i, iommu->domain_ids);
1193
1194                        spin_lock_irqsave(&domain->iommu_lock, flags);
1195                        if (--domain->iommu_count == 0) {
1196                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1197                                        vm_domain_exit(domain);
1198                                else
1199                                        domain_exit(domain);
1200                        }
1201                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1202                }
1203        }
1204
1205        if (iommu->gcmd & DMA_GCMD_TE)
1206                iommu_disable_translation(iommu);
1207
1208        if (iommu->irq) {
1209                irq_set_handler_data(iommu->irq, NULL);
1210                /* This will mask the irq */
1211                free_irq(iommu->irq, iommu);
1212                destroy_irq(iommu->irq);
1213        }
1214
1215        kfree(iommu->domains);
1216        kfree(iommu->domain_ids);
1217
1218        g_iommus[iommu->seq_id] = NULL;
1219
1220        /* if all iommus are freed, free g_iommus */
1221        for (i = 0; i < g_num_of_iommus; i++) {
1222                if (g_iommus[i])
1223                        break;
1224        }
1225
1226        if (i == g_num_of_iommus)
1227                kfree(g_iommus);
1228
1229        /* free context mapping */
1230        free_context_table(iommu);
1231}
1232
1233static struct dmar_domain *alloc_domain(void)
1234{
1235        struct dmar_domain *domain;
1236
1237        domain = alloc_domain_mem();
1238        if (!domain)
1239                return NULL;
1240
1241        domain->nid = -1;
1242        memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1243        domain->flags = 0;
1244
1245        return domain;
1246}
1247
1248static int iommu_attach_domain(struct dmar_domain *domain,
1249                               struct intel_iommu *iommu)
1250{
1251        int num;
1252        unsigned long ndomains;
1253        unsigned long flags;
1254
1255        ndomains = cap_ndoms(iommu->cap);
1256
1257        spin_lock_irqsave(&iommu->lock, flags);
1258
1259        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1260        if (num >= ndomains) {
1261                spin_unlock_irqrestore(&iommu->lock, flags);
1262                printk(KERN_ERR "IOMMU: no free domain ids\n");
1263                return -ENOMEM;
1264        }
1265
1266        domain->id = num;
1267        set_bit(num, iommu->domain_ids);
1268        set_bit(iommu->seq_id, &domain->iommu_bmp);
1269        iommu->domains[num] = domain;
1270        spin_unlock_irqrestore(&iommu->lock, flags);
1271
1272        return 0;
1273}
1274
1275static void iommu_detach_domain(struct dmar_domain *domain,
1276                                struct intel_iommu *iommu)
1277{
1278        unsigned long flags;
1279        int num, ndomains;
1280        int found = 0;
1281
1282        spin_lock_irqsave(&iommu->lock, flags);
1283        ndomains = cap_ndoms(iommu->cap);
1284        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1285                if (iommu->domains[num] == domain) {
1286                        found = 1;
1287                        break;
1288                }
1289        }
1290
1291        if (found) {
1292                clear_bit(num, iommu->domain_ids);
1293                clear_bit(iommu->seq_id, &domain->iommu_bmp);
1294                iommu->domains[num] = NULL;
1295        }
1296        spin_unlock_irqrestore(&iommu->lock, flags);
1297}
1298
1299static struct iova_domain reserved_iova_list;
1300static struct lock_class_key reserved_rbtree_key;
1301
1302static int dmar_init_reserved_ranges(void)
1303{
1304        struct pci_dev *pdev = NULL;
1305        struct iova *iova;
1306        int i;
1307
1308        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1309
1310        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1311                &reserved_rbtree_key);
1312
1313        /* IOAPIC ranges shouldn't be accessed by DMA */
1314        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1315                IOVA_PFN(IOAPIC_RANGE_END));
1316        if (!iova) {
1317                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1318                return -ENODEV;
1319        }
1320
1321        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1322        for_each_pci_dev(pdev) {
1323                struct resource *r;
1324
1325                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1326                        r = &pdev->resource[i];
1327                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1328                                continue;
1329                        iova = reserve_iova(&reserved_iova_list,
1330                                            IOVA_PFN(r->start),
1331                                            IOVA_PFN(r->end));
1332                        if (!iova) {
1333                                printk(KERN_ERR "Reserve iova failed\n");
1334                                return -ENODEV;
1335                        }
1336                }
1337        }
1338        return 0;
1339}
1340
1341static void domain_reserve_special_ranges(struct dmar_domain *domain)
1342{
1343        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1344}
1345
1346static inline int guestwidth_to_adjustwidth(int gaw)
1347{
1348        int agaw;
1349        int r = (gaw - 12) % 9;
1350
1351        if (r == 0)
1352                agaw = gaw;
1353        else
1354                agaw = gaw + 9 - r;
1355        if (agaw > 64)
1356                agaw = 64;
1357        return agaw;
1358}
1359
1360static int domain_init(struct dmar_domain *domain, int guest_width)
1361{
1362        struct intel_iommu *iommu;
1363        int adjust_width, agaw;
1364        unsigned long sagaw;
1365
1366        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1367        spin_lock_init(&domain->iommu_lock);
1368
1369        domain_reserve_special_ranges(domain);
1370
1371        /* calculate AGAW */
1372        iommu = domain_get_iommu(domain);
1373        if (guest_width > cap_mgaw(iommu->cap))
1374                guest_width = cap_mgaw(iommu->cap);
1375        domain->gaw = guest_width;
1376        adjust_width = guestwidth_to_adjustwidth(guest_width);
1377        agaw = width_to_agaw(adjust_width);
1378        sagaw = cap_sagaw(iommu->cap);
1379        if (!test_bit(agaw, &sagaw)) {
1380                /* hardware doesn't support it, choose a bigger one */
1381                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1382                agaw = find_next_bit(&sagaw, 5, agaw);
1383                if (agaw >= 5)
1384                        return -ENODEV;
1385        }
1386        domain->agaw = agaw;
1387        INIT_LIST_HEAD(&domain->devices);
1388
1389        if (ecap_coherent(iommu->ecap))
1390                domain->iommu_coherency = 1;
1391        else
1392                domain->iommu_coherency = 0;
1393
1394        if (ecap_sc_support(iommu->ecap))
1395                domain->iommu_snooping = 1;
1396        else
1397                domain->iommu_snooping = 0;
1398
1399        domain->iommu_count = 1;
1400        domain->nid = iommu->node;
1401
1402        /* always allocate the top pgd */
1403        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1404        if (!domain->pgd)
1405                return -ENOMEM;
1406        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1407        return 0;
1408}
1409
1410static void domain_exit(struct dmar_domain *domain)
1411{
1412        struct dmar_drhd_unit *drhd;
1413        struct intel_iommu *iommu;
1414
1415        /* Domain 0 is reserved, so dont process it */
1416        if (!domain)
1417                return;
1418
1419        domain_remove_dev_info(domain);
1420        /* destroy iovas */
1421        put_iova_domain(&domain->iovad);
1422
1423        /* clear ptes */
1424        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1425
1426        /* free page tables */
1427        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1428
1429        for_each_active_iommu(iommu, drhd)
1430                if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1431                        iommu_detach_domain(domain, iommu);
1432
1433        free_domain_mem(domain);
1434}
1435
1436static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1437                                 u8 bus, u8 devfn, int translation)
1438{
1439        struct context_entry *context;
1440        unsigned long flags;
1441        struct intel_iommu *iommu;
1442        struct dma_pte *pgd;
1443        unsigned long num;
1444        unsigned long ndomains;
1445        int id;
1446        int agaw;
1447        struct device_domain_info *info = NULL;
1448
1449        pr_debug("Set context mapping for %02x:%02x.%d\n",
1450                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1451
1452        BUG_ON(!domain->pgd);
1453        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1454               translation != CONTEXT_TT_MULTI_LEVEL);
1455
1456        iommu = device_to_iommu(segment, bus, devfn);
1457        if (!iommu)
1458                return -ENODEV;
1459
1460        context = device_to_context_entry(iommu, bus, devfn);
1461        if (!context)
1462                return -ENOMEM;
1463        spin_lock_irqsave(&iommu->lock, flags);
1464        if (context_present(context)) {
1465                spin_unlock_irqrestore(&iommu->lock, flags);
1466                return 0;
1467        }
1468
1469        id = domain->id;
1470        pgd = domain->pgd;
1471
1472        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1473            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1474                int found = 0;
1475
1476                /* find an available domain id for this device in iommu */
1477                ndomains = cap_ndoms(iommu->cap);
1478                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1479                        if (iommu->domains[num] == domain) {
1480                                id = num;
1481                                found = 1;
1482                                break;
1483                        }
1484                }
1485
1486                if (found == 0) {
1487                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1488                        if (num >= ndomains) {
1489                                spin_unlock_irqrestore(&iommu->lock, flags);
1490                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1491                                return -EFAULT;
1492                        }
1493
1494                        set_bit(num, iommu->domain_ids);
1495                        iommu->domains[num] = domain;
1496                        id = num;
1497                }
1498
1499                /* Skip top levels of page tables for
1500                 * iommu which has less agaw than default.
1501                 * Unnecessary for PT mode.
1502                 */
1503                if (translation != CONTEXT_TT_PASS_THROUGH) {
1504                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1505                                pgd = phys_to_virt(dma_pte_addr(pgd));
1506                                if (!dma_pte_present(pgd)) {
1507                                        spin_unlock_irqrestore(&iommu->lock, flags);
1508                                        return -ENOMEM;
1509                                }
1510                        }
1511                }
1512        }
1513
1514        context_set_domain_id(context, id);
1515
1516        if (translation != CONTEXT_TT_PASS_THROUGH) {
1517                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1518                translation = info ? CONTEXT_TT_DEV_IOTLB :
1519                                     CONTEXT_TT_MULTI_LEVEL;
1520        }
1521        /*
1522         * In pass through mode, AW must be programmed to indicate the largest
1523         * AGAW value supported by hardware. And ASR is ignored by hardware.
1524         */
1525        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1526                context_set_address_width(context, iommu->msagaw);
1527        else {
1528                context_set_address_root(context, virt_to_phys(pgd));
1529                context_set_address_width(context, iommu->agaw);
1530        }
1531
1532        context_set_translation_type(context, translation);
1533        context_set_fault_enable(context);
1534        context_set_present(context);
1535        domain_flush_cache(domain, context, sizeof(*context));
1536
1537        /*
1538         * It's a non-present to present mapping. If hardware doesn't cache
1539         * non-present entry we only need to flush the write-buffer. If the
1540         * _does_ cache non-present entries, then it does so in the special
1541         * domain #0, which we have to flush:
1542         */
1543        if (cap_caching_mode(iommu->cap)) {
1544                iommu->flush.flush_context(iommu, 0,
1545                                           (((u16)bus) << 8) | devfn,
1546                                           DMA_CCMD_MASK_NOBIT,
1547                                           DMA_CCMD_DEVICE_INVL);
1548                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1549        } else {
1550                iommu_flush_write_buffer(iommu);
1551        }
1552        iommu_enable_dev_iotlb(info);
1553        spin_unlock_irqrestore(&iommu->lock, flags);
1554
1555        spin_lock_irqsave(&domain->iommu_lock, flags);
1556        if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1557                domain->iommu_count++;
1558                if (domain->iommu_count == 1)
1559                        domain->nid = iommu->node;
1560                domain_update_iommu_cap(domain);
1561        }
1562        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1563        return 0;
1564}
1565
1566static int
1567domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1568                        int translation)
1569{
1570        int ret;
1571        struct pci_dev *tmp, *parent;
1572
1573        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1574                                         pdev->bus->number, pdev->devfn,
1575                                         translation);
1576        if (ret)
1577                return ret;
1578
1579        /* dependent device mapping */
1580        tmp = pci_find_upstream_pcie_bridge(pdev);
1581        if (!tmp)
1582                return 0;
1583        /* Secondary interface's bus number and devfn 0 */
1584        parent = pdev->bus->self;
1585        while (parent != tmp) {
1586                ret = domain_context_mapping_one(domain,
1587                                                 pci_domain_nr(parent->bus),
1588                                                 parent->bus->number,
1589                                                 parent->devfn, translation);
1590                if (ret)
1591                        return ret;
1592                parent = parent->bus->self;
1593        }
1594        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1595                return domain_context_mapping_one(domain,
1596                                        pci_domain_nr(tmp->subordinate),
1597                                        tmp->subordinate->number, 0,
1598                                        translation);
1599        else /* this is a legacy PCI bridge */
1600                return domain_context_mapping_one(domain,
1601                                                  pci_domain_nr(tmp->bus),
1602                                                  tmp->bus->number,
1603                                                  tmp->devfn,
1604                                                  translation);
1605}
1606
1607static int domain_context_mapped(struct pci_dev *pdev)
1608{
1609        int ret;
1610        struct pci_dev *tmp, *parent;
1611        struct intel_iommu *iommu;
1612
1613        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1614                                pdev->devfn);
1615        if (!iommu)
1616                return -ENODEV;
1617
1618        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1619        if (!ret)
1620                return ret;
1621        /* dependent device mapping */
1622        tmp = pci_find_upstream_pcie_bridge(pdev);
1623        if (!tmp)
1624                return ret;
1625        /* Secondary interface's bus number and devfn 0 */
1626        parent = pdev->bus->self;
1627        while (parent != tmp) {
1628                ret = device_context_mapped(iommu, parent->bus->number,
1629                                            parent->devfn);
1630                if (!ret)
1631                        return ret;
1632                parent = parent->bus->self;
1633        }
1634        if (pci_is_pcie(tmp))
1635                return device_context_mapped(iommu, tmp->subordinate->number,
1636                                             0);
1637        else
1638                return device_context_mapped(iommu, tmp->bus->number,
1639                                             tmp->devfn);
1640}
1641
1642/* Returns a number of VTD pages, but aligned to MM page size */
1643static inline unsigned long aligned_nrpages(unsigned long host_addr,
1644                                            size_t size)
1645{
1646        host_addr &= ~PAGE_MASK;
1647        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1648}
1649
1650static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1651                            struct scatterlist *sg, unsigned long phys_pfn,
1652                            unsigned long nr_pages, int prot)
1653{
1654        struct dma_pte *first_pte = NULL, *pte = NULL;
1655        phys_addr_t uninitialized_var(pteval);
1656        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1657        unsigned long sg_res;
1658
1659        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1660
1661        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1662                return -EINVAL;
1663
1664        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1665
1666        if (sg)
1667                sg_res = 0;
1668        else {
1669                sg_res = nr_pages + 1;
1670                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1671        }
1672
1673        while (nr_pages--) {
1674                uint64_t tmp;
1675
1676                if (!sg_res) {
1677                        sg_res = aligned_nrpages(sg->offset, sg->length);
1678                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1679                        sg->dma_length = sg->length;
1680                        pteval = page_to_phys(sg_page(sg)) | prot;
1681                }
1682                if (!pte) {
1683                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1684                        if (!pte)
1685                                return -ENOMEM;
1686                }
1687                /* We don't need lock here, nobody else
1688                 * touches the iova range
1689                 */
1690                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1691                if (tmp) {
1692                        static int dumps = 5;
1693                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1694                               iov_pfn, tmp, (unsigned long long)pteval);
1695                        if (dumps) {
1696                                dumps--;
1697                                debug_dma_dump_mappings(NULL);
1698                        }
1699                        WARN_ON(1);
1700                }
1701                pte++;
1702                if (!nr_pages || first_pte_in_page(pte)) {
1703                        domain_flush_cache(domain, first_pte,
1704                                           (void *)pte - (void *)first_pte);
1705                        pte = NULL;
1706                }
1707                iov_pfn++;
1708                pteval += VTD_PAGE_SIZE;
1709                sg_res--;
1710                if (!sg_res)
1711                        sg = sg_next(sg);
1712        }
1713        return 0;
1714}
1715
1716static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1717                                    struct scatterlist *sg, unsigned long nr_pages,
1718                                    int prot)
1719{
1720        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1721}
1722
1723static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1724                                     unsigned long phys_pfn, unsigned long nr_pages,
1725                                     int prot)
1726{
1727        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1728}
1729
1730static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1731{
1732        if (!iommu)
1733                return;
1734
1735        clear_context_table(iommu, bus, devfn);
1736        iommu->flush.flush_context(iommu, 0, 0, 0,
1737                                           DMA_CCMD_GLOBAL_INVL);
1738        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1739}
1740
1741static void domain_remove_dev_info(struct dmar_domain *domain)
1742{
1743        struct device_domain_info *info;
1744        unsigned long flags;
1745        struct intel_iommu *iommu;
1746
1747        spin_lock_irqsave(&device_domain_lock, flags);
1748        while (!list_empty(&domain->devices)) {
1749                info = list_entry(domain->devices.next,
1750                        struct device_domain_info, link);
1751                list_del(&info->link);
1752                list_del(&info->global);
1753                if (info->dev)
1754                        info->dev->dev.archdata.iommu = NULL;
1755                spin_unlock_irqrestore(&device_domain_lock, flags);
1756
1757                iommu_disable_dev_iotlb(info);
1758                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1759                iommu_detach_dev(iommu, info->bus, info->devfn);
1760                free_devinfo_mem(info);
1761
1762                spin_lock_irqsave(&device_domain_lock, flags);
1763        }
1764        spin_unlock_irqrestore(&device_domain_lock, flags);
1765}
1766
1767/*
1768 * find_domain
1769 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1770 */
1771static struct dmar_domain *
1772find_domain(struct pci_dev *pdev)
1773{
1774        struct device_domain_info *info;
1775
1776        /* No lock here, assumes no domain exit in normal case */
1777        info = pdev->dev.archdata.iommu;
1778        if (info)
1779                return info->domain;
1780        return NULL;
1781}
1782
1783/* domain is initialized */
1784static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1785{
1786        struct dmar_domain *domain, *found = NULL;
1787        struct intel_iommu *iommu;
1788        struct dmar_drhd_unit *drhd;
1789        struct device_domain_info *info, *tmp;
1790        struct pci_dev *dev_tmp;
1791        unsigned long flags;
1792        int bus = 0, devfn = 0;
1793        int segment;
1794        int ret;
1795
1796        domain = find_domain(pdev);
1797        if (domain)
1798                return domain;
1799
1800        segment = pci_domain_nr(pdev->bus);
1801
1802        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1803        if (dev_tmp) {
1804                if (pci_is_pcie(dev_tmp)) {
1805                        bus = dev_tmp->subordinate->number;
1806                        devfn = 0;
1807                } else {
1808                        bus = dev_tmp->bus->number;
1809                        devfn = dev_tmp->devfn;
1810                }
1811                spin_lock_irqsave(&device_domain_lock, flags);
1812                list_for_each_entry(info, &device_domain_list, global) {
1813                        if (info->segment == segment &&
1814                            info->bus == bus && info->devfn == devfn) {
1815                                found = info->domain;
1816                                break;
1817                        }
1818                }
1819                spin_unlock_irqrestore(&device_domain_lock, flags);
1820                /* pcie-pci bridge already has a domain, uses it */
1821                if (found) {
1822                        domain = found;
1823                        goto found_domain;
1824                }
1825        }
1826
1827        domain = alloc_domain();
1828        if (!domain)
1829                goto error;
1830
1831        /* Allocate new domain for the device */
1832        drhd = dmar_find_matched_drhd_unit(pdev);
1833        if (!drhd) {
1834                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1835                        pci_name(pdev));
1836                return NULL;
1837        }
1838        iommu = drhd->iommu;
1839
1840        ret = iommu_attach_domain(domain, iommu);
1841        if (ret) {
1842                free_domain_mem(domain);
1843                goto error;
1844        }
1845
1846        if (domain_init(domain, gaw)) {
1847                domain_exit(domain);
1848                goto error;
1849        }
1850
1851        /* register pcie-to-pci device */
1852        if (dev_tmp) {
1853                info = alloc_devinfo_mem();
1854                if (!info) {
1855                        domain_exit(domain);
1856                        goto error;
1857                }
1858                info->segment = segment;
1859                info->bus = bus;
1860                info->devfn = devfn;
1861                info->dev = NULL;
1862                info->domain = domain;
1863                /* This domain is shared by devices under p2p bridge */
1864                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1865
1866                /* pcie-to-pci bridge already has a domain, uses it */
1867                found = NULL;
1868                spin_lock_irqsave(&device_domain_lock, flags);
1869                list_for_each_entry(tmp, &device_domain_list, global) {
1870                        if (tmp->segment == segment &&
1871                            tmp->bus == bus && tmp->devfn == devfn) {
1872                                found = tmp->domain;
1873                                break;
1874                        }
1875                }
1876                if (found) {
1877                        spin_unlock_irqrestore(&device_domain_lock, flags);
1878                        free_devinfo_mem(info);
1879                        domain_exit(domain);
1880                        domain = found;
1881                } else {
1882                        list_add(&info->link, &domain->devices);
1883                        list_add(&info->global, &device_domain_list);
1884                        spin_unlock_irqrestore(&device_domain_lock, flags);
1885                }
1886        }
1887
1888found_domain:
1889        info = alloc_devinfo_mem();
1890        if (!info)
1891                goto error;
1892        info->segment = segment;
1893        info->bus = pdev->bus->number;
1894        info->devfn = pdev->devfn;
1895        info->dev = pdev;
1896        info->domain = domain;
1897        spin_lock_irqsave(&device_domain_lock, flags);
1898        /* somebody is fast */
1899        found = find_domain(pdev);
1900        if (found != NULL) {
1901                spin_unlock_irqrestore(&device_domain_lock, flags);
1902                if (found != domain) {
1903                        domain_exit(domain);
1904                        domain = found;
1905                }
1906                free_devinfo_mem(info);
1907                return domain;
1908        }
1909        list_add(&info->link, &domain->devices);
1910        list_add(&info->global, &device_domain_list);
1911        pdev->dev.archdata.iommu = info;
1912        spin_unlock_irqrestore(&device_domain_lock, flags);
1913        return domain;
1914error:
1915        /* recheck it here, maybe others set it */
1916        return find_domain(pdev);
1917}
1918
1919static int iommu_identity_mapping;
1920#define IDENTMAP_ALL            1
1921#define IDENTMAP_GFX            2
1922#define IDENTMAP_AZALIA         4
1923
1924static int iommu_domain_identity_map(struct dmar_domain *domain,
1925                                     unsigned long long start,
1926                                     unsigned long long end)
1927{
1928        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1929        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1930
1931        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1932                          dma_to_mm_pfn(last_vpfn))) {
1933                printk(KERN_ERR "IOMMU: reserve iova failed\n");
1934                return -ENOMEM;
1935        }
1936
1937        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1938                 start, end, domain->id);
1939        /*
1940         * RMRR range might have overlap with physical memory range,
1941         * clear it first
1942         */
1943        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1944
1945        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1946                                  last_vpfn - first_vpfn + 1,
1947                                  DMA_PTE_READ|DMA_PTE_WRITE);
1948}
1949
1950static int iommu_prepare_identity_map(struct pci_dev *pdev,
1951                                      unsigned long long start,
1952                                      unsigned long long end)
1953{
1954        struct dmar_domain *domain;
1955        int ret;
1956
1957        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1958        if (!domain)
1959                return -ENOMEM;
1960
1961        /* For _hardware_ passthrough, don't bother. But for software
1962           passthrough, we do it anyway -- it may indicate a memory
1963           range which is reserved in E820, so which didn't get set
1964           up to start with in si_domain */
1965        if (domain == si_domain && hw_pass_through) {
1966                printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1967                       pci_name(pdev), start, end);
1968                return 0;
1969        }
1970
1971        printk(KERN_INFO
1972               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1973               pci_name(pdev), start, end);
1974        
1975        if (end < start) {
1976                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1977                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1978                        dmi_get_system_info(DMI_BIOS_VENDOR),
1979                        dmi_get_system_info(DMI_BIOS_VERSION),
1980                     dmi_get_system_info(DMI_PRODUCT_VERSION));
1981                ret = -EIO;
1982                goto error;
1983        }
1984
1985        if (end >> agaw_to_width(domain->agaw)) {
1986                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1987                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1988                     agaw_to_width(domain->agaw),
1989                     dmi_get_system_info(DMI_BIOS_VENDOR),
1990                     dmi_get_system_info(DMI_BIOS_VERSION),
1991                     dmi_get_system_info(DMI_PRODUCT_VERSION));
1992                ret = -EIO;
1993                goto error;
1994        }
1995
1996        ret = iommu_domain_identity_map(domain, start, end);
1997        if (ret)
1998                goto error;
1999
2000        /* context entry init */
2001        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2002        if (ret)
2003                goto error;
2004
2005        return 0;
2006
2007 error:
2008        domain_exit(domain);
2009        return ret;
2010}
2011
2012static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2013        struct pci_dev *pdev)
2014{
2015        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2016                return 0;
2017        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2018                rmrr->end_address + 1);
2019}
2020
2021#ifdef CONFIG_DMAR_FLOPPY_WA
2022static inline void iommu_prepare_isa(void)
2023{
2024        struct pci_dev *pdev;
2025        int ret;
2026
2027        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2028        if (!pdev)
2029                return;
2030
2031        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2032        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2033
2034        if (ret)
2035                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2036                       "floppy might not work\n");
2037
2038}
2039#else
2040static inline void iommu_prepare_isa(void)
2041{
2042        return;
2043}
2044#endif /* !CONFIG_DMAR_FLPY_WA */
2045
2046static int md_domain_init(struct dmar_domain *domain, int guest_width);
2047
2048static int __init si_domain_work_fn(unsigned long start_pfn,
2049                                    unsigned long end_pfn, void *datax)
2050{
2051        int *ret = datax;
2052
2053        *ret = iommu_domain_identity_map(si_domain,
2054                                         (uint64_t)start_pfn << PAGE_SHIFT,
2055                                         (uint64_t)end_pfn << PAGE_SHIFT);
2056        return *ret;
2057
2058}
2059
2060static int __init si_domain_init(int hw)
2061{
2062        struct dmar_drhd_unit *drhd;
2063        struct intel_iommu *iommu;
2064        int nid, ret = 0;
2065
2066        si_domain = alloc_domain();
2067        if (!si_domain)
2068                return -EFAULT;
2069
2070        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2071
2072        for_each_active_iommu(iommu, drhd) {
2073                ret = iommu_attach_domain(si_domain, iommu);
2074                if (ret) {
2075                        domain_exit(si_domain);
2076                        return -EFAULT;
2077                }
2078        }
2079
2080        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2081                domain_exit(si_domain);
2082                return -EFAULT;
2083        }
2084
2085        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2086
2087        if (hw)
2088                return 0;
2089
2090        for_each_online_node(nid) {
2091                work_with_active_regions(nid, si_domain_work_fn, &ret);
2092                if (ret)
2093                        return ret;
2094        }
2095
2096        return 0;
2097}
2098
2099static void domain_remove_one_dev_info(struct dmar_domain *domain,
2100                                          struct pci_dev *pdev);
2101static int identity_mapping(struct pci_dev *pdev)
2102{
2103        struct device_domain_info *info;
2104
2105        if (likely(!iommu_identity_mapping))
2106                return 0;
2107
2108
2109        list_for_each_entry(info, &si_domain->devices, link)
2110                if (info->dev == pdev)
2111                        return 1;
2112        return 0;
2113}
2114
2115static int domain_add_dev_info(struct dmar_domain *domain,
2116                               struct pci_dev *pdev,
2117                               int translation)
2118{
2119        struct device_domain_info *info;
2120        unsigned long flags;
2121        int ret;
2122
2123        info = alloc_devinfo_mem();
2124        if (!info)
2125                return -ENOMEM;
2126
2127        ret = domain_context_mapping(domain, pdev, translation);
2128        if (ret) {
2129                free_devinfo_mem(info);
2130                return ret;
2131        }
2132
2133        info->segment = pci_domain_nr(pdev->bus);
2134        info->bus = pdev->bus->number;
2135        info->devfn = pdev->devfn;
2136        info->dev = pdev;
2137        info->domain = domain;
2138
2139        spin_lock_irqsave(&device_domain_lock, flags);
2140        list_add(&info->link, &domain->devices);
2141        list_add(&info->global, &device_domain_list);
2142        pdev->dev.archdata.iommu = info;
2143        spin_unlock_irqrestore(&device_domain_lock, flags);
2144
2145        return 0;
2146}
2147
2148static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2149{
2150        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2151                return 1;
2152
2153        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2154                return 1;
2155
2156        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2157                return 0;
2158
2159        /*
2160         * We want to start off with all devices in the 1:1 domain, and
2161         * take them out later if we find they can't access all of memory.
2162         *
2163         * However, we can't do this for PCI devices behind bridges,
2164         * because all PCI devices behind the same bridge will end up
2165         * with the same source-id on their transactions.
2166         *
2167         * Practically speaking, we can't change things around for these
2168         * devices at run-time, because we can't be sure there'll be no
2169         * DMA transactions in flight for any of their siblings.
2170         * 
2171         * So PCI devices (unless they're on the root bus) as well as
2172         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2173         * the 1:1 domain, just in _case_ one of their siblings turns out
2174         * not to be able to map all of memory.
2175         */
2176        if (!pci_is_pcie(pdev)) {
2177                if (!pci_is_root_bus(pdev->bus))
2178                        return 0;
2179                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2180                        return 0;
2181        } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2182                return 0;
2183
2184        /* 
2185         * At boot time, we don't yet know if devices will be 64-bit capable.
2186         * Assume that they will -- if they turn out not to be, then we can 
2187         * take them out of the 1:1 domain later.
2188         */
2189        if (!startup)
2190                return pdev->dma_mask > DMA_BIT_MASK(32);
2191
2192        return 1;
2193}
2194
2195static int __init iommu_prepare_static_identity_mapping(int hw)
2196{
2197        struct pci_dev *pdev = NULL;
2198        int ret;
2199
2200        ret = si_domain_init(hw);
2201        if (ret)
2202                return -EFAULT;
2203
2204        for_each_pci_dev(pdev) {
2205                if (iommu_should_identity_map(pdev, 1)) {
2206                        printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2207                               hw ? "hardware" : "software", pci_name(pdev));
2208
2209                        ret = domain_add_dev_info(si_domain, pdev,
2210                                                     hw ? CONTEXT_TT_PASS_THROUGH :
2211                                                     CONTEXT_TT_MULTI_LEVEL);
2212                        if (ret)
2213                                return ret;
2214                }
2215        }
2216
2217        return 0;
2218}
2219
2220static int __init init_dmars(int force_on)
2221{
2222        struct dmar_drhd_unit *drhd;
2223        struct dmar_rmrr_unit *rmrr;
2224        struct pci_dev *pdev;
2225        struct intel_iommu *iommu;
2226        int i, ret;
2227
2228        /*
2229         * for each drhd
2230         *    allocate root
2231         *    initialize and program root entry to not present
2232         * endfor
2233         */
2234        for_each_drhd_unit(drhd) {
2235                g_num_of_iommus++;
2236                /*
2237                 * lock not needed as this is only incremented in the single
2238                 * threaded kernel __init code path all other access are read
2239                 * only
2240                 */
2241        }
2242
2243        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2244                        GFP_KERNEL);
2245        if (!g_iommus) {
2246                printk(KERN_ERR "Allocating global iommu array failed\n");
2247                ret = -ENOMEM;
2248                goto error;
2249        }
2250
2251        deferred_flush = kzalloc(g_num_of_iommus *
2252                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2253        if (!deferred_flush) {
2254                ret = -ENOMEM;
2255                goto error;
2256        }
2257
2258        for_each_drhd_unit(drhd) {
2259                if (drhd->ignored)
2260                        continue;
2261
2262                iommu = drhd->iommu;
2263                g_iommus[iommu->seq_id] = iommu;
2264
2265                ret = iommu_init_domains(iommu);
2266                if (ret)
2267                        goto error;
2268
2269                /*
2270                 * TBD:
2271                 * we could share the same root & context tables
2272                 * among all IOMMU's. Need to Split it later.
2273                 */
2274                ret = iommu_alloc_root_entry(iommu);
2275                if (ret) {
2276                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2277                        goto error;
2278                }
2279                if (!ecap_pass_through(iommu->ecap))
2280                        hw_pass_through = 0;
2281        }
2282
2283        /*
2284         * Start from the sane iommu hardware state.
2285         */
2286        for_each_drhd_unit(drhd) {
2287                if (drhd->ignored)
2288                        continue;
2289
2290                iommu = drhd->iommu;
2291
2292                /*
2293                 * If the queued invalidation is already initialized by us
2294                 * (for example, while enabling interrupt-remapping) then
2295                 * we got the things already rolling from a sane state.
2296                 */
2297                if (iommu->qi)
2298                        continue;
2299
2300                /*
2301                 * Clear any previous faults.
2302                 */
2303                dmar_fault(-1, iommu);
2304                /*
2305                 * Disable queued invalidation if supported and already enabled
2306                 * before OS handover.
2307                 */
2308                dmar_disable_qi(iommu);
2309        }
2310
2311        for_each_drhd_unit(drhd) {
2312                if (drhd->ignored)
2313                        continue;
2314
2315                iommu = drhd->iommu;
2316
2317                if (dmar_enable_qi(iommu)) {
2318                        /*
2319                         * Queued Invalidate not enabled, use Register Based
2320                         * Invalidate
2321                         */
2322                        iommu->flush.flush_context = __iommu_flush_context;
2323                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2324                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2325                               "invalidation\n",
2326                                iommu->seq_id,
2327                               (unsigned long long)drhd->reg_base_addr);
2328                } else {
2329                        iommu->flush.flush_context = qi_flush_context;
2330                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2331                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2332                               "invalidation\n",
2333                                iommu->seq_id,
2334                               (unsigned long long)drhd->reg_base_addr);
2335                }
2336        }
2337
2338        if (iommu_pass_through)
2339                iommu_identity_mapping |= IDENTMAP_ALL;
2340
2341#ifdef CONFIG_DMAR_BROKEN_GFX_WA
2342        iommu_identity_mapping |= IDENTMAP_GFX;
2343#endif
2344
2345        check_tylersburg_isoch();
2346
2347        /*
2348         * If pass through is not set or not enabled, setup context entries for
2349         * identity mappings for rmrr, gfx, and isa and may fall back to static
2350         * identity mapping if iommu_identity_mapping is set.
2351         */
2352        if (iommu_identity_mapping) {
2353                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2354                if (ret) {
2355                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2356                        goto error;
2357                }
2358        }
2359        /*
2360         * For each rmrr
2361         *   for each dev attached to rmrr
2362         *   do
2363         *     locate drhd for dev, alloc domain for dev
2364         *     allocate free domain
2365         *     allocate page table entries for rmrr
2366         *     if context not allocated for bus
2367         *           allocate and init context
2368         *           set present in root table for this bus
2369         *     init context with domain, translation etc
2370         *    endfor
2371         * endfor
2372         */
2373        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2374        for_each_rmrr_units(rmrr) {
2375                for (i = 0; i < rmrr->devices_cnt; i++) {
2376                        pdev = rmrr->devices[i];
2377                        /*
2378                         * some BIOS lists non-exist devices in DMAR
2379                         * table.
2380                         */
2381                        if (!pdev)
2382                                continue;
2383                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2384                        if (ret)
2385                                printk(KERN_ERR
2386                                       "IOMMU: mapping reserved region failed\n");
2387                }
2388        }
2389
2390        iommu_prepare_isa();
2391
2392        /*
2393         * for each drhd
2394         *   enable fault log
2395         *   global invalidate context cache
2396         *   global invalidate iotlb
2397         *   enable translation
2398         */
2399        for_each_drhd_unit(drhd) {
2400                if (drhd->ignored) {
2401                        /*
2402                         * we always have to disable PMRs or DMA may fail on
2403                         * this device
2404                         */
2405                        if (force_on)
2406                                iommu_disable_protect_mem_regions(drhd->iommu);
2407                        continue;
2408                }
2409                iommu = drhd->iommu;
2410
2411                iommu_flush_write_buffer(iommu);
2412
2413                ret = dmar_set_interrupt(iommu);
2414                if (ret)
2415                        goto error;
2416
2417                iommu_set_root_entry(iommu);
2418
2419                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2420                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2421
2422                ret = iommu_enable_translation(iommu);
2423                if (ret)
2424                        goto error;
2425
2426                iommu_disable_protect_mem_regions(iommu);
2427        }
2428
2429        return 0;
2430error:
2431        for_each_drhd_unit(drhd) {
2432                if (drhd->ignored)
2433                        continue;
2434                iommu = drhd->iommu;
2435                free_iommu(iommu);
2436        }
2437        kfree(g_iommus);
2438        return ret;
2439}
2440
2441/* This takes a number of _MM_ pages, not VTD pages */
2442static struct iova *intel_alloc_iova(struct device *dev,
2443                                     struct dmar_domain *domain,
2444                                     unsigned long nrpages, uint64_t dma_mask)
2445{
2446        struct pci_dev *pdev = to_pci_dev(dev);
2447        struct iova *iova = NULL;
2448
2449        /* Restrict dma_mask to the width that the iommu can handle */
2450        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2451
2452        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2453                /*
2454                 * First try to allocate an io virtual address in
2455                 * DMA_BIT_MASK(32) and if that fails then try allocating
2456                 * from higher range
2457                 */
2458                iova = alloc_iova(&domain->iovad, nrpages,
2459                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2460                if (iova)
2461                        return iova;
2462        }
2463        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2464        if (unlikely(!iova)) {
2465                printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2466                       nrpages, pci_name(pdev));
2467                return NULL;
2468        }
2469
2470        return iova;
2471}
2472
2473static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2474{
2475        struct dmar_domain *domain;
2476        int ret;
2477
2478        domain = get_domain_for_dev(pdev,
2479                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2480        if (!domain) {
2481                printk(KERN_ERR
2482                        "Allocating domain for %s failed", pci_name(pdev));
2483                return NULL;
2484        }
2485
2486        /* make sure context mapping is ok */
2487        if (unlikely(!domain_context_mapped(pdev))) {
2488                ret = domain_context_mapping(domain, pdev,
2489                                             CONTEXT_TT_MULTI_LEVEL);
2490                if (ret) {
2491                        printk(KERN_ERR
2492                                "Domain context map for %s failed",
2493                                pci_name(pdev));
2494                        return NULL;
2495                }
2496        }
2497
2498        return domain;
2499}
2500
2501static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2502{
2503        struct device_domain_info *info;
2504
2505        /* No lock here, assumes no domain exit in normal case */
2506        info = dev->dev.archdata.iommu;
2507        if (likely(info))
2508                return info->domain;
2509
2510        return __get_valid_domain_for_dev(dev);
2511}
2512
2513static int iommu_dummy(struct pci_dev *pdev)
2514{
2515        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2516}
2517
2518/* Check if the pdev needs to go through non-identity map and unmap process.*/
2519static int iommu_no_mapping(struct device *dev)
2520{
2521        struct pci_dev *pdev;
2522        int found;
2523
2524        if (unlikely(dev->bus != &pci_bus_type))
2525                return 1;
2526
2527        pdev = to_pci_dev(dev);
2528        if (iommu_dummy(pdev))
2529                return 1;
2530
2531        if (!iommu_identity_mapping)
2532                return 0;
2533
2534        found = identity_mapping(pdev);
2535        if (found) {
2536                if (iommu_should_identity_map(pdev, 0))
2537                        return 1;
2538                else {
2539                        /*
2540                         * 32 bit DMA is removed from si_domain and fall back
2541                         * to non-identity mapping.
2542                         */
2543                        domain_remove_one_dev_info(si_domain, pdev);
2544                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2545                               pci_name(pdev));
2546                        return 0;
2547                }
2548        } else {
2549                /*
2550                 * In case of a detached 64 bit DMA device from vm, the device
2551                 * is put into si_domain for identity mapping.
2552                 */
2553                if (iommu_should_identity_map(pdev, 0)) {
2554                        int ret;
2555                        ret = domain_add_dev_info(si_domain, pdev,
2556                                                  hw_pass_through ?
2557                                                  CONTEXT_TT_PASS_THROUGH :
2558                                                  CONTEXT_TT_MULTI_LEVEL);
2559                        if (!ret) {
2560                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2561                                       pci_name(pdev));
2562                                return 1;
2563                        }
2564                }
2565        }
2566
2567        return 0;
2568}
2569
2570static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2571                                     size_t size, int dir, u64 dma_mask)
2572{
2573        struct pci_dev *pdev = to_pci_dev(hwdev);
2574        struct dmar_domain *domain;
2575        phys_addr_t start_paddr;
2576        struct iova *iova;
2577        int prot = 0;
2578        int ret;
2579        struct intel_iommu *iommu;
2580        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2581
2582        BUG_ON(dir == DMA_NONE);
2583
2584        if (iommu_no_mapping(hwdev))
2585                return paddr;
2586
2587        domain = get_valid_domain_for_dev(pdev);
2588        if (!domain)
2589                return 0;
2590
2591        iommu = domain_get_iommu(domain);
2592        size = aligned_nrpages(paddr, size);
2593
2594        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2595                                pdev->dma_mask);
2596        if (!iova)
2597                goto error;
2598
2599        /*
2600         * Check if DMAR supports zero-length reads on write only
2601         * mappings..
2602         */
2603        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2604                        !cap_zlr(iommu->cap))
2605                prot |= DMA_PTE_READ;
2606        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2607                prot |= DMA_PTE_WRITE;
2608        /*
2609         * paddr - (paddr + size) might be partial page, we should map the whole
2610         * page.  Note: if two part of one page are separately mapped, we
2611         * might have two guest_addr mapping to the same host paddr, but this
2612         * is not a big problem
2613         */
2614        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2615                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2616        if (ret)
2617                goto error;
2618
2619        /* it's a non-present to present mapping. Only flush if caching mode */
2620        if (cap_caching_mode(iommu->cap))
2621                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2622        else
2623                iommu_flush_write_buffer(iommu);
2624
2625        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2626        start_paddr += paddr & ~PAGE_MASK;
2627        return start_paddr;
2628
2629error:
2630        if (iova)
2631                __free_iova(&domain->iovad, iova);
2632        printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2633                pci_name(pdev), size, (unsigned long long)paddr, dir);
2634        return 0;
2635}
2636
2637static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2638                                 unsigned long offset, size_t size,
2639                                 enum dma_data_direction dir,
2640                                 struct dma_attrs *attrs)
2641{
2642        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2643                                  dir, to_pci_dev(dev)->dma_mask);
2644}
2645
2646static void flush_unmaps(void)
2647{
2648        int i, j;
2649
2650        timer_on = 0;
2651
2652        /* just flush them all */
2653        for (i = 0; i < g_num_of_iommus; i++) {
2654                struct intel_iommu *iommu = g_iommus[i];
2655                if (!iommu)
2656                        continue;
2657
2658                if (!deferred_flush[i].next)
2659                        continue;
2660
2661                /* In caching mode, global flushes turn emulation expensive */
2662                if (!cap_caching_mode(iommu->cap))
2663                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2664                                         DMA_TLB_GLOBAL_FLUSH);
2665                for (j = 0; j < deferred_flush[i].next; j++) {
2666                        unsigned long mask;
2667                        struct iova *iova = deferred_flush[i].iova[j];
2668                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2669
2670                        /* On real hardware multiple invalidations are expensive */
2671                        if (cap_caching_mode(iommu->cap))
2672                                iommu_flush_iotlb_psi(iommu, domain->id,
2673                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2674                        else {
2675                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2676                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2677                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2678                        }
2679                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2680                }
2681                deferred_flush[i].next = 0;
2682        }
2683
2684        list_size = 0;
2685}
2686
2687static void flush_unmaps_timeout(unsigned long data)
2688{
2689        unsigned long flags;
2690
2691        spin_lock_irqsave(&async_umap_flush_lock, flags);
2692        flush_unmaps();
2693        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2694}
2695
2696static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2697{
2698        unsigned long flags;
2699        int next, iommu_id;
2700        struct intel_iommu *iommu;
2701
2702        spin_lock_irqsave(&async_umap_flush_lock, flags);
2703        if (list_size == HIGH_WATER_MARK)
2704                flush_unmaps();
2705
2706        iommu = domain_get_iommu(dom);
2707        iommu_id = iommu->seq_id;
2708
2709        next = deferred_flush[iommu_id].next;
2710        deferred_flush[iommu_id].domain[next] = dom;
2711        deferred_flush[iommu_id].iova[next] = iova;
2712        deferred_flush[iommu_id].next++;
2713
2714        if (!timer_on) {
2715                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2716                timer_on = 1;
2717        }
2718        list_size++;
2719        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2720}
2721
2722static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2723                             size_t size, enum dma_data_direction dir,
2724                             struct dma_attrs *attrs)
2725{
2726        struct pci_dev *pdev = to_pci_dev(dev);
2727        struct dmar_domain *domain;
2728        unsigned long start_pfn, last_pfn;
2729        struct iova *iova;
2730        struct intel_iommu *iommu;
2731
2732        if (iommu_no_mapping(dev))
2733                return;
2734
2735        domain = find_domain(pdev);
2736        BUG_ON(!domain);
2737
2738        iommu = domain_get_iommu(domain);
2739
2740        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2741        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2742                      (unsigned long long)dev_addr))
2743                return;
2744
2745        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2746        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2747
2748        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2749                 pci_name(pdev), start_pfn, last_pfn);
2750
2751        /*  clear the whole page */
2752        dma_pte_clear_range(domain, start_pfn, last_pfn);
2753
2754        /* free page tables */
2755        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2756
2757        if (intel_iommu_strict) {
2758                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2759                                      last_pfn - start_pfn + 1, 0);
2760                /* free iova */
2761                __free_iova(&domain->iovad, iova);
2762        } else {
2763                add_unmap(domain, iova);
2764                /*
2765                 * queue up the release of the unmap to save the 1/6th of the
2766                 * cpu used up by the iotlb flush operation...
2767                 */
2768        }
2769}
2770
2771static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2772                                  dma_addr_t *dma_handle, gfp_t flags)
2773{
2774        void *vaddr;
2775        int order;
2776
2777        size = PAGE_ALIGN(size);
2778        order = get_order(size);
2779
2780        if (!iommu_no_mapping(hwdev))
2781                flags &= ~(GFP_DMA | GFP_DMA32);
2782        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2783                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2784                        flags |= GFP_DMA;
2785                else
2786                        flags |= GFP_DMA32;
2787        }
2788
2789        vaddr = (void *)__get_free_pages(flags, order);
2790        if (!vaddr)
2791                return NULL;
2792        memset(vaddr, 0, size);
2793
2794        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2795                                         DMA_BIDIRECTIONAL,
2796                                         hwdev->coherent_dma_mask);
2797        if (*dma_handle)
2798                return vaddr;
2799        free_pages((unsigned long)vaddr, order);
2800        return NULL;
2801}
2802
2803static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2804                                dma_addr_t dma_handle)
2805{
2806        int order;
2807
2808        size = PAGE_ALIGN(size);
2809        order = get_order(size);
2810
2811        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2812        free_pages((unsigned long)vaddr, order);
2813}
2814
2815static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2816                           int nelems, enum dma_data_direction dir,
2817                           struct dma_attrs *attrs)
2818{
2819        struct pci_dev *pdev = to_pci_dev(hwdev);
2820        struct dmar_domain *domain;
2821        unsigned long start_pfn, last_pfn;
2822        struct iova *iova;
2823        struct intel_iommu *iommu;
2824
2825        if (iommu_no_mapping(hwdev))
2826                return;
2827
2828        domain = find_domain(pdev);
2829        BUG_ON(!domain);
2830
2831        iommu = domain_get_iommu(domain);
2832
2833        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2834        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2835                      (unsigned long long)sglist[0].dma_address))
2836                return;
2837
2838        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2839        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2840
2841        /*  clear the whole page */
2842        dma_pte_clear_range(domain, start_pfn, last_pfn);
2843
2844        /* free page tables */
2845        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2846
2847        if (intel_iommu_strict) {
2848                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2849                                      last_pfn - start_pfn + 1, 0);
2850                /* free iova */
2851                __free_iova(&domain->iovad, iova);
2852        } else {
2853                add_unmap(domain, iova);
2854                /*
2855                 * queue up the release of the unmap to save the 1/6th of the
2856                 * cpu used up by the iotlb flush operation...
2857                 */
2858        }
2859}
2860
2861static int intel_nontranslate_map_sg(struct device *hddev,
2862        struct scatterlist *sglist, int nelems, int dir)
2863{
2864        int i;
2865        struct scatterlist *sg;
2866
2867        for_each_sg(sglist, sg, nelems, i) {
2868                BUG_ON(!sg_page(sg));
2869                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2870                sg->dma_length = sg->length;
2871        }
2872        return nelems;
2873}
2874
2875static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2876                        enum dma_data_direction dir, struct dma_attrs *attrs)
2877{
2878        int i;
2879        struct pci_dev *pdev = to_pci_dev(hwdev);
2880        struct dmar_domain *domain;
2881        size_t size = 0;
2882        int prot = 0;
2883        struct iova *iova = NULL;
2884        int ret;
2885        struct scatterlist *sg;
2886        unsigned long start_vpfn;
2887        struct intel_iommu *iommu;
2888
2889        BUG_ON(dir == DMA_NONE);
2890        if (iommu_no_mapping(hwdev))
2891                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2892
2893        domain = get_valid_domain_for_dev(pdev);
2894        if (!domain)
2895                return 0;
2896
2897        iommu = domain_get_iommu(domain);
2898
2899        for_each_sg(sglist, sg, nelems, i)
2900                size += aligned_nrpages(sg->offset, sg->length);
2901
2902        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2903                                pdev->dma_mask);
2904        if (!iova) {
2905                sglist->dma_length = 0;
2906                return 0;
2907        }
2908
2909        /*
2910         * Check if DMAR supports zero-length reads on write only
2911         * mappings..
2912         */
2913        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2914                        !cap_zlr(iommu->cap))
2915                prot |= DMA_PTE_READ;
2916        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2917                prot |= DMA_PTE_WRITE;
2918
2919        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2920
2921        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2922        if (unlikely(ret)) {
2923                /*  clear the page */
2924                dma_pte_clear_range(domain, start_vpfn,
2925                                    start_vpfn + size - 1);
2926                /* free page tables */
2927                dma_pte_free_pagetable(domain, start_vpfn,
2928                                       start_vpfn + size - 1);
2929                /* free iova */
2930                __free_iova(&domain->iovad, iova);
2931                return 0;
2932        }
2933
2934        /* it's a non-present to present mapping. Only flush if caching mode */
2935        if (cap_caching_mode(iommu->cap))
2936                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2937        else
2938                iommu_flush_write_buffer(iommu);
2939
2940        return nelems;
2941}
2942
2943static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2944{
2945        return !dma_addr;
2946}
2947
2948struct dma_map_ops intel_dma_ops = {
2949        .alloc_coherent = intel_alloc_coherent,
2950        .free_coherent = intel_free_coherent,
2951        .map_sg = intel_map_sg,
2952        .unmap_sg = intel_unmap_sg,
2953        .map_page = intel_map_page,
2954        .unmap_page = intel_unmap_page,
2955        .mapping_error = intel_mapping_error,
2956};
2957
2958static inline int iommu_domain_cache_init(void)
2959{
2960        int ret = 0;
2961
2962        iommu_domain_cache = kmem_cache_create("iommu_domain",
2963                                         sizeof(struct dmar_domain),
2964                                         0,
2965                                         SLAB_HWCACHE_ALIGN,
2966
2967                                         NULL);
2968        if (!iommu_domain_cache) {
2969                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2970                ret = -ENOMEM;
2971        }
2972
2973        return ret;
2974}
2975
2976static inline int iommu_devinfo_cache_init(void)
2977{
2978        int ret = 0;
2979
2980        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2981                                         sizeof(struct device_domain_info),
2982                                         0,
2983                                         SLAB_HWCACHE_ALIGN,
2984                                         NULL);
2985        if (!iommu_devinfo_cache) {
2986                printk(KERN_ERR "Couldn't create devinfo cache\n");
2987                ret = -ENOMEM;
2988        }
2989
2990        return ret;
2991}
2992
2993static inline int iommu_iova_cache_init(void)
2994{
2995        int ret = 0;
2996
2997        iommu_iova_cache = kmem_cache_create("iommu_iova",
2998                                         sizeof(struct iova),
2999                                         0,
3000                                         SLAB_HWCACHE_ALIGN,
3001                                         NULL);
3002        if (!iommu_iova_cache) {
3003                printk(KERN_ERR "Couldn't create iova cache\n");
3004                ret = -ENOMEM;
3005        }
3006
3007        return ret;
3008}
3009
3010static int __init iommu_init_mempool(void)
3011{
3012        int ret;
3013        ret = iommu_iova_cache_init();
3014        if (ret)
3015                return ret;
3016
3017        ret = iommu_domain_cache_init();
3018        if (ret)
3019                goto domain_error;
3020
3021        ret = iommu_devinfo_cache_init();
3022        if (!ret)
3023                return ret;
3024
3025        kmem_cache_destroy(iommu_domain_cache);
3026domain_error:
3027        kmem_cache_destroy(iommu_iova_cache);
3028
3029        return -ENOMEM;
3030}
3031
3032static void __init iommu_exit_mempool(void)
3033{
3034        kmem_cache_destroy(iommu_devinfo_cache);
3035        kmem_cache_destroy(iommu_domain_cache);
3036        kmem_cache_destroy(iommu_iova_cache);
3037
3038}
3039
3040static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3041{
3042        struct dmar_drhd_unit *drhd;
3043        u32 vtbar;
3044        int rc;
3045
3046        /* We know that this device on this chipset has its own IOMMU.
3047         * If we find it under a different IOMMU, then the BIOS is lying
3048         * to us. Hope that the IOMMU for this device is actually
3049         * disabled, and it needs no translation...
3050         */
3051        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3052        if (rc) {
3053                /* "can't" happen */
3054                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3055                return;
3056        }
3057        vtbar &= 0xffff0000;
3058
3059        /* we know that the this iommu should be at offset 0xa000 from vtbar */
3060        drhd = dmar_find_matched_drhd_unit(pdev);
3061        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3062                            TAINT_FIRMWARE_WORKAROUND,
3063                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3064                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3065}
3066DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3067
3068static void __init init_no_remapping_devices(void)
3069{
3070        struct dmar_drhd_unit *drhd;
3071
3072        for_each_drhd_unit(drhd) {
3073                if (!drhd->include_all) {
3074                        int i;
3075                        for (i = 0; i < drhd->devices_cnt; i++)
3076                                if (drhd->devices[i] != NULL)
3077                                        break;
3078                        /* ignore DMAR unit if no pci devices exist */
3079                        if (i == drhd->devices_cnt)
3080                                drhd->ignored = 1;
3081                }
3082        }
3083
3084        if (dmar_map_gfx)
3085                return;
3086
3087        for_each_drhd_unit(drhd) {
3088                int i;
3089                if (drhd->ignored || drhd->include_all)
3090                        continue;
3091
3092                for (i = 0; i < drhd->devices_cnt; i++)
3093                        if (drhd->devices[i] &&
3094                                !IS_GFX_DEVICE(drhd->devices[i]))
3095                                break;
3096
3097                if (i < drhd->devices_cnt)
3098                        continue;
3099
3100                /* bypass IOMMU if it is just for gfx devices */
3101                drhd->ignored = 1;
3102                for (i = 0; i < drhd->devices_cnt; i++) {
3103                        if (!drhd->devices[i])
3104                                continue;
3105                        drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3106                }
3107        }
3108}
3109
3110#ifdef CONFIG_SUSPEND
3111static int init_iommu_hw(void)
3112{
3113        struct dmar_drhd_unit *drhd;
3114        struct intel_iommu *iommu = NULL;
3115
3116        for_each_active_iommu(iommu, drhd)
3117                if (iommu->qi)
3118                        dmar_reenable_qi(iommu);
3119
3120        for_each_active_iommu(iommu, drhd) {
3121                iommu_flush_write_buffer(iommu);
3122
3123                iommu_set_root_entry(iommu);
3124
3125                iommu->flush.flush_context(iommu, 0, 0, 0,
3126                                           DMA_CCMD_GLOBAL_INVL);
3127                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3128                                         DMA_TLB_GLOBAL_FLUSH);
3129                iommu_enable_translation(iommu);
3130                iommu_disable_protect_mem_regions(iommu);
3131        }
3132
3133        return 0;
3134}
3135
3136static void iommu_flush_all(void)
3137{
3138        struct dmar_drhd_unit *drhd;
3139        struct intel_iommu *iommu;
3140
3141        for_each_active_iommu(iommu, drhd) {
3142                iommu->flush.flush_context(iommu, 0, 0, 0,
3143                                           DMA_CCMD_GLOBAL_INVL);
3144                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3145                                         DMA_TLB_GLOBAL_FLUSH);
3146        }
3147}
3148
3149static int iommu_suspend(void)
3150{
3151        struct dmar_drhd_unit *drhd;
3152        struct intel_iommu *iommu = NULL;
3153        unsigned long flag;
3154
3155        for_each_active_iommu(iommu, drhd) {
3156                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3157                                                 GFP_ATOMIC);
3158                if (!iommu->iommu_state)
3159                        goto nomem;
3160        }
3161
3162        iommu_flush_all();
3163
3164        for_each_active_iommu(iommu, drhd) {
3165                iommu_disable_translation(iommu);
3166
3167                spin_lock_irqsave(&iommu->register_lock, flag);
3168
3169                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3170                        readl(iommu->reg + DMAR_FECTL_REG);
3171                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3172                        readl(iommu->reg + DMAR_FEDATA_REG);
3173                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3174                        readl(iommu->reg + DMAR_FEADDR_REG);
3175                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3176                        readl(iommu->reg + DMAR_FEUADDR_REG);
3177
3178                spin_unlock_irqrestore(&iommu->register_lock, flag);
3179        }
3180        return 0;
3181
3182nomem:
3183        for_each_active_iommu(iommu, drhd)
3184                kfree(iommu->iommu_state);
3185
3186        return -ENOMEM;
3187}
3188
3189static void iommu_resume(void)
3190{
3191        struct dmar_drhd_unit *drhd;
3192        struct intel_iommu *iommu = NULL;
3193        unsigned long flag;
3194
3195        if (init_iommu_hw()) {
3196                WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3197                return;
3198        }
3199
3200        for_each_active_iommu(iommu, drhd) {
3201
3202                spin_lock_irqsave(&iommu->register_lock, flag);
3203
3204                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3205                        iommu->reg + DMAR_FECTL_REG);
3206                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3207                        iommu->reg + DMAR_FEDATA_REG);
3208                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3209                        iommu->reg + DMAR_FEADDR_REG);
3210                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3211                        iommu->reg + DMAR_FEUADDR_REG);
3212
3213                spin_unlock_irqrestore(&iommu->register_lock, flag);
3214        }
3215
3216        for_each_active_iommu(iommu, drhd)
3217                kfree(iommu->iommu_state);
3218}
3219
3220static struct syscore_ops iommu_syscore_ops = {
3221        .resume         = iommu_resume,
3222        .suspend        = iommu_suspend,
3223};
3224
3225static void __init init_iommu_pm_ops(void)
3226{
3227        register_syscore_ops(&iommu_syscore_ops);
3228}
3229
3230#else
3231static inline int init_iommu_pm_ops(void) { }
3232#endif  /* CONFIG_PM */
3233
3234/*
3235 * Here we only respond to action of unbound device from driver.
3236 *
3237 * Added device is not attached to its DMAR domain here yet. That will happen
3238 * when mapping the device to iova.
3239 */
3240static int device_notifier(struct notifier_block *nb,
3241                                  unsigned long action, void *data)
3242{
3243        struct device *dev = data;
3244        struct pci_dev *pdev = to_pci_dev(dev);
3245        struct dmar_domain *domain;
3246
3247        if (iommu_no_mapping(dev))
3248                return 0;
3249
3250        domain = find_domain(pdev);
3251        if (!domain)
3252                return 0;
3253
3254        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3255                domain_remove_one_dev_info(domain, pdev);
3256
3257                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3258                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3259                    list_empty(&domain->devices))
3260                        domain_exit(domain);
3261        }
3262
3263        return 0;
3264}
3265
3266static struct notifier_block device_nb = {
3267        .notifier_call = device_notifier,
3268};
3269
3270int __init intel_iommu_init(void)
3271{
3272        int ret = 0;
3273        int force_on = 0;
3274
3275        /* VT-d is required for a TXT/tboot launch, so enforce that */
3276        force_on = tboot_force_iommu();
3277
3278        if (dmar_table_init()) {
3279                if (force_on)
3280                        panic("tboot: Failed to initialize DMAR table\n");
3281                return  -ENODEV;
3282        }
3283
3284        if (dmar_dev_scope_init()) {
3285                if (force_on)
3286                        panic("tboot: Failed to initialize DMAR device scope\n");
3287                return  -ENODEV;
3288        }
3289
3290        /*
3291         * Check the need for DMA-remapping initialization now.
3292         * Above initialization will also be used by Interrupt-remapping.
3293         */
3294        if (no_iommu || dmar_disabled)
3295                return -ENODEV;
3296
3297        if (iommu_init_mempool()) {
3298                if (force_on)
3299                        panic("tboot: Failed to initialize iommu memory\n");
3300                return  -ENODEV;
3301        }
3302
3303        if (dmar_init_reserved_ranges()) {
3304                if (force_on)
3305                        panic("tboot: Failed to reserve iommu ranges\n");
3306                return  -ENODEV;
3307        }
3308
3309        init_no_remapping_devices();
3310
3311        ret = init_dmars(force_on);
3312        if (ret) {
3313                if (force_on)
3314                        panic("tboot: Failed to initialize DMARs\n");
3315                printk(KERN_ERR "IOMMU: dmar init failed\n");
3316                put_iova_domain(&reserved_iova_list);
3317                iommu_exit_mempool();
3318                return ret;
3319        }
3320        printk(KERN_INFO
3321        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3322
3323        init_timer(&unmap_timer);
3324#ifdef CONFIG_SWIOTLB
3325        swiotlb = 0;
3326#endif
3327        dma_ops = &intel_dma_ops;
3328
3329        init_iommu_pm_ops();
3330
3331        register_iommu(&intel_iommu_ops);
3332
3333        bus_register_notifier(&pci_bus_type, &device_nb);
3334
3335        return 0;
3336}
3337
3338static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3339                                           struct pci_dev *pdev)
3340{
3341        struct pci_dev *tmp, *parent;
3342
3343        if (!iommu || !pdev)
3344                return;
3345
3346        /* dependent device detach */
3347        tmp = pci_find_upstream_pcie_bridge(pdev);
3348        /* Secondary interface's bus number and devfn 0 */
3349        if (tmp) {
3350                parent = pdev->bus->self;
3351                while (parent != tmp) {
3352                        iommu_detach_dev(iommu, parent->bus->number,
3353                                         parent->devfn);
3354                        parent = parent->bus->self;
3355                }
3356                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3357                        iommu_detach_dev(iommu,
3358                                tmp->subordinate->number, 0);
3359                else /* this is a legacy PCI bridge */
3360                        iommu_detach_dev(iommu, tmp->bus->number,
3361                                         tmp->devfn);
3362        }
3363}
3364
3365static void domain_remove_one_dev_info(struct dmar_domain *domain,
3366                                          struct pci_dev *pdev)
3367{
3368        struct device_domain_info *info;
3369        struct intel_iommu *iommu;
3370        unsigned long flags;
3371        int found = 0;
3372        struct list_head *entry, *tmp;
3373
3374        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3375                                pdev->devfn);
3376        if (!iommu)
3377                return;
3378
3379        spin_lock_irqsave(&device_domain_lock, flags);
3380        list_for_each_safe(entry, tmp, &domain->devices) {
3381                info = list_entry(entry, struct device_domain_info, link);
3382                /* No need to compare PCI domain; it has to be the same */
3383                if (info->bus == pdev->bus->number &&
3384                    info->devfn == pdev->devfn) {
3385                        list_del(&info->link);
3386                        list_del(&info->global);
3387                        if (info->dev)
3388                                info->dev->dev.archdata.iommu = NULL;
3389                        spin_unlock_irqrestore(&device_domain_lock, flags);
3390
3391                        iommu_disable_dev_iotlb(info);
3392                        iommu_detach_dev(iommu, info->bus, info->devfn);
3393                        iommu_detach_dependent_devices(iommu, pdev);
3394                        free_devinfo_mem(info);
3395
3396                        spin_lock_irqsave(&device_domain_lock, flags);
3397
3398                        if (found)
3399                                break;
3400                        else
3401                                continue;
3402                }
3403
3404                /* if there is no other devices under the same iommu
3405                 * owned by this domain, clear this iommu in iommu_bmp
3406                 * update iommu count and coherency
3407                 */
3408                if (iommu == device_to_iommu(info->segment, info->bus,
3409                                            info->devfn))
3410                        found = 1;
3411        }
3412
3413        if (found == 0) {
3414                unsigned long tmp_flags;
3415                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3416                clear_bit(iommu->seq_id, &domain->iommu_bmp);
3417                domain->iommu_count--;
3418                domain_update_iommu_cap(domain);
3419                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3420
3421                spin_lock_irqsave(&iommu->lock, tmp_flags);
3422                clear_bit(domain->id, iommu->domain_ids);
3423                iommu->domains[domain->id] = NULL;
3424                spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3425        }
3426
3427        spin_unlock_irqrestore(&device_domain_lock, flags);
3428}
3429
3430static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3431{
3432        struct device_domain_info *info;
3433        struct intel_iommu *iommu;
3434        unsigned long flags1, flags2;
3435
3436        spin_lock_irqsave(&device_domain_lock, flags1);
3437        while (!list_empty(&domain->devices)) {
3438                info = list_entry(domain->devices.next,
3439                        struct device_domain_info, link);
3440                list_del(&info->link);
3441                list_del(&info->global);
3442                if (info->dev)
3443                        info->dev->dev.archdata.iommu = NULL;
3444
3445                spin_unlock_irqrestore(&device_domain_lock, flags1);
3446
3447                iommu_disable_dev_iotlb(info);
3448                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3449                iommu_detach_dev(iommu, info->bus, info->devfn);
3450                iommu_detach_dependent_devices(iommu, info->dev);
3451
3452                /* clear this iommu in iommu_bmp, update iommu count
3453                 * and capabilities
3454                 */
3455                spin_lock_irqsave(&domain->iommu_lock, flags2);
3456                if (test_and_clear_bit(iommu->seq_id,
3457                                       &domain->iommu_bmp)) {
3458                        domain->iommu_count--;
3459                        domain_update_iommu_cap(domain);
3460                }
3461                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3462
3463                free_devinfo_mem(info);
3464                spin_lock_irqsave(&device_domain_lock, flags1);
3465        }
3466        spin_unlock_irqrestore(&device_domain_lock, flags1);
3467}
3468
3469/* domain id for virtual machine, it won't be set in context */
3470static unsigned long vm_domid;
3471
3472static struct dmar_domain *iommu_alloc_vm_domain(void)
3473{
3474        struct dmar_domain *domain;
3475
3476        domain = alloc_domain_mem();
3477        if (!domain)
3478                return NULL;
3479
3480        domain->id = vm_domid++;
3481        domain->nid = -1;
3482        memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3483        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3484
3485        return domain;
3486}
3487
3488static int md_domain_init(struct dmar_domain *domain, int guest_width)
3489{
3490        int adjust_width;
3491
3492        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3493        spin_lock_init(&domain->iommu_lock);
3494
3495        domain_reserve_special_ranges(domain);
3496
3497        /* calculate AGAW */
3498        domain->gaw = guest_width;
3499        adjust_width = guestwidth_to_adjustwidth(guest_width);
3500        domain->agaw = width_to_agaw(adjust_width);
3501
3502        INIT_LIST_HEAD(&domain->devices);
3503
3504        domain->iommu_count = 0;
3505        domain->iommu_coherency = 0;
3506        domain->iommu_snooping = 0;
3507        domain->max_addr = 0;
3508        domain->nid = -1;
3509
3510        /* always allocate the top pgd */
3511        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3512        if (!domain->pgd)
3513                return -ENOMEM;
3514        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3515        return 0;
3516}
3517
3518static void iommu_free_vm_domain(struct dmar_domain *domain)
3519{
3520        unsigned long flags;
3521        struct dmar_drhd_unit *drhd;
3522        struct intel_iommu *iommu;
3523        unsigned long i;
3524        unsigned long ndomains;
3525
3526        for_each_drhd_unit(drhd) {
3527                if (drhd->ignored)
3528                        continue;
3529                iommu = drhd->iommu;
3530
3531                ndomains = cap_ndoms(iommu->cap);
3532                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3533                        if (iommu->domains[i] == domain) {
3534                                spin_lock_irqsave(&iommu->lock, flags);
3535                                clear_bit(i, iommu->domain_ids);
3536                                iommu->domains[i] = NULL;
3537                                spin_unlock_irqrestore(&iommu->lock, flags);
3538                                break;
3539                        }
3540                }
3541        }
3542}
3543
3544static void vm_domain_exit(struct dmar_domain *domain)
3545{
3546        /* Domain 0 is reserved, so dont process it */
3547        if (!domain)
3548                return;
3549
3550        vm_domain_remove_all_dev_info(domain);
3551        /* destroy iovas */
3552        put_iova_domain(&domain->iovad);
3553
3554        /* clear ptes */
3555        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3556
3557        /* free page tables */
3558        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3559
3560        iommu_free_vm_domain(domain);
3561        free_domain_mem(domain);
3562}
3563
3564static int intel_iommu_domain_init(struct iommu_domain *domain)
3565{
3566        struct dmar_domain *dmar_domain;
3567
3568        dmar_domain = iommu_alloc_vm_domain();
3569        if (!dmar_domain) {
3570                printk(KERN_ERR
3571                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3572                return -ENOMEM;
3573        }
3574        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3575                printk(KERN_ERR
3576                        "intel_iommu_domain_init() failed\n");
3577                vm_domain_exit(dmar_domain);
3578                return -ENOMEM;
3579        }
3580        domain->priv = dmar_domain;
3581
3582        return 0;
3583}
3584
3585static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3586{
3587        struct dmar_domain *dmar_domain = domain->priv;
3588
3589        domain->priv = NULL;
3590        vm_domain_exit(dmar_domain);
3591}
3592
3593static int intel_iommu_attach_device(struct iommu_domain *domain,
3594                                     struct device *dev)
3595{
3596        struct dmar_domain *dmar_domain = domain->priv;
3597        struct pci_dev *pdev = to_pci_dev(dev);
3598        struct intel_iommu *iommu;
3599        int addr_width;
3600
3601        /* normally pdev is not mapped */
3602        if (unlikely(domain_context_mapped(pdev))) {
3603                struct dmar_domain *old_domain;
3604
3605                old_domain = find_domain(pdev);
3606                if (old_domain) {
3607                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3608                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3609                                domain_remove_one_dev_info(old_domain, pdev);
3610                        else
3611                                domain_remove_dev_info(old_domain);
3612                }
3613        }
3614
3615        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3616                                pdev->devfn);
3617        if (!iommu)
3618                return -ENODEV;
3619
3620        /* check if this iommu agaw is sufficient for max mapped address */
3621        addr_width = agaw_to_width(iommu->agaw);
3622        if (addr_width > cap_mgaw(iommu->cap))
3623                addr_width = cap_mgaw(iommu->cap);
3624
3625        if (dmar_domain->max_addr > (1LL << addr_width)) {
3626                printk(KERN_ERR "%s: iommu width (%d) is not "
3627                       "sufficient for the mapped address (%llx)\n",
3628                       __func__, addr_width, dmar_domain->max_addr);
3629                return -EFAULT;
3630        }
3631        dmar_domain->gaw = addr_width;
3632
3633        /*
3634         * Knock out extra levels of page tables if necessary
3635         */
3636        while (iommu->agaw < dmar_domain->agaw) {
3637                struct dma_pte *pte;
3638
3639                pte = dmar_domain->pgd;
3640                if (dma_pte_present(pte)) {
3641                        dmar_domain->pgd = (struct dma_pte *)
3642                                phys_to_virt(dma_pte_addr(pte));
3643                        free_pgtable_page(pte);
3644                }
3645                dmar_domain->agaw--;
3646        }
3647
3648        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3649}
3650
3651static void intel_iommu_detach_device(struct iommu_domain *domain,
3652                                      struct device *dev)
3653{
3654        struct dmar_domain *dmar_domain = domain->priv;
3655        struct pci_dev *pdev = to_pci_dev(dev);
3656
3657        domain_remove_one_dev_info(dmar_domain, pdev);
3658}
3659
3660static int intel_iommu_map(struct iommu_domain *domain,
3661                           unsigned long iova, phys_addr_t hpa,
3662                           int gfp_order, int iommu_prot)
3663{
3664        struct dmar_domain *dmar_domain = domain->priv;
3665        u64 max_addr;
3666        int prot = 0;
3667        size_t size;
3668        int ret;
3669
3670        if (iommu_prot & IOMMU_READ)
3671                prot |= DMA_PTE_READ;
3672        if (iommu_prot & IOMMU_WRITE)
3673                prot |= DMA_PTE_WRITE;
3674        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3675                prot |= DMA_PTE_SNP;
3676
3677        size     = PAGE_SIZE << gfp_order;
3678        max_addr = iova + size;
3679        if (dmar_domain->max_addr < max_addr) {
3680                u64 end;
3681
3682                /* check if minimum agaw is sufficient for mapped address */
3683                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3684                if (end < max_addr) {
3685                        printk(KERN_ERR "%s: iommu width (%d) is not "
3686                               "sufficient for the mapped address (%llx)\n",
3687                               __func__, dmar_domain->gaw, max_addr);
3688                        return -EFAULT;
3689                }
3690                dmar_domain->max_addr = max_addr;
3691        }
3692        /* Round up size to next multiple of PAGE_SIZE, if it and
3693           the low bits of hpa would take us onto the next page */
3694        size = aligned_nrpages(hpa, size);
3695        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3696                                 hpa >> VTD_PAGE_SHIFT, size, prot);
3697        return ret;
3698}
3699
3700static int intel_iommu_unmap(struct iommu_domain *domain,
3701                             unsigned long iova, int gfp_order)
3702{
3703        struct dmar_domain *dmar_domain = domain->priv;
3704        size_t size = PAGE_SIZE << gfp_order;
3705
3706        dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3707                            (iova + size - 1) >> VTD_PAGE_SHIFT);
3708
3709        if (dmar_domain->max_addr == iova + size)
3710                dmar_domain->max_addr = iova;
3711
3712        return gfp_order;
3713}
3714
3715static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3716                                            unsigned long iova)
3717{
3718        struct dmar_domain *dmar_domain = domain->priv;
3719        struct dma_pte *pte;
3720        u64 phys = 0;
3721
3722        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3723        if (pte)
3724                phys = dma_pte_addr(pte);
3725
3726        return phys;
3727}
3728
3729static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3730                                      unsigned long cap)
3731{
3732        struct dmar_domain *dmar_domain = domain->priv;
3733
3734        if (cap == IOMMU_CAP_CACHE_COHERENCY)
3735                return dmar_domain->iommu_snooping;
3736        if (cap == IOMMU_CAP_INTR_REMAP)
3737                return intr_remapping_enabled;
3738
3739        return 0;
3740}
3741
3742static struct iommu_ops intel_iommu_ops = {
3743        .domain_init    = intel_iommu_domain_init,
3744        .domain_destroy = intel_iommu_domain_destroy,
3745        .attach_dev     = intel_iommu_attach_device,
3746        .detach_dev     = intel_iommu_detach_device,
3747        .map            = intel_iommu_map,
3748        .unmap          = intel_iommu_unmap,
3749        .iova_to_phys   = intel_iommu_iova_to_phys,
3750        .domain_has_cap = intel_iommu_domain_has_cap,
3751};
3752
3753static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3754{
3755        /*
3756         * Mobile 4 Series Chipset neglects to set RWBF capability,
3757         * but needs it:
3758         */
3759        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3760        rwbf_quirk = 1;
3761
3762        /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3763        if (dev->revision == 0x07) {
3764                printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3765                dmar_map_gfx = 0;
3766        }
3767}
3768
3769DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3770
3771#define GGC 0x52
3772#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
3773#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
3774#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
3775#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
3776#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
3777#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
3778#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
3779#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
3780
3781static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3782{
3783        unsigned short ggc;
3784
3785        if (pci_read_config_word(dev, GGC, &ggc))
3786                return;
3787
3788        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3789                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3790                dmar_map_gfx = 0;
3791        }
3792}
3793DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3794DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3795DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3796DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3797
3798/* On Tylersburg chipsets, some BIOSes have been known to enable the
3799   ISOCH DMAR unit for the Azalia sound device, but not give it any
3800   TLB entries, which causes it to deadlock. Check for that.  We do
3801   this in a function called from init_dmars(), instead of in a PCI
3802   quirk, because we don't want to print the obnoxious "BIOS broken"
3803   message if VT-d is actually disabled.
3804*/
3805static void __init check_tylersburg_isoch(void)
3806{
3807        struct pci_dev *pdev;
3808        uint32_t vtisochctrl;
3809
3810        /* If there's no Azalia in the system anyway, forget it. */
3811        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3812        if (!pdev)
3813                return;
3814        pci_dev_put(pdev);
3815
3816        /* System Management Registers. Might be hidden, in which case
3817           we can't do the sanity check. But that's OK, because the
3818           known-broken BIOSes _don't_ actually hide it, so far. */
3819        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3820        if (!pdev)
3821                return;
3822
3823        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3824                pci_dev_put(pdev);
3825                return;
3826        }
3827
3828        pci_dev_put(pdev);
3829
3830        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3831        if (vtisochctrl & 1)
3832                return;
3833
3834        /* Drop all bits other than the number of TLB entries */
3835        vtisochctrl &= 0x1c;
3836
3837        /* If we have the recommended number of TLB entries (16), fine. */
3838        if (vtisochctrl == 0x10)
3839                return;
3840
3841        /* Zero TLB entries? You get to ride the short bus to school. */
3842        if (!vtisochctrl) {
3843                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3844                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3845                     dmi_get_system_info(DMI_BIOS_VENDOR),
3846                     dmi_get_system_info(DMI_BIOS_VERSION),
3847                     dmi_get_system_info(DMI_PRODUCT_VERSION));
3848                iommu_identity_mapping |= IDENTMAP_AZALIA;
3849                return;
3850        }
3851        
3852        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3853               vtisochctrl);
3854}
3855