linux/drivers/pci/intel-iommu.c
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/slab.h>
  28#include <linux/irq.h>
  29#include <linux/interrupt.h>
  30#include <linux/spinlock.h>
  31#include <linux/pci.h>
  32#include <linux/dmar.h>
  33#include <linux/dma-mapping.h>
  34#include <linux/mempool.h>
  35#include <linux/timer.h>
  36#include <linux/iova.h>
  37#include <linux/iommu.h>
  38#include <linux/intel-iommu.h>
  39#include <linux/sysdev.h>
  40#include <linux/tboot.h>
  41#include <linux/dmi.h>
  42#include <asm/cacheflush.h>
  43#include <asm/iommu.h>
  44#include "pci.h"
  45
  46#define ROOT_SIZE               VTD_PAGE_SIZE
  47#define CONTEXT_SIZE            VTD_PAGE_SIZE
  48
  49#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  50#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  51#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  52
  53#define IOAPIC_RANGE_START      (0xfee00000)
  54#define IOAPIC_RANGE_END        (0xfeefffff)
  55#define IOVA_START_ADDR         (0x1000)
  56
  57#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  58
  59#define MAX_AGAW_WIDTH 64
  60
  61#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  62#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  63
  64/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  65   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  66#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  67                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  68#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  69
  70#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  71#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  72#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
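    /*
     * For illustration, assuming 4KiB pages (VTD_PAGE_SHIFT == PAGE_SHIFT == 12):
     *
     *   __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1          (highest 4KiB DMA pfn)
     *   DOMAIN_MAX_ADDR(48)  = ((1ULL << 36) - 1) << 12  (just under 256TiB)
     *   DMA_32BIT_PFN        = IOVA_PFN(0xffffffff)      = 0xfffff
     *
     * On 32-bit kernels DOMAIN_MAX_PFN() additionally clamps the value to what
     * fits in an unsigned long, as the comment above explains.
     */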
  73
  74
  75/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
  76   are never going to work. */
  77static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
  78{
  79        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
  80}
  81
  82static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
  83{
  84        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
  85}
  86static inline unsigned long page_to_dma_pfn(struct page *pg)
  87{
  88        return mm_to_dma_pfn(page_to_pfn(pg));
  89}
  90static inline unsigned long virt_to_dma_pfn(void *p)
  91{
  92        return page_to_dma_pfn(virt_to_page(p));
  93}
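    /*
     * A quick sanity check of the conversions above: with 4KiB kernel pages
     * (PAGE_SHIFT == VTD_PAGE_SHIFT == 12) the shift is zero and DMA pfns are
     * identical to MM pfns.  On a kernel built with larger pages, e.g. 64KiB
     * (PAGE_SHIFT == 16), one MM pfn corresponds to 16 DMA pfns, so
     * mm_to_dma_pfn() multiplies by 16 and dma_to_mm_pfn() divides by 16.
     */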
  94
  95/* global iommu list, set NULL for ignored DMAR units */
  96static struct intel_iommu **g_iommus;
  97
  98static void __init check_tylersburg_isoch(void);
  99static int rwbf_quirk;
 100
 101/*
 102 * 0: Present
 103 * 1-11: Reserved
 104 * 12-63: Context Ptr (12 - (haw-1))
 105 * 64-127: Reserved
 106 */
 107struct root_entry {
 108        u64     val;
 109        u64     rsvd1;
 110};
 111#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 112static inline bool root_present(struct root_entry *root)
 113{
 114        return (root->val & 1);
 115}
 116static inline void set_root_present(struct root_entry *root)
 117{
 118        root->val |= 1;
 119}
 120static inline void set_root_value(struct root_entry *root, unsigned long value)
 121{
 122        root->val |= value & VTD_PAGE_MASK;
 123}
 124
 125static inline struct context_entry *
 126get_context_addr_from_root(struct root_entry *root)
 127{
 128        return (struct context_entry *)
 129                (root_present(root)?phys_to_virt(
 130                root->val & VTD_PAGE_MASK) :
 131                NULL);
 132}
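    /*
     * Layout reminder: the root table is one 4KiB page holding
     * ROOT_ENTRY_NR == 256 root entries, one per PCI bus number.  Each present
     * root entry points to a 4KiB context table of 256 context entries,
     * indexed by devfn (see device_to_context_entry() below).
     */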
 133
 134/*
 135 * low 64 bits:
 136 * 0: present
 137 * 1: fault processing disable
 138 * 2-3: translation type
 139 * 12-63: address space root
 140 * high 64 bits:
 141 * 0-2: address width
 142 * 3-6: aval
 143 * 8-23: domain id
 144 */
 145struct context_entry {
 146        u64 lo;
 147        u64 hi;
 148};
 149
 150static inline bool context_present(struct context_entry *context)
 151{
 152        return (context->lo & 1);
 153}
 154static inline void context_set_present(struct context_entry *context)
 155{
 156        context->lo |= 1;
 157}
 158
 159static inline void context_set_fault_enable(struct context_entry *context)
 160{
 161        context->lo &= (((u64)-1) << 2) | 1;
 162}
 163
 164static inline void context_set_translation_type(struct context_entry *context,
 165                                                unsigned long value)
 166{
 167        context->lo &= (((u64)-1) << 4) | 3;
 168        context->lo |= (value & 3) << 2;
 169}
 170
 171static inline void context_set_address_root(struct context_entry *context,
 172                                            unsigned long value)
 173{
 174        context->lo |= value & VTD_PAGE_MASK;
 175}
 176
 177static inline void context_set_address_width(struct context_entry *context,
 178                                             unsigned long value)
 179{
 180        context->hi |= value & 7;
 181}
 182
 183static inline void context_set_domain_id(struct context_entry *context,
 184                                         unsigned long value)
 185{
 186        context->hi |= (value & ((1 << 16) - 1)) << 8;
 187}
 188
 189static inline void context_clear_entry(struct context_entry *context)
 190{
 191        context->lo = 0;
 192        context->hi = 0;
 193}
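    /*
     * A rough sketch of how these helpers are combined when a device is
     * attached to a domain (the real sequence is in
     * domain_context_mapping_one() further down):
     *
     *      context_set_domain_id(context, id);
     *      context_set_address_root(context, virt_to_phys(pgd));
     *      context_set_address_width(context, iommu->agaw);
     *      context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
     *      context_set_fault_enable(context);
     *      context_set_present(context);
     */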
 194
 195/*
 196 * 0: readable
 197 * 1: writable
 198 * 2-6: reserved
 199 * 7: super page
 200 * 8-10: available
 201 * 11: snoop behavior
 202 * 12-63: Host physical address
 203 */
 204struct dma_pte {
 205        u64 val;
 206};
 207
 208static inline void dma_clear_pte(struct dma_pte *pte)
 209{
 210        pte->val = 0;
 211}
 212
 213static inline void dma_set_pte_readable(struct dma_pte *pte)
 214{
 215        pte->val |= DMA_PTE_READ;
 216}
 217
 218static inline void dma_set_pte_writable(struct dma_pte *pte)
 219{
 220        pte->val |= DMA_PTE_WRITE;
 221}
 222
 223static inline void dma_set_pte_snp(struct dma_pte *pte)
 224{
 225        pte->val |= DMA_PTE_SNP;
 226}
 227
 228static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 229{
 230        pte->val = (pte->val & ~3) | (prot & 3);
 231}
 232
 233static inline u64 dma_pte_addr(struct dma_pte *pte)
 234{
 235#ifdef CONFIG_64BIT
 236        return pte->val & VTD_PAGE_MASK;
 237#else
 238        /* Must have a full atomic 64-bit read */
 239        return  __cmpxchg64(pte, 0ULL, 0ULL) & VTD_PAGE_MASK;
 240#endif
 241}
 242
 243static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 244{
 245        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 246}
 247
 248static inline bool dma_pte_present(struct dma_pte *pte)
 249{
 250        return (pte->val & 3) != 0;
 251}
 252
 253static inline int first_pte_in_page(struct dma_pte *pte)
 254{
 255        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 256}
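    /*
     * Per the layout comment above, a leaf PTE for a read/write mapping of
     * host pfn P is built up roughly as dma_set_pte_readable(pte),
     * dma_set_pte_writable(pte) and dma_set_pte_pfn(pte, P), which leaves
     * pte->val == (P << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE.
     * first_pte_in_page() is true for the first of the 512 PTEs sharing one
     * 4KiB page-table page; the clearing/freeing loops below use it to flush
     * the CPU cache one table page at a time.
     */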
 257
 258/*
 259 * This domain is a static identity mapping domain.
 260 *      1. This domain creates a static 1:1 mapping to all usable memory.
 261 *      2. It maps to each iommu if successful.
 262 *      3. Each iommu maps to this domain if successful.
 263 */
 264static struct dmar_domain *si_domain;
 265static int hw_pass_through = 1;
 266
 267/* devices under the same p2p bridge are owned in one domain */
 268#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 269
 270/* The domain represents a virtual machine; more than one device
 271 * across iommus may be owned in one domain, e.g. kvm guest.
 272 */
 273#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 274
 275/* si_domain contains multiple devices */
 276#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 277
 278struct dmar_domain {
 279        int     id;                     /* domain id */
 280        unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
 281
 282        struct list_head devices;       /* all devices' list */
 283        struct iova_domain iovad;       /* iova's that belong to this domain */
 284
 285        struct dma_pte  *pgd;           /* virtual address */
 286        int             gaw;            /* max guest address width */
 287
 288        /* adjusted guest address width, 0 is level 2 30-bit */
 289        int             agaw;
 290
 291        int             flags;          /* flags to find out type of domain */
 292
 293        int             iommu_coherency;/* indicate coherency of iommu access */
 294        int             iommu_snooping; /* indicate snooping control feature*/
 295        int             iommu_count;    /* reference count of iommu */
 296        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 297        u64             max_addr;       /* maximum mapped address */
 298};
 299
 300/* PCI domain-device relationship */
 301struct device_domain_info {
 302        struct list_head link;  /* link to domain siblings */
 303        struct list_head global; /* link to global list */
 304        int segment;            /* PCI domain */
 305        u8 bus;                 /* PCI bus number */
 306        u8 devfn;               /* PCI devfn number */
 307        struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
 308        struct intel_iommu *iommu; /* IOMMU used by this device */
 309        struct dmar_domain *domain; /* pointer to domain */
 310};
 311
 312static void flush_unmaps_timeout(unsigned long data);
 313
 314DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 315
 316#define HIGH_WATER_MARK 250
 317struct deferred_flush_tables {
 318        int next;
 319        struct iova *iova[HIGH_WATER_MARK];
 320        struct dmar_domain *domain[HIGH_WATER_MARK];
 321};
 322
 323static struct deferred_flush_tables *deferred_flush;
 324
 325/* bitmap for indexing intel_iommus */
 326static int g_num_of_iommus;
 327
 328static DEFINE_SPINLOCK(async_umap_flush_lock);
 329static LIST_HEAD(unmaps_to_do);
 330
 331static int timer_on;
 332static long list_size;
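    /*
     * Deferred-unmap bookkeeping (a rough sketch of how the pieces above fit
     * together): unless intel_iommu_strict is set, unmapped IOVAs are queued
     * in deferred_flush[] under async_umap_flush_lock instead of being
     * flushed immediately; unmap_timer later runs flush_unmaps_timeout() to
     * flush the IOTLBs and free the queued IOVAs in one batch, and the queue
     * is flushed early once it grows past HIGH_WATER_MARK entries.
     */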
 333
 334static void domain_remove_dev_info(struct dmar_domain *domain);
 335
 336#ifdef CONFIG_DMAR_DEFAULT_ON
 337int dmar_disabled = 0;
 338#else
 339int dmar_disabled = 1;
 340#endif /*CONFIG_DMAR_DEFAULT_ON*/
 341
 342static int __initdata dmar_map_gfx = 1;
 343static int dmar_forcedac;
 344static int intel_iommu_strict;
 345
 346#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 347static DEFINE_SPINLOCK(device_domain_lock);
 348static LIST_HEAD(device_domain_list);
 349
 350static struct iommu_ops intel_iommu_ops;
 351
 352static int __init intel_iommu_setup(char *str)
 353{
 354        if (!str)
 355                return -EINVAL;
 356        while (*str) {
 357                if (!strncmp(str, "on", 2)) {
 358                        dmar_disabled = 0;
 359                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 360                } else if (!strncmp(str, "off", 3)) {
 361                        dmar_disabled = 1;
 362                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 363                } else if (!strncmp(str, "igfx_off", 8)) {
 364                        dmar_map_gfx = 0;
 365                        printk(KERN_INFO
 366                                "Intel-IOMMU: disable GFX device mapping\n");
 367                } else if (!strncmp(str, "forcedac", 8)) {
 368                        printk(KERN_INFO
 369                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 370                        dmar_forcedac = 1;
 371                } else if (!strncmp(str, "strict", 6)) {
 372                        printk(KERN_INFO
 373                                "Intel-IOMMU: disable batched IOTLB flush\n");
 374                        intel_iommu_strict = 1;
 375                }
 376
 377                str += strcspn(str, ",");
 378                while (*str == ',')
 379                        str++;
 380        }
 381        return 0;
 382}
 383__setup("intel_iommu=", intel_iommu_setup);
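    /*
     * Example usage of the option parsed above; flags can be combined with
     * commas on the kernel command line:
     *
     *      intel_iommu=on                  force DMA remapping on
     *      intel_iommu=off                 disable it
     *      intel_iommu=on,strict           flush the IOTLB on every unmap
     *      intel_iommu=on,igfx_off         leave the graphics device unmapped
     *      intel_iommu=on,forcedac         force DAC (64-bit) DMA addressing
     */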
 384
 385static struct kmem_cache *iommu_domain_cache;
 386static struct kmem_cache *iommu_devinfo_cache;
 387static struct kmem_cache *iommu_iova_cache;
 388
 389static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
 390{
 391        unsigned int flags;
 392        void *vaddr;
 393
 394        /* trying to avoid low memory issues */
 395        flags = current->flags & PF_MEMALLOC;
 396        current->flags |= PF_MEMALLOC;
 397        vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
 398        current->flags &= (~PF_MEMALLOC | flags);
 399        return vaddr;
 400}
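    /*
     * Note on the PF_MEMALLOC dance above (alloc_pgtable_page() below uses the
     * same idiom): the old PF_MEMALLOC bit is saved in 'flags', the task is
     * temporarily allowed to dip into the emergency reserves for the
     * GFP_ATOMIC allocation, and the final mask clears PF_MEMALLOC again only
     * if it was not already set on entry.
     */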
 401
 402
 403static inline void *alloc_pgtable_page(void)
 404{
 405        unsigned int flags;
 406        void *vaddr;
 407
 408        /* trying to avoid low memory issues */
 409        flags = current->flags & PF_MEMALLOC;
 410        current->flags |= PF_MEMALLOC;
 411        vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
 412        current->flags &= (~PF_MEMALLOC | flags);
 413        return vaddr;
 414}
 415
 416static inline void free_pgtable_page(void *vaddr)
 417{
 418        free_page((unsigned long)vaddr);
 419}
 420
 421static inline void *alloc_domain_mem(void)
 422{
 423        return iommu_kmem_cache_alloc(iommu_domain_cache);
 424}
 425
 426static void free_domain_mem(void *vaddr)
 427{
 428        kmem_cache_free(iommu_domain_cache, vaddr);
 429}
 430
 431static inline void * alloc_devinfo_mem(void)
 432{
 433        return iommu_kmem_cache_alloc(iommu_devinfo_cache);
 434}
 435
 436static inline void free_devinfo_mem(void *vaddr)
 437{
 438        kmem_cache_free(iommu_devinfo_cache, vaddr);
 439}
 440
 441struct iova *alloc_iova_mem(void)
 442{
 443        return iommu_kmem_cache_alloc(iommu_iova_cache);
 444}
 445
 446void free_iova_mem(struct iova *iova)
 447{
 448        kmem_cache_free(iommu_iova_cache, iova);
 449}
 450
 451
 452static inline int width_to_agaw(int width);
 453
 454static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 455{
 456        unsigned long sagaw;
 457        int agaw = -1;
 458
 459        sagaw = cap_sagaw(iommu->cap);
 460        for (agaw = width_to_agaw(max_gaw);
 461             agaw >= 0; agaw--) {
 462                if (test_bit(agaw, &sagaw))
 463                        break;
 464        }
 465
 466        return agaw;
 467}
 468
 469/*
 470 * Calculate max SAGAW for each iommu.
 471 */
 472int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 473{
 474        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 475}
 476
 477/*
 478 * calculate agaw for each iommu.
 479 * "SAGAW" may be different across iommus; use a default agaw and fall
 480 * back to a smaller supported agaw for iommus that don't support it.
 481 */
 482int iommu_calculate_agaw(struct intel_iommu *iommu)
 483{
 484        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 485}
 486
 487/* This function only returns a single iommu in a domain */
 488static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 489{
 490        int iommu_id;
 491
 492        /* si_domain and vm domain should not get here. */
 493        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 494        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 495
 496        iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
 497        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 498                return NULL;
 499
 500        return g_iommus[iommu_id];
 501}
 502
 503static void domain_update_iommu_coherency(struct dmar_domain *domain)
 504{
 505        int i;
 506
 507        domain->iommu_coherency = 1;
 508
 509        i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
 510        for (; i < g_num_of_iommus; ) {
 511                if (!ecap_coherent(g_iommus[i]->ecap)) {
 512                        domain->iommu_coherency = 0;
 513                        break;
 514                }
 515                i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
 516        }
 517}
 518
 519static void domain_update_iommu_snooping(struct dmar_domain *domain)
 520{
 521        int i;
 522
 523        domain->iommu_snooping = 1;
 524
 525        i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
 526        for (; i < g_num_of_iommus; ) {
 527                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 528                        domain->iommu_snooping = 0;
 529                        break;
 530                }
 531                i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
 532        }
 533}
 534
 535/* Some capabilities may be different across iommus */
 536static void domain_update_iommu_cap(struct dmar_domain *domain)
 537{
 538        domain_update_iommu_coherency(domain);
 539        domain_update_iommu_snooping(domain);
 540}
 541
 542static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 543{
 544        struct dmar_drhd_unit *drhd = NULL;
 545        int i;
 546
 547        for_each_drhd_unit(drhd) {
 548                if (drhd->ignored)
 549                        continue;
 550                if (segment != drhd->segment)
 551                        continue;
 552
 553                for (i = 0; i < drhd->devices_cnt; i++) {
 554                        if (drhd->devices[i] &&
 555                            drhd->devices[i]->bus->number == bus &&
 556                            drhd->devices[i]->devfn == devfn)
 557                                return drhd->iommu;
 558                        if (drhd->devices[i] &&
 559                            drhd->devices[i]->subordinate &&
 560                            drhd->devices[i]->subordinate->number <= bus &&
 561                            drhd->devices[i]->subordinate->subordinate >= bus)
 562                                return drhd->iommu;
 563                }
 564
 565                if (drhd->include_all)
 566                        return drhd->iommu;
 567        }
 568
 569        return NULL;
 570}
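    /*
     * To restate the matching rules above: a device maps to a DRHD unit if it
     * is listed directly in the unit's device scope, if it sits on a bus
     * behind a listed bridge (the bridge's secondary..subordinate bus range
     * covers 'bus'), or if the unit is the catch-all (include_all) unit for
     * the segment.
     */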
 571
 572static void domain_flush_cache(struct dmar_domain *domain,
 573                               void *addr, int size)
 574{
 575        if (!domain->iommu_coherency)
 576                clflush_cache_range(addr, size);
 577}
 578
 579/* Gets context entry for a given bus and devfn */
 580static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 581                u8 bus, u8 devfn)
 582{
 583        struct root_entry *root;
 584        struct context_entry *context;
 585        unsigned long phy_addr;
 586        unsigned long flags;
 587
 588        spin_lock_irqsave(&iommu->lock, flags);
 589        root = &iommu->root_entry[bus];
 590        context = get_context_addr_from_root(root);
 591        if (!context) {
 592                context = (struct context_entry *)alloc_pgtable_page();
 593                if (!context) {
 594                        spin_unlock_irqrestore(&iommu->lock, flags);
 595                        return NULL;
 596                }
 597                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 598                phy_addr = virt_to_phys((void *)context);
 599                set_root_value(root, phy_addr);
 600                set_root_present(root);
 601                __iommu_flush_cache(iommu, root, sizeof(*root));
 602        }
 603        spin_unlock_irqrestore(&iommu->lock, flags);
 604        return &context[devfn];
 605}
 606
 607static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 608{
 609        struct root_entry *root;
 610        struct context_entry *context;
 611        int ret;
 612        unsigned long flags;
 613
 614        spin_lock_irqsave(&iommu->lock, flags);
 615        root = &iommu->root_entry[bus];
 616        context = get_context_addr_from_root(root);
 617        if (!context) {
 618                ret = 0;
 619                goto out;
 620        }
 621        ret = context_present(&context[devfn]);
 622out:
 623        spin_unlock_irqrestore(&iommu->lock, flags);
 624        return ret;
 625}
 626
 627static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 628{
 629        struct root_entry *root;
 630        struct context_entry *context;
 631        unsigned long flags;
 632
 633        spin_lock_irqsave(&iommu->lock, flags);
 634        root = &iommu->root_entry[bus];
 635        context = get_context_addr_from_root(root);
 636        if (context) {
 637                context_clear_entry(&context[devfn]);
 638                __iommu_flush_cache(iommu, &context[devfn], \
 639                        sizeof(*context));
 640        }
 641        spin_unlock_irqrestore(&iommu->lock, flags);
 642}
 643
 644static void free_context_table(struct intel_iommu *iommu)
 645{
 646        struct root_entry *root;
 647        int i;
 648        unsigned long flags;
 649        struct context_entry *context;
 650
 651        spin_lock_irqsave(&iommu->lock, flags);
 652        if (!iommu->root_entry) {
 653                goto out;
 654        }
 655        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 656                root = &iommu->root_entry[i];
 657                context = get_context_addr_from_root(root);
 658                if (context)
 659                        free_pgtable_page(context);
 660        }
 661        free_pgtable_page(iommu->root_entry);
 662        iommu->root_entry = NULL;
 663out:
 664        spin_unlock_irqrestore(&iommu->lock, flags);
 665}
 666
 667/* page table handling */
 668#define LEVEL_STRIDE            (9)
 669#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
 670
 671static inline int agaw_to_level(int agaw)
 672{
 673        return agaw + 2;
 674}
 675
 676static inline int agaw_to_width(int agaw)
 677{
 678        return 30 + agaw * LEVEL_STRIDE;
 679
 680}
 681
 682static inline int width_to_agaw(int width)
 683{
 684        return (width - 30) / LEVEL_STRIDE;
 685}
 686
 687static inline unsigned int level_to_offset_bits(int level)
 688{
 689        return (level - 1) * LEVEL_STRIDE;
 690}
 691
 692static inline int pfn_level_offset(unsigned long pfn, int level)
 693{
 694        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 695}
 696
 697static inline unsigned long level_mask(int level)
 698{
 699        return -1UL << level_to_offset_bits(level);
 700}
 701
 702static inline unsigned long level_size(int level)
 703{
 704        return 1UL << level_to_offset_bits(level);
 705}
 706
 707static inline unsigned long align_to_level(unsigned long pfn, int level)
 708{
 709        return (pfn + level_size(level) - 1) & level_mask(level);
 710}
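    /*
     * Geometry example for the helpers above, with 9 bits per level and 4KiB
     * VT-d pages: agaw 0/1/2 corresponds to a 30/39/48-bit address width and
     * a 2/3/4-level page table.  For a 48-bit domain (agaw == 2, 4 levels),
     * the index into the top table is
     * pfn_level_offset(pfn, 4) == (pfn >> 27) & 0x1ff, and the leaf index is
     * pfn_level_offset(pfn, 1) == pfn & 0x1ff.  level_size(2) is 512 pfns,
     * i.e. the 2MiB covered by one leaf table.
     */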
 711
 712static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 713                                      unsigned long pfn)
 714{
 715        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 716        struct dma_pte *parent, *pte = NULL;
 717        int level = agaw_to_level(domain->agaw);
 718        int offset;
 719
 720        BUG_ON(!domain->pgd);
 721        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 722        parent = domain->pgd;
 723
 724        while (level > 0) {
 725                void *tmp_page;
 726
 727                offset = pfn_level_offset(pfn, level);
 728                pte = &parent[offset];
 729                if (level == 1)
 730                        break;
 731
 732                if (!dma_pte_present(pte)) {
 733                        uint64_t pteval;
 734
 735                        tmp_page = alloc_pgtable_page();
 736
 737                        if (!tmp_page)
 738                                return NULL;
 739
 740                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 741                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 742                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 743                                /* Someone else set it while we were thinking; use theirs. */
 744                                free_pgtable_page(tmp_page);
 745                        } else {
 746                                dma_pte_addr(pte);
 747                                domain_flush_cache(domain, pte, sizeof(*pte));
 748                        }
 749                }
 750                parent = phys_to_virt(dma_pte_addr(pte));
 751                level--;
 752        }
 753
 754        return pte;
 755}
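    /*
     * In short: pfn_to_dma_pte() walks down to the level-1 (4KiB) PTE for
     * 'pfn', allocating intermediate table pages on the way.  The cmpxchg64()
     * above makes that population lock-free: if another CPU installs the same
     * intermediate page first, the local page is simply freed and the
     * winner's is used.  NULL is returned only when a page-table allocation
     * fails.
     */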
 756
 757/* return address's pte at specific level */
 758static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 759                                         unsigned long pfn,
 760                                         int level)
 761{
 762        struct dma_pte *parent, *pte = NULL;
 763        int total = agaw_to_level(domain->agaw);
 764        int offset;
 765
 766        parent = domain->pgd;
 767        while (level <= total) {
 768                offset = pfn_level_offset(pfn, total);
 769                pte = &parent[offset];
 770                if (level == total)
 771                        return pte;
 772
 773                if (!dma_pte_present(pte))
 774                        break;
 775                parent = phys_to_virt(dma_pte_addr(pte));
 776                total--;
 777        }
 778        return NULL;
 779}
 780
 781/* clear last level pte; a tlb flush should follow */
 782static void dma_pte_clear_range(struct dmar_domain *domain,
 783                                unsigned long start_pfn,
 784                                unsigned long last_pfn)
 785{
 786        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 787        struct dma_pte *first_pte, *pte;
 788
 789        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 790        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 791        BUG_ON(start_pfn > last_pfn);
 792
 793        /* we don't need lock here; nobody else touches the iova range */
 794        do {
 795                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
 796                if (!pte) {
 797                        start_pfn = align_to_level(start_pfn + 1, 2);
 798                        continue;
 799                }
 800                do { 
 801                        dma_clear_pte(pte);
 802                        start_pfn++;
 803                        pte++;
 804                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 805
 806                domain_flush_cache(domain, first_pte,
 807                                   (void *)pte - (void *)first_pte);
 808
 809        } while (start_pfn && start_pfn <= last_pfn);
 810}
 811
 812/* free page table pages. last level pte should already be cleared */
 813static void dma_pte_free_pagetable(struct dmar_domain *domain,
 814                                   unsigned long start_pfn,
 815                                   unsigned long last_pfn)
 816{
 817        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 818        struct dma_pte *first_pte, *pte;
 819        int total = agaw_to_level(domain->agaw);
 820        int level;
 821        unsigned long tmp;
 822
 823        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 824        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 825        BUG_ON(start_pfn > last_pfn);
 826
 827        /* We don't need lock here; nobody else touches the iova range */
 828        level = 2;
 829        while (level <= total) {
 830                tmp = align_to_level(start_pfn, level);
 831
 832                /* If we can't even clear one PTE at this level, we're done */
 833                if (tmp + level_size(level) - 1 > last_pfn)
 834                        return;
 835
 836                do {
 837                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
 838                        if (!pte) {
 839                                tmp = align_to_level(tmp + 1, level + 1);
 840                                continue;
 841                        }
 842                        do {
 843                                if (dma_pte_present(pte)) {
 844                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 845                                        dma_clear_pte(pte);
 846                                }
 847                                pte++;
 848                                tmp += level_size(level);
 849                        } while (!first_pte_in_page(pte) &&
 850                                 tmp + level_size(level) - 1 <= last_pfn);
 851
 852                        domain_flush_cache(domain, first_pte,
 853                                           (void *)pte - (void *)first_pte);
 854                        
 855                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 856                level++;
 857        }
 858        /* free pgd */
 859        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 860                free_pgtable_page(domain->pgd);
 861                domain->pgd = NULL;
 862        }
 863}
 864
 865/* iommu handling */
 866static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 867{
 868        struct root_entry *root;
 869        unsigned long flags;
 870
 871        root = (struct root_entry *)alloc_pgtable_page();
 872        if (!root)
 873                return -ENOMEM;
 874
 875        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 876
 877        spin_lock_irqsave(&iommu->lock, flags);
 878        iommu->root_entry = root;
 879        spin_unlock_irqrestore(&iommu->lock, flags);
 880
 881        return 0;
 882}
 883
 884static void iommu_set_root_entry(struct intel_iommu *iommu)
 885{
 886        void *addr;
 887        u32 sts;
 888        unsigned long flag;
 889
 890        addr = iommu->root_entry;
 891
 892        spin_lock_irqsave(&iommu->register_lock, flag);
 893        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 894
 895        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 896
 897        /* Make sure hardware completes it */
 898        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 899                      readl, (sts & DMA_GSTS_RTPS), sts);
 900
 901        spin_unlock_irqrestore(&iommu->register_lock, flag);
 902}
 903
 904static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 905{
 906        u32 val;
 907        unsigned long flag;
 908
 909        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 910                return;
 911
 912        spin_lock_irqsave(&iommu->register_lock, flag);
 913        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 914
 915        /* Make sure hardware completes it */
 916        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 917                      readl, (!(val & DMA_GSTS_WBFS)), val);
 918
 919        spin_unlock_irqrestore(&iommu->register_lock, flag);
 920}
 921
 922/* return value determines if we need a write buffer flush */
 923static void __iommu_flush_context(struct intel_iommu *iommu,
 924                                  u16 did, u16 source_id, u8 function_mask,
 925                                  u64 type)
 926{
 927        u64 val = 0;
 928        unsigned long flag;
 929
 930        switch (type) {
 931        case DMA_CCMD_GLOBAL_INVL:
 932                val = DMA_CCMD_GLOBAL_INVL;
 933                break;
 934        case DMA_CCMD_DOMAIN_INVL:
 935                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
 936                break;
 937        case DMA_CCMD_DEVICE_INVL:
 938                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
 939                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
 940                break;
 941        default:
 942                BUG();
 943        }
 944        val |= DMA_CCMD_ICC;
 945
 946        spin_lock_irqsave(&iommu->register_lock, flag);
 947        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
 948
 949        /* Make sure hardware completes it */
 950        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
 951                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
 952
 953        spin_unlock_irqrestore(&iommu->register_lock, flag);
 954}
 955
 956/* return value determines if we need a write buffer flush */
 957static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 958                                u64 addr, unsigned int size_order, u64 type)
 959{
 960        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
 961        u64 val = 0, val_iva = 0;
 962        unsigned long flag;
 963
 964        switch (type) {
 965        case DMA_TLB_GLOBAL_FLUSH:
 966                /* global flush doesn't need to set IVA_REG */
 967                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
 968                break;
 969        case DMA_TLB_DSI_FLUSH:
 970                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
 971                break;
 972        case DMA_TLB_PSI_FLUSH:
 973                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
 974                /* Note: always flush non-leaf currently */
 975                val_iva = size_order | addr;
 976                break;
 977        default:
 978                BUG();
 979        }
 980        /* Note: set drain read/write */
 981#if 0
 982        /*
 983         * This is probably just to be extra safe; it looks like we can
 984         * ignore it without any impact.
 985         */
 986        if (cap_read_drain(iommu->cap))
 987                val |= DMA_TLB_READ_DRAIN;
 988#endif
 989        if (cap_write_drain(iommu->cap))
 990                val |= DMA_TLB_WRITE_DRAIN;
 991
 992        spin_lock_irqsave(&iommu->register_lock, flag);
 993        /* Note: Only uses first TLB reg currently */
 994        if (val_iva)
 995                dmar_writeq(iommu->reg + tlb_offset, val_iva);
 996        dmar_writeq(iommu->reg + tlb_offset + 8, val);
 997
 998        /* Make sure hardware completes it */
 999        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1000                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1001
1002        spin_unlock_irqrestore(&iommu->register_lock, flag);
1003
1004        /* check IOTLB invalidation granularity */
1005        if (DMA_TLB_IAIG(val) == 0)
1006                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1007        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1008                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1009                        (unsigned long long)DMA_TLB_IIRG(type),
1010                        (unsigned long long)DMA_TLB_IAIG(val));
1011}
1012
1013static struct device_domain_info *iommu_support_dev_iotlb(
1014        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1015{
1016        int found = 0;
1017        unsigned long flags;
1018        struct device_domain_info *info;
1019        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1020
1021        if (!ecap_dev_iotlb_support(iommu->ecap))
1022                return NULL;
1023
1024        if (!iommu->qi)
1025                return NULL;
1026
1027        spin_lock_irqsave(&device_domain_lock, flags);
1028        list_for_each_entry(info, &domain->devices, link)
1029                if (info->bus == bus && info->devfn == devfn) {
1030                        found = 1;
1031                        break;
1032                }
1033        spin_unlock_irqrestore(&device_domain_lock, flags);
1034
1035        if (!found || !info->dev)
1036                return NULL;
1037
1038        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1039                return NULL;
1040
1041        if (!dmar_find_matched_atsr_unit(info->dev))
1042                return NULL;
1043
1044        info->iommu = iommu;
1045
1046        return info;
1047}
1048
1049static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1050{
1051        if (!info)
1052                return;
1053
1054        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1055}
1056
1057static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1058{
1059        if (!info->dev || !pci_ats_enabled(info->dev))
1060                return;
1061
1062        pci_disable_ats(info->dev);
1063}
1064
1065static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1066                                  u64 addr, unsigned mask)
1067{
1068        u16 sid, qdep;
1069        unsigned long flags;
1070        struct device_domain_info *info;
1071
1072        spin_lock_irqsave(&device_domain_lock, flags);
1073        list_for_each_entry(info, &domain->devices, link) {
1074                if (!info->dev || !pci_ats_enabled(info->dev))
1075                        continue;
1076
1077                sid = info->bus << 8 | info->devfn;
1078                qdep = pci_ats_queue_depth(info->dev);
1079                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1080        }
1081        spin_unlock_irqrestore(&device_domain_lock, flags);
1082}
1083
1084static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1085                                  unsigned long pfn, unsigned int pages)
1086{
1087        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1088        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1089
1090        BUG_ON(pages == 0);
1091
1092        /*
1093         * Fall back to domain selective flush if no PSI support or the size is
1094         * too big.
1095         * PSI requires page size to be 2 ^ x, and the base address is naturally
1096         * aligned to the size
1097         */
1098        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1099                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1100                                                DMA_TLB_DSI_FLUSH);
1101        else
1102                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1103                                                DMA_TLB_PSI_FLUSH);
1104
1105        /*
1106         * In caching mode, domain ID 0 is reserved for non-present to present
1107         * mapping flush. Device IOTLB doesn't need to be flushed in this case.
1108         */
1109        if (!cap_caching_mode(iommu->cap) || did)
1110                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1111}
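    /*
     * Example of the mask computation above: 'pages' is rounded up to a power
     * of two and converted to an order, so a request to invalidate 9 pages
     * yields mask == 4 and a PSI covering 2^4 == 16 VT-d pages.  If that mask
     * exceeds cap_max_amask_val(), the code falls back to a domain-selective
     * flush instead.
     */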
1112
1113static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1114{
1115        u32 pmen;
1116        unsigned long flags;
1117
1118        spin_lock_irqsave(&iommu->register_lock, flags);
1119        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1120        pmen &= ~DMA_PMEN_EPM;
1121        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1122
1123        /* wait for the protected region status bit to clear */
1124        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1125                readl, !(pmen & DMA_PMEN_PRS), pmen);
1126
1127        spin_unlock_irqrestore(&iommu->register_lock, flags);
1128}
1129
1130static int iommu_enable_translation(struct intel_iommu *iommu)
1131{
1132        u32 sts;
1133        unsigned long flags;
1134
1135        spin_lock_irqsave(&iommu->register_lock, flags);
1136        iommu->gcmd |= DMA_GCMD_TE;
1137        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1138
1139        /* Make sure hardware completes it */
1140        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1141                      readl, (sts & DMA_GSTS_TES), sts);
1142
1143        spin_unlock_irqrestore(&iommu->register_lock, flags);
1144        return 0;
1145}
1146
1147static int iommu_disable_translation(struct intel_iommu *iommu)
1148{
1149        u32 sts;
1150        unsigned long flag;
1151
1152        spin_lock_irqsave(&iommu->register_lock, flag);
1153        iommu->gcmd &= ~DMA_GCMD_TE;
1154        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1155
1156        /* Make sure hardware completes it */
1157        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1158                      readl, (!(sts & DMA_GSTS_TES)), sts);
1159
1160        spin_unlock_irqrestore(&iommu->register_lock, flag);
1161        return 0;
1162}
1163
1164
1165static int iommu_init_domains(struct intel_iommu *iommu)
1166{
1167        unsigned long ndomains;
1168        unsigned long nlongs;
1169
1170        ndomains = cap_ndoms(iommu->cap);
1171        pr_debug("Number of Domains supported <%ld>\n", ndomains);
1172        nlongs = BITS_TO_LONGS(ndomains);
1173
1174        spin_lock_init(&iommu->lock);
1175
1176        /* TBD: there might be 64K domains,
1177         * consider other allocation for future chip
1178         */
1179        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1180        if (!iommu->domain_ids) {
1181                printk(KERN_ERR "Allocating domain id array failed\n");
1182                return -ENOMEM;
1183        }
1184        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1185                        GFP_KERNEL);
1186        if (!iommu->domains) {
1187                printk(KERN_ERR "Allocating domain array failed\n");
1188                return -ENOMEM;
1189        }
1190
1191        /*
1192         * if Caching mode is set, then invalid translations are tagged
1193         * with domainid 0. Hence we need to pre-allocate it.
1194         */
1195        if (cap_caching_mode(iommu->cap))
1196                set_bit(0, iommu->domain_ids);
1197        return 0;
1198}
1199
1200
1201static void domain_exit(struct dmar_domain *domain);
1202static void vm_domain_exit(struct dmar_domain *domain);
1203
1204void free_dmar_iommu(struct intel_iommu *iommu)
1205{
1206        struct dmar_domain *domain;
1207        int i;
1208        unsigned long flags;
1209
1210        if ((iommu->domains) && (iommu->domain_ids)) {
1211                i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1212                for (; i < cap_ndoms(iommu->cap); ) {
1213                        domain = iommu->domains[i];
1214                        clear_bit(i, iommu->domain_ids);
1215
1216                        spin_lock_irqsave(&domain->iommu_lock, flags);
1217                        if (--domain->iommu_count == 0) {
1218                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1219                                        vm_domain_exit(domain);
1220                                else
1221                                        domain_exit(domain);
1222                        }
1223                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1224
1225                        i = find_next_bit(iommu->domain_ids,
1226                                cap_ndoms(iommu->cap), i+1);
1227                }
1228        }
1229
1230        if (iommu->gcmd & DMA_GCMD_TE)
1231                iommu_disable_translation(iommu);
1232
1233        if (iommu->irq) {
1234                set_irq_data(iommu->irq, NULL);
1235                /* This will mask the irq */
1236                free_irq(iommu->irq, iommu);
1237                destroy_irq(iommu->irq);
1238        }
1239
1240        kfree(iommu->domains);
1241        kfree(iommu->domain_ids);
1242
1243        g_iommus[iommu->seq_id] = NULL;
1244
1245        /* if all iommus are freed, free g_iommus */
1246        for (i = 0; i < g_num_of_iommus; i++) {
1247                if (g_iommus[i])
1248                        break;
1249        }
1250
1251        if (i == g_num_of_iommus)
1252                kfree(g_iommus);
1253
1254        /* free context mapping */
1255        free_context_table(iommu);
1256}
1257
1258static struct dmar_domain *alloc_domain(void)
1259{
1260        struct dmar_domain *domain;
1261
1262        domain = alloc_domain_mem();
1263        if (!domain)
1264                return NULL;
1265
1266        memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1267        domain->flags = 0;
1268
1269        return domain;
1270}
1271
1272static int iommu_attach_domain(struct dmar_domain *domain,
1273                               struct intel_iommu *iommu)
1274{
1275        int num;
1276        unsigned long ndomains;
1277        unsigned long flags;
1278
1279        ndomains = cap_ndoms(iommu->cap);
1280
1281        spin_lock_irqsave(&iommu->lock, flags);
1282
1283        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1284        if (num >= ndomains) {
1285                spin_unlock_irqrestore(&iommu->lock, flags);
1286                printk(KERN_ERR "IOMMU: no free domain ids\n");
1287                return -ENOMEM;
1288        }
1289
1290        domain->id = num;
1291        set_bit(num, iommu->domain_ids);
1292        set_bit(iommu->seq_id, &domain->iommu_bmp);
1293        iommu->domains[num] = domain;
1294        spin_unlock_irqrestore(&iommu->lock, flags);
1295
1296        return 0;
1297}
1298
1299static void iommu_detach_domain(struct dmar_domain *domain,
1300                                struct intel_iommu *iommu)
1301{
1302        unsigned long flags;
1303        int num, ndomains;
1304        int found = 0;
1305
1306        spin_lock_irqsave(&iommu->lock, flags);
1307        ndomains = cap_ndoms(iommu->cap);
1308        num = find_first_bit(iommu->domain_ids, ndomains);
1309        for (; num < ndomains; ) {
1310                if (iommu->domains[num] == domain) {
1311                        found = 1;
1312                        break;
1313                }
1314                num = find_next_bit(iommu->domain_ids,
1315                                    cap_ndoms(iommu->cap), num+1);
1316        }
1317
1318        if (found) {
1319                clear_bit(num, iommu->domain_ids);
1320                clear_bit(iommu->seq_id, &domain->iommu_bmp);
1321                iommu->domains[num] = NULL;
1322        }
1323        spin_unlock_irqrestore(&iommu->lock, flags);
1324}
1325
1326static struct iova_domain reserved_iova_list;
1327static struct lock_class_key reserved_rbtree_key;
1328
1329static void dmar_init_reserved_ranges(void)
1330{
1331        struct pci_dev *pdev = NULL;
1332        struct iova *iova;
1333        int i;
1334
1335        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1336
1337        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1338                &reserved_rbtree_key);
1339
1340        /* IOAPIC ranges shouldn't be accessed by DMA */
1341        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1342                IOVA_PFN(IOAPIC_RANGE_END));
1343        if (!iova)
1344                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1345
1346        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1347        for_each_pci_dev(pdev) {
1348                struct resource *r;
1349
1350                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1351                        r = &pdev->resource[i];
1352                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1353                                continue;
1354                        iova = reserve_iova(&reserved_iova_list,
1355                                            IOVA_PFN(r->start),
1356                                            IOVA_PFN(r->end));
1357                        if (!iova)
1358                                printk(KERN_ERR "Reserve iova failed\n");
1359                }
1360        }
1361
1362}
1363
1364static void domain_reserve_special_ranges(struct dmar_domain *domain)
1365{
1366        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1367}
1368
1369static inline int guestwidth_to_adjustwidth(int gaw)
1370{
1371        int agaw;
1372        int r = (gaw - 12) % 9;
1373
1374        if (r == 0)
1375                agaw = gaw;
1376        else
1377                agaw = gaw + 9 - r;
1378        if (agaw > 64)
1379                agaw = 64;
1380        return agaw;
1381}
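    /*
     * Worked examples for the rounding above: the guest width is rounded up
     * so that the bits above the 12-bit page offset split evenly into 9-bit
     * table indexes.  gaw 48 -> (48-12) % 9 == 0 -> agaw 48; gaw 40 ->
     * remainder 1 -> agaw 48; gaw 35 -> remainder 5 -> agaw 39.
     */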
1382
1383static int domain_init(struct dmar_domain *domain, int guest_width)
1384{
1385        struct intel_iommu *iommu;
1386        int adjust_width, agaw;
1387        unsigned long sagaw;
1388
1389        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1390        spin_lock_init(&domain->iommu_lock);
1391
1392        domain_reserve_special_ranges(domain);
1393
1394        /* calculate AGAW */
1395        iommu = domain_get_iommu(domain);
1396        if (guest_width > cap_mgaw(iommu->cap))
1397                guest_width = cap_mgaw(iommu->cap);
1398        domain->gaw = guest_width;
1399        adjust_width = guestwidth_to_adjustwidth(guest_width);
1400        agaw = width_to_agaw(adjust_width);
1401        sagaw = cap_sagaw(iommu->cap);
1402        if (!test_bit(agaw, &sagaw)) {
1403                /* hardware doesn't support it, choose a bigger one */
1404                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1405                agaw = find_next_bit(&sagaw, 5, agaw);
1406                if (agaw >= 5)
1407                        return -ENODEV;
1408        }
1409        domain->agaw = agaw;
1410        INIT_LIST_HEAD(&domain->devices);
1411
1412        if (ecap_coherent(iommu->ecap))
1413                domain->iommu_coherency = 1;
1414        else
1415                domain->iommu_coherency = 0;
1416
1417        if (ecap_sc_support(iommu->ecap))
1418                domain->iommu_snooping = 1;
1419        else
1420                domain->iommu_snooping = 0;
1421
1422        domain->iommu_count = 1;
1423
1424        /* always allocate the top pgd */
1425        domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1426        if (!domain->pgd)
1427                return -ENOMEM;
1428        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1429        return 0;
1430}
1431
1432static void domain_exit(struct dmar_domain *domain)
1433{
1434        struct dmar_drhd_unit *drhd;
1435        struct intel_iommu *iommu;
1436
1437        /* Domain 0 is reserved, so don't process it */
1438        if (!domain)
1439                return;
1440
1441        domain_remove_dev_info(domain);
1442        /* destroy iovas */
1443        put_iova_domain(&domain->iovad);
1444
1445        /* clear ptes */
1446        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1447
1448        /* free page tables */
1449        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1450
1451        for_each_active_iommu(iommu, drhd)
1452                if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1453                        iommu_detach_domain(domain, iommu);
1454
1455        free_domain_mem(domain);
1456}
1457
1458static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1459                                 u8 bus, u8 devfn, int translation)
1460{
1461        struct context_entry *context;
1462        unsigned long flags;
1463        struct intel_iommu *iommu;
1464        struct dma_pte *pgd;
1465        unsigned long num;
1466        unsigned long ndomains;
1467        int id;
1468        int agaw;
1469        struct device_domain_info *info = NULL;
1470
1471        pr_debug("Set context mapping for %02x:%02x.%d\n",
1472                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1473
1474        BUG_ON(!domain->pgd);
1475        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1476               translation != CONTEXT_TT_MULTI_LEVEL);
1477
1478        iommu = device_to_iommu(segment, bus, devfn);
1479        if (!iommu)
1480                return -ENODEV;
1481
1482        context = device_to_context_entry(iommu, bus, devfn);
1483        if (!context)
1484                return -ENOMEM;
1485        spin_lock_irqsave(&iommu->lock, flags);
1486        if (context_present(context)) {
1487                spin_unlock_irqrestore(&iommu->lock, flags);
1488                return 0;
1489        }
1490
1491        id = domain->id;
1492        pgd = domain->pgd;
1493
1494        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1495            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1496                int found = 0;
1497
1498                /* find an available domain id for this device in iommu */
1499                ndomains = cap_ndoms(iommu->cap);
1500                num = find_first_bit(iommu->domain_ids, ndomains);
1501                for (; num < ndomains; ) {
1502                        if (iommu->domains[num] == domain) {
1503                                id = num;
1504                                found = 1;
1505                                break;
1506                        }
1507                        num = find_next_bit(iommu->domain_ids,
1508                                            cap_ndoms(iommu->cap), num+1);
1509                }
1510
1511                if (found == 0) {
1512                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1513                        if (num >= ndomains) {
1514                                spin_unlock_irqrestore(&iommu->lock, flags);
1515                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1516                                return -EFAULT;
1517                        }
1518
1519                        set_bit(num, iommu->domain_ids);
1520                        iommu->domains[num] = domain;
1521                        id = num;
1522                }
1523
1524                /* Skip top levels of page tables for an
1525                 * iommu which has a smaller agaw than the default.
1526                 */
1527                for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1528                        pgd = phys_to_virt(dma_pte_addr(pgd));
1529                        if (!dma_pte_present(pgd)) {
1530                                spin_unlock_irqrestore(&iommu->lock, flags);
1531                                return -ENOMEM;
1532                        }
1533                }
1534        }
1535
1536        context_set_domain_id(context, id);
1537
1538        if (translation != CONTEXT_TT_PASS_THROUGH) {
1539                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1540                translation = info ? CONTEXT_TT_DEV_IOTLB :
1541                                     CONTEXT_TT_MULTI_LEVEL;
1542        }
1543        /*
1544         * In pass through mode, AW must be programmed to indicate the largest
1545         * AGAW value supported by hardware. And ASR is ignored by hardware.
1546         */
1547        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1548                context_set_address_width(context, iommu->msagaw);
1549        else {
1550                context_set_address_root(context, virt_to_phys(pgd));
1551                context_set_address_width(context, iommu->agaw);
1552        }
1553
1554        context_set_translation_type(context, translation);
1555        context_set_fault_enable(context);
1556        context_set_present(context);
1557        domain_flush_cache(domain, context, sizeof(*context));
1558
1559        /*
1560         * It's a non-present to present mapping. If hardware doesn't cache
1561         * non-present entries we only need to flush the write-buffer. If it
1562         * _does_ cache non-present entries, then it does so in the special
1563         * domain #0, which we have to flush:
1564         */
1565        if (cap_caching_mode(iommu->cap)) {
1566                iommu->flush.flush_context(iommu, 0,
1567                                           (((u16)bus) << 8) | devfn,
1568                                           DMA_CCMD_MASK_NOBIT,
1569                                           DMA_CCMD_DEVICE_INVL);
1570                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH);
1571        } else {
1572                iommu_flush_write_buffer(iommu);
1573        }
1574        iommu_enable_dev_iotlb(info);
1575        spin_unlock_irqrestore(&iommu->lock, flags);
1576
1577        spin_lock_irqsave(&domain->iommu_lock, flags);
1578        if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1579                domain->iommu_count++;
1580                domain_update_iommu_cap(domain);
1581        }
1582        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1583        return 0;
1584}
1585
1586static int
1587domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1588                        int translation)
1589{
1590        int ret;
1591        struct pci_dev *tmp, *parent;
1592
1593        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1594                                         pdev->bus->number, pdev->devfn,
1595                                         translation);
1596        if (ret)
1597                return ret;
1598
1599        /* dependent device mapping */
1600        tmp = pci_find_upstream_pcie_bridge(pdev);
1601        if (!tmp)
1602                return 0;
1603        /* Secondary interface's bus number and devfn 0 */
1604        parent = pdev->bus->self;
1605        while (parent != tmp) {
1606                ret = domain_context_mapping_one(domain,
1607                                                 pci_domain_nr(parent->bus),
1608                                                 parent->bus->number,
1609                                                 parent->devfn, translation);
1610                if (ret)
1611                        return ret;
1612                parent = parent->bus->self;
1613        }
1614        if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1615                return domain_context_mapping_one(domain,
1616                                        pci_domain_nr(tmp->subordinate),
1617                                        tmp->subordinate->number, 0,
1618                                        translation);
1619        else /* this is a legacy PCI bridge */
1620                return domain_context_mapping_one(domain,
1621                                                  pci_domain_nr(tmp->bus),
1622                                                  tmp->bus->number,
1623                                                  tmp->devfn,
1624                                                  translation);
1625}
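
/*
 * Note on the walk above: requests from a device behind a PCIe-to-PCI
 * bridge carry the bridge's source-id rather than the device's own, so the
 * whole chain of bridges up to the topmost PCIe-to-PCI bridge is given
 * context entries for the same domain as the device.  domain_context_mapped()
 * below performs the same walk when checking whether this has already been
 * done.
 */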
1626
1627static int domain_context_mapped(struct pci_dev *pdev)
1628{
1629        int ret;
1630        struct pci_dev *tmp, *parent;
1631        struct intel_iommu *iommu;
1632
1633        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1634                                pdev->devfn);
1635        if (!iommu)
1636                return -ENODEV;
1637
1638        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1639        if (!ret)
1640                return ret;
1641        /* dependent device mapping */
1642        tmp = pci_find_upstream_pcie_bridge(pdev);
1643        if (!tmp)
1644                return ret;
1645        /* Secondary interface's bus number and devfn 0 */
1646        parent = pdev->bus->self;
1647        while (parent != tmp) {
1648                ret = device_context_mapped(iommu, parent->bus->number,
1649                                            parent->devfn);
1650                if (!ret)
1651                        return ret;
1652                parent = parent->bus->self;
1653        }
1654        if (tmp->is_pcie)
1655                return device_context_mapped(iommu, tmp->subordinate->number,
1656                                             0);
1657        else
1658                return device_context_mapped(iommu, tmp->bus->number,
1659                                             tmp->devfn);
1660}
1661
1662/* Returns a number of VTD pages, but aligned to MM page size */
1663static inline unsigned long aligned_nrpages(unsigned long host_addr,
1664                                            size_t size)
1665{
1666        host_addr &= ~PAGE_MASK;
1667        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1668}
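
/*
 * Illustration of the helper above (assumed example values, with 4KiB pages
 * on both the MM and VT-d side so PAGE_SHIFT == VTD_PAGE_SHIFT): the partial
 * head and tail of the buffer are rounded out to whole pages, so
 *
 *	aligned_nrpages(0x100, 0x1f00) == 2
 *	aligned_nrpages(0x100, 0x1f01) == 3
 */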
1669
1670static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1671                            struct scatterlist *sg, unsigned long phys_pfn,
1672                            unsigned long nr_pages, int prot)
1673{
1674        struct dma_pte *first_pte = NULL, *pte = NULL;
1675        phys_addr_t uninitialized_var(pteval);
1676        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1677        unsigned long sg_res;
1678
1679        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1680
1681        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1682                return -EINVAL;
1683
1684        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1685
1686        if (sg)
1687                sg_res = 0;
1688        else {
1689                sg_res = nr_pages + 1;
1690                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1691        }
1692
1693        while (nr_pages--) {
1694                uint64_t tmp;
1695
1696                if (!sg_res) {
1697                        sg_res = aligned_nrpages(sg->offset, sg->length);
1698                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1699                        sg->dma_length = sg->length;
1700                        pteval = page_to_phys(sg_page(sg)) | prot;
1701                }
1702                if (!pte) {
1703                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1704                        if (!pte)
1705                                return -ENOMEM;
1706                }
1707                /* We don't need a lock here; nobody else
1708                 * touches the iova range
1709                 */
1710                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1711                if (tmp) {
1712                        static int dumps = 5;
1713                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1714                               iov_pfn, tmp, (unsigned long long)pteval);
1715                        if (dumps) {
1716                                dumps--;
1717                                debug_dma_dump_mappings(NULL);
1718                        }
1719                        WARN_ON(1);
1720                }
1721                pte++;
1722                if (!nr_pages || first_pte_in_page(pte)) {
1723                        domain_flush_cache(domain, first_pte,
1724                                           (void *)pte - (void *)first_pte);
1725                        pte = NULL;
1726                }
1727                iov_pfn++;
1728                pteval += VTD_PAGE_SIZE;
1729                sg_res--;
1730                if (!sg_res)
1731                        sg = sg_next(sg);
1732        }
1733        return 0;
1734}
1735
1736static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1737                                    struct scatterlist *sg, unsigned long nr_pages,
1738                                    int prot)
1739{
1740        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1741}
1742
1743static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1744                                     unsigned long phys_pfn, unsigned long nr_pages,
1745                                     int prot)
1746{
1747        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1748}
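
/*
 * Both wrappers above funnel into __domain_mapping(): scatterlist callers
 * pass 'sg' and let it walk the list, while physically contiguous callers
 * pass a starting pfn.  A sketch of the contiguous case (variable names are
 * assumed, for illustration only):
 *
 *	err = domain_pfn_mapping(domain, iov_pfn, phys_pfn, nr_pages,
 *				 DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * which installs nr_pages PTEs starting at iov_pfn, all pointing at the
 * physical range beginning at phys_pfn.
 */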
1749
1750static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1751{
1752        if (!iommu)
1753                return;
1754
1755        clear_context_table(iommu, bus, devfn);
1756        iommu->flush.flush_context(iommu, 0, 0, 0,
1757                                           DMA_CCMD_GLOBAL_INVL);
1758        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1759}
1760
1761static void domain_remove_dev_info(struct dmar_domain *domain)
1762{
1763        struct device_domain_info *info;
1764        unsigned long flags;
1765        struct intel_iommu *iommu;
1766
1767        spin_lock_irqsave(&device_domain_lock, flags);
1768        while (!list_empty(&domain->devices)) {
1769                info = list_entry(domain->devices.next,
1770                        struct device_domain_info, link);
1771                list_del(&info->link);
1772                list_del(&info->global);
1773                if (info->dev)
1774                        info->dev->dev.archdata.iommu = NULL;
1775                spin_unlock_irqrestore(&device_domain_lock, flags);
1776
1777                iommu_disable_dev_iotlb(info);
1778                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1779                iommu_detach_dev(iommu, info->bus, info->devfn);
1780                free_devinfo_mem(info);
1781
1782                spin_lock_irqsave(&device_domain_lock, flags);
1783        }
1784        spin_unlock_irqrestore(&device_domain_lock, flags);
1785}
1786
1787/*
1788 * find_domain
1789 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1790 */
1791static struct dmar_domain *
1792find_domain(struct pci_dev *pdev)
1793{
1794        struct device_domain_info *info;
1795
1796        /* No lock here, assumes no domain exit in normal case */
1797        info = pdev->dev.archdata.iommu;
1798        if (info)
1799                return info->domain;
1800        return NULL;
1801}
1802
1803/* domain is initialized */
1804static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1805{
1806        struct dmar_domain *domain, *found = NULL;
1807        struct intel_iommu *iommu;
1808        struct dmar_drhd_unit *drhd;
1809        struct device_domain_info *info, *tmp;
1810        struct pci_dev *dev_tmp;
1811        unsigned long flags;
1812        int bus = 0, devfn = 0;
1813        int segment;
1814        int ret;
1815
1816        domain = find_domain(pdev);
1817        if (domain)
1818                return domain;
1819
1820        segment = pci_domain_nr(pdev->bus);
1821
1822        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1823        if (dev_tmp) {
1824                if (dev_tmp->is_pcie) {
1825                        bus = dev_tmp->subordinate->number;
1826                        devfn = 0;
1827                } else {
1828                        bus = dev_tmp->bus->number;
1829                        devfn = dev_tmp->devfn;
1830                }
1831                spin_lock_irqsave(&device_domain_lock, flags);
1832                list_for_each_entry(info, &device_domain_list, global) {
1833                        if (info->segment == segment &&
1834                            info->bus == bus && info->devfn == devfn) {
1835                                found = info->domain;
1836                                break;
1837                        }
1838                }
1839                spin_unlock_irqrestore(&device_domain_lock, flags);
1840                /* pcie-pci bridge already has a domain, use it */
1841                if (found) {
1842                        domain = found;
1843                        goto found_domain;
1844                }
1845        }
1846
1847        domain = alloc_domain();
1848        if (!domain)
1849                goto error;
1850
1851        /* Allocate new domain for the device */
1852        drhd = dmar_find_matched_drhd_unit(pdev);
1853        if (!drhd) {
1854                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1855                        pci_name(pdev));
1856                return NULL;
1857        }
1858        iommu = drhd->iommu;
1859
1860        ret = iommu_attach_domain(domain, iommu);
1861        if (ret) {
1862                domain_exit(domain);
1863                goto error;
1864        }
1865
1866        if (domain_init(domain, gaw)) {
1867                domain_exit(domain);
1868                goto error;
1869        }
1870
1871        /* register pcie-to-pci device */
1872        if (dev_tmp) {
1873                info = alloc_devinfo_mem();
1874                if (!info) {
1875                        domain_exit(domain);
1876                        goto error;
1877                }
1878                info->segment = segment;
1879                info->bus = bus;
1880                info->devfn = devfn;
1881                info->dev = NULL;
1882                info->domain = domain;
1883                /* This domain is shared by devices under p2p bridge */
1884                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1885
1886                /* pcie-to-pci bridge already has a domain, use it */
1887                found = NULL;
1888                spin_lock_irqsave(&device_domain_lock, flags);
1889                list_for_each_entry(tmp, &device_domain_list, global) {
1890                        if (tmp->segment == segment &&
1891                            tmp->bus == bus && tmp->devfn == devfn) {
1892                                found = tmp->domain;
1893                                break;
1894                        }
1895                }
1896                if (found) {
1897                        free_devinfo_mem(info);
1898                        domain_exit(domain);
1899                        domain = found;
1900                } else {
1901                        list_add(&info->link, &domain->devices);
1902                        list_add(&info->global, &device_domain_list);
1903                }
1904                spin_unlock_irqrestore(&device_domain_lock, flags);
1905        }
1906
1907found_domain:
1908        info = alloc_devinfo_mem();
1909        if (!info)
1910                goto error;
1911        info->segment = segment;
1912        info->bus = pdev->bus->number;
1913        info->devfn = pdev->devfn;
1914        info->dev = pdev;
1915        info->domain = domain;
1916        spin_lock_irqsave(&device_domain_lock, flags);
1917        /* somebody else was faster and already set it up */
1918        found = find_domain(pdev);
1919        if (found != NULL) {
1920                spin_unlock_irqrestore(&device_domain_lock, flags);
1921                if (found != domain) {
1922                        domain_exit(domain);
1923                        domain = found;
1924                }
1925                free_devinfo_mem(info);
1926                return domain;
1927        }
1928        list_add(&info->link, &domain->devices);
1929        list_add(&info->global, &device_domain_list);
1930        pdev->dev.archdata.iommu = info;
1931        spin_unlock_irqrestore(&device_domain_lock, flags);
1932        return domain;
1933error:
1934        /* recheck it here, maybe somebody else already set it */
1935        return find_domain(pdev);
1936}
1937
1938static int iommu_identity_mapping;
1939#define IDENTMAP_ALL            1
1940#define IDENTMAP_GFX            2
1941#define IDENTMAP_AZALIA         4
1942
1943static int iommu_domain_identity_map(struct dmar_domain *domain,
1944                                     unsigned long long start,
1945                                     unsigned long long end)
1946{
1947        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1948        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1949
1950        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1951                          dma_to_mm_pfn(last_vpfn))) {
1952                printk(KERN_ERR "IOMMU: reserve iova failed\n");
1953                return -ENOMEM;
1954        }
1955
1956        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1957                 start, end, domain->id);
1958        /*
1959         * RMRR range might have overlap with physical memory range,
1960         * clear it first
1961         */
1962        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1963
1964        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1965                                  last_vpfn - first_vpfn + 1,
1966                                  DMA_PTE_READ|DMA_PTE_WRITE);
1967}
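
/*
 * Identity mapping simply means iova == physical address.  For an assumed
 * example range start = 0xd0000000, end = 0xd0003fff, the function above
 * reserves the corresponding iova range (vPFNs 0xd0000-0xd0003) and then
 * maps those four VT-d pages 1:1 with read/write permission.
 */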
1968
1969static int iommu_prepare_identity_map(struct pci_dev *pdev,
1970                                      unsigned long long start,
1971                                      unsigned long long end)
1972{
1973        struct dmar_domain *domain;
1974        int ret;
1975
1976        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1977        if (!domain)
1978                return -ENOMEM;
1979
1980        /* For _hardware_ passthrough, don't bother. But for software
1981           passthrough, we do it anyway -- it may indicate a memory
1982           range which is reserved in E820, and so didn't get set
1983           up to start with in si_domain */
1984        if (domain == si_domain && hw_pass_through) {
1985                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1986                       pci_name(pdev), start, end);
1987                return 0;
1988        }
1989
1990        printk(KERN_INFO
1991               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1992               pci_name(pdev), start, end);
1993        
1994        if (end >> agaw_to_width(domain->agaw)) {
1995                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1996                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1997                     agaw_to_width(domain->agaw),
1998                     dmi_get_system_info(DMI_BIOS_VENDOR),
1999                     dmi_get_system_info(DMI_BIOS_VERSION),
2000                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2001                ret = -EIO;
2002                goto error;
2003        }
2004
2005        ret = iommu_domain_identity_map(domain, start, end);
2006        if (ret)
2007                goto error;
2008
2009        /* context entry init */
2010        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2011        if (ret)
2012                goto error;
2013
2014        return 0;
2015
2016 error:
2017        domain_exit(domain);
2018        return ret;
2019}
2020
2021static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2022        struct pci_dev *pdev)
2023{
2024        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2025                return 0;
2026        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2027                rmrr->end_address + 1);
2028}
2029
2030#ifdef CONFIG_DMAR_FLOPPY_WA
2031static inline void iommu_prepare_isa(void)
2032{
2033        struct pci_dev *pdev;
2034        int ret;
2035
2036        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2037        if (!pdev)
2038                return;
2039
2040        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2041        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2042
2043        if (ret)
2044                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2045                       "floppy might not work\n");
2046
2047}
2048#else
2049static inline void iommu_prepare_isa(void)
2050{
2051        return;
2052}
2053#endif /* !CONFIG_DMAR_FLOPPY_WA */
2054
2055static int md_domain_init(struct dmar_domain *domain, int guest_width);
2056
2057static int __init si_domain_work_fn(unsigned long start_pfn,
2058                                    unsigned long end_pfn, void *datax)
2059{
2060        int *ret = datax;
2061
2062        *ret = iommu_domain_identity_map(si_domain,
2063                                         (uint64_t)start_pfn << PAGE_SHIFT,
2064                                         (uint64_t)end_pfn << PAGE_SHIFT);
2065        return *ret;
2066
2067}
2068
2069static int __init si_domain_init(int hw)
2070{
2071        struct dmar_drhd_unit *drhd;
2072        struct intel_iommu *iommu;
2073        int nid, ret = 0;
2074
2075        si_domain = alloc_domain();
2076        if (!si_domain)
2077                return -EFAULT;
2078
2079        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2080
2081        for_each_active_iommu(iommu, drhd) {
2082                ret = iommu_attach_domain(si_domain, iommu);
2083                if (ret) {
2084                        domain_exit(si_domain);
2085                        return -EFAULT;
2086                }
2087        }
2088
2089        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2090                domain_exit(si_domain);
2091                return -EFAULT;
2092        }
2093
2094        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2095
2096        if (hw)
2097                return 0;
2098
2099        for_each_online_node(nid) {
2100                work_with_active_regions(nid, si_domain_work_fn, &ret);
2101                if (ret)
2102                        return ret;
2103        }
2104
2105        return 0;
2106}
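
/*
 * Summary of si_domain_init(): the static identity (si) domain is attached
 * to every active IOMMU, initialized with the default address width and,
 * unless hardware pass-through is in use, populated with 1:1 mappings of
 * every online node's active memory regions via si_domain_work_fn().
 */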
2107
2108static void domain_remove_one_dev_info(struct dmar_domain *domain,
2109                                          struct pci_dev *pdev);
2110static int identity_mapping(struct pci_dev *pdev)
2111{
2112        struct device_domain_info *info;
2113
2114        if (likely(!iommu_identity_mapping))
2115                return 0;
2116
2117
2118        list_for_each_entry(info, &si_domain->devices, link)
2119                if (info->dev == pdev)
2120                        return 1;
2121        return 0;
2122}
2123
2124static int domain_add_dev_info(struct dmar_domain *domain,
2125                               struct pci_dev *pdev,
2126                               int translation)
2127{
2128        struct device_domain_info *info;
2129        unsigned long flags;
2130        int ret;
2131
2132        info = alloc_devinfo_mem();
2133        if (!info)
2134                return -ENOMEM;
2135
2136        ret = domain_context_mapping(domain, pdev, translation);
2137        if (ret) {
2138                free_devinfo_mem(info);
2139                return ret;
2140        }
2141
2142        info->segment = pci_domain_nr(pdev->bus);
2143        info->bus = pdev->bus->number;
2144        info->devfn = pdev->devfn;
2145        info->dev = pdev;
2146        info->domain = domain;
2147
2148        spin_lock_irqsave(&device_domain_lock, flags);
2149        list_add(&info->link, &domain->devices);
2150        list_add(&info->global, &device_domain_list);
2151        pdev->dev.archdata.iommu = info;
2152        spin_unlock_irqrestore(&device_domain_lock, flags);
2153
2154        return 0;
2155}
2156
2157static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2158{
2159        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2160                return 1;
2161
2162        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2163                return 1;
2164
2165        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2166                return 0;
2167
2168        /*
2169         * We want to start off with all devices in the 1:1 domain, and
2170         * take them out later if we find they can't access all of memory.
2171         *
2172         * However, we can't do this for PCI devices behind bridges,
2173         * because all PCI devices behind the same bridge will end up
2174         * with the same source-id on their transactions.
2175         *
2176         * Practically speaking, we can't change things around for these
2177         * devices at run-time, because we can't be sure there'll be no
2178         * DMA transactions in flight for any of their siblings.
2179         * 
2180         * So PCI devices (unless they're on the root bus) as well as
2181         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2182         * the 1:1 domain, just in _case_ one of their siblings turns out
2183         * not to be able to map all of memory.
2184         */
2185        if (!pdev->is_pcie) {
2186                if (!pci_is_root_bus(pdev->bus))
2187                        return 0;
2188                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2189                        return 0;
2190        } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2191                return 0;
2192
2193        /* 
2194         * At boot time, we don't yet know if devices will be 64-bit capable.
2195         * Assume that they will -- if they turn out not to be, then we can 
2196         * take them out of the 1:1 domain later.
2197         */
2198        if (!startup)
2199                return pdev->dma_mask > DMA_BIT_MASK(32);
2200
2201        return 1;
2202}
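
/*
 * Illustration of the policy above (assumed examples, with IDENTMAP_ALL
 * set): a PCIe endpoint on the root bus is identity mapped at startup; a
 * conventional PCI device behind a bridge is not, since it shares a
 * source-id with its siblings; and at run time (startup == 0) a device
 * whose dma_mask is only 32 bit is no longer eligible for the 1:1 domain.
 */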
2203
2204static int __init iommu_prepare_static_identity_mapping(int hw)
2205{
2206        struct pci_dev *pdev = NULL;
2207        int ret;
2208
2209        ret = si_domain_init(hw);
2210        if (ret)
2211                return -EFAULT;
2212
2213        for_each_pci_dev(pdev) {
2214                if (iommu_should_identity_map(pdev, 1)) {
2215                        printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2216                               hw ? "hardware" : "software", pci_name(pdev));
2217
2218                        ret = domain_add_dev_info(si_domain, pdev,
2219                                                     hw ? CONTEXT_TT_PASS_THROUGH :
2220                                                     CONTEXT_TT_MULTI_LEVEL);
2221                        if (ret)
2222                                return ret;
2223                }
2224        }
2225
2226        return 0;
2227}
2228
2229int __init init_dmars(void)
2230{
2231        struct dmar_drhd_unit *drhd;
2232        struct dmar_rmrr_unit *rmrr;
2233        struct pci_dev *pdev;
2234        struct intel_iommu *iommu;
2235        int i, ret;
2236
2237        /*
2238         * for each drhd
2239         *    allocate root
2240         *    initialize and program root entry to not present
2241         * endfor
2242         */
2243        for_each_drhd_unit(drhd) {
2244                g_num_of_iommus++;
2245                /*
2246                 * lock not needed as this is only incremented in the single-
2247                 * threaded kernel __init code path; all other accesses are
2248                 * read only
2249                 */
2250        }
2251
2252        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2253                        GFP_KERNEL);
2254        if (!g_iommus) {
2255                printk(KERN_ERR "Allocating global iommu array failed\n");
2256                ret = -ENOMEM;
2257                goto error;
2258        }
2259
2260        deferred_flush = kzalloc(g_num_of_iommus *
2261                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2262        if (!deferred_flush) {
2263                ret = -ENOMEM;
2264                goto error;
2265        }
2266
2267        for_each_drhd_unit(drhd) {
2268                if (drhd->ignored)
2269                        continue;
2270
2271                iommu = drhd->iommu;
2272                g_iommus[iommu->seq_id] = iommu;
2273
2274                ret = iommu_init_domains(iommu);
2275                if (ret)
2276                        goto error;
2277
2278                /*
2279                 * TBD:
2280                 * we could share the same root & context tables
2281                 * among all IOMMUs. Need to split it later.
2282                 */
2283                ret = iommu_alloc_root_entry(iommu);
2284                if (ret) {
2285                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2286                        goto error;
2287                }
2288                if (!ecap_pass_through(iommu->ecap))
2289                        hw_pass_through = 0;
2290        }
2291
2292        /*
2293         * Start from a sane iommu hardware state.
2294         */
2295        for_each_drhd_unit(drhd) {
2296                if (drhd->ignored)
2297                        continue;
2298
2299                iommu = drhd->iommu;
2300
2301                /*
2302                 * If the queued invalidation is already initialized by us
2303                 * (for example, while enabling interrupt-remapping) then
2304         * we already have things rolling from a sane state.
2305                 */
2306                if (iommu->qi)
2307                        continue;
2308
2309                /*
2310                 * Clear any previous faults.
2311                 */
2312                dmar_fault(-1, iommu);
2313                /*
2314                 * Disable queued invalidation if supported and already enabled
2315                 * before OS handover.
2316                 */
2317                dmar_disable_qi(iommu);
2318        }
2319
2320        for_each_drhd_unit(drhd) {
2321                if (drhd->ignored)
2322                        continue;
2323
2324                iommu = drhd->iommu;
2325
2326                if (dmar_enable_qi(iommu)) {
2327                        /*
2328                         * Queued Invalidate not enabled, use Register Based
2329                         * Invalidate
2330                         */
2331                        iommu->flush.flush_context = __iommu_flush_context;
2332                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2333                        printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2334                               "invalidation\n",
2335                               (unsigned long long)drhd->reg_base_addr);
2336                } else {
2337                        iommu->flush.flush_context = qi_flush_context;
2338                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2339                        printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2340                               "invalidation\n",
2341                               (unsigned long long)drhd->reg_base_addr);
2342                }
2343        }
2344
2345        if (iommu_pass_through)
2346                iommu_identity_mapping |= IDENTMAP_ALL;
2347
2348#ifdef CONFIG_DMAR_BROKEN_GFX_WA
2349        iommu_identity_mapping |= IDENTMAP_GFX;
2350#endif
2351
2352        check_tylersburg_isoch();
2353
2354        /*
2355         * If any form of identity mapping was requested (pass-through or one
2356         * of the work-arounds above), build the static identity (si) domain
2357         * now and pre-map the qualifying devices into it.
2358         */
2359        if (iommu_identity_mapping) {
2360                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2361                if (ret) {
2362                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2363                        goto error;
2364                }
2365        }
2366        /*
2367         * For each rmrr
2368         *   for each dev attached to rmrr
2369         *   do
2370         *     locate drhd for dev, alloc domain for dev
2371         *     allocate free domain
2372         *     allocate page table entries for rmrr
2373         *     if context not allocated for bus
2374         *           allocate and init context
2375         *           set present in root table for this bus
2376         *     init context with domain, translation etc
2377         *    endfor
2378         * endfor
2379         */
2380        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2381        for_each_rmrr_units(rmrr) {
2382                for (i = 0; i < rmrr->devices_cnt; i++) {
2383                        pdev = rmrr->devices[i];
2384                        /*
2385                         * some BIOSes list non-existent devices in the
2386                         * DMAR table.
2387                         */
2388                        if (!pdev)
2389                                continue;
2390                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2391                        if (ret)
2392                                printk(KERN_ERR
2393                                       "IOMMU: mapping reserved region failed\n");
2394                }
2395        }
2396
2397        iommu_prepare_isa();
2398
2399        /*
2400         * for each drhd
2401         *   enable fault log
2402         *   global invalidate context cache
2403         *   global invalidate iotlb
2404         *   enable translation
2405         */
2406        for_each_drhd_unit(drhd) {
2407                if (drhd->ignored)
2408                        continue;
2409                iommu = drhd->iommu;
2410
2411                iommu_flush_write_buffer(iommu);
2412
2413                ret = dmar_set_interrupt(iommu);
2414                if (ret)
2415                        goto error;
2416
2417                iommu_set_root_entry(iommu);
2418
2419                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2420                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2421
2422                ret = iommu_enable_translation(iommu);
2423                if (ret)
2424                        goto error;
2425
2426                iommu_disable_protect_mem_regions(iommu);
2427        }
2428
2429        return 0;
2430error:
2431        for_each_drhd_unit(drhd) {
2432                if (drhd->ignored)
2433                        continue;
2434                iommu = drhd->iommu;
2435                free_iommu(iommu);
2436        }
2437        kfree(g_iommus);
2438        return ret;
2439}
2440
2441/* This takes a number of _MM_ pages, not VTD pages */
2442static struct iova *intel_alloc_iova(struct device *dev,
2443                                     struct dmar_domain *domain,
2444                                     unsigned long nrpages, uint64_t dma_mask)
2445{
2446        struct pci_dev *pdev = to_pci_dev(dev);
2447        struct iova *iova = NULL;
2448
2449        /* Restrict dma_mask to the width that the iommu can handle */
2450        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2451
2452        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2453                /*
2454                 * First try to allocate an io virtual address in
2455                 * DMA_BIT_MASK(32) and if that fails then try allocating
2456                 * from higher range
2457                 */
2458                iova = alloc_iova(&domain->iovad, nrpages,
2459                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2460                if (iova)
2461                        return iova;
2462        }
2463        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2464        if (unlikely(!iova)) {
2465                printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2466                       nrpages, pci_name(pdev));
2467                return NULL;
2468        }
2469
2470        return iova;
2471}
2472
2473static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2474{
2475        struct dmar_domain *domain;
2476        int ret;
2477
2478        domain = get_domain_for_dev(pdev,
2479                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2480        if (!domain) {
2481                printk(KERN_ERR
2482                        "Allocating domain for %s failed\n", pci_name(pdev));
2483                return NULL;
2484        }
2485
2486        /* make sure context mapping is ok */
2487        if (unlikely(!domain_context_mapped(pdev))) {
2488                ret = domain_context_mapping(domain, pdev,
2489                                             CONTEXT_TT_MULTI_LEVEL);
2490                if (ret) {
2491                        printk(KERN_ERR
2492                                "Domain context map for %s failed\n",
2493                                pci_name(pdev));
2494                        return NULL;
2495                }
2496        }
2497
2498        return domain;
2499}
2500
2501static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2502{
2503        struct device_domain_info *info;
2504
2505        /* No lock here, assumes no domain exit in normal case */
2506        info = dev->dev.archdata.iommu;
2507        if (likely(info))
2508                return info->domain;
2509
2510        return __get_valid_domain_for_dev(dev);
2511}
2512
2513static int iommu_dummy(struct pci_dev *pdev)
2514{
2515        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2516}
2517
2518/* Check if the pdev needs to go through the non-identity map and unmap process. */
2519static int iommu_no_mapping(struct device *dev)
2520{
2521        struct pci_dev *pdev;
2522        int found;
2523
2524        if (unlikely(dev->bus != &pci_bus_type))
2525                return 1;
2526
2527        pdev = to_pci_dev(dev);
2528        if (iommu_dummy(pdev))
2529                return 1;
2530
2531        if (!iommu_identity_mapping)
2532                return 0;
2533
2534        found = identity_mapping(pdev);
2535        if (found) {
2536                if (iommu_should_identity_map(pdev, 0))
2537                        return 1;
2538                else {
2539                        /*
2540                         * A 32 bit DMA device is removed from si_domain and
2541                         * falls back to non-identity mapping.
2542                         */
2543                        domain_remove_one_dev_info(si_domain, pdev);
2544                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2545                               pci_name(pdev));
2546                        return 0;
2547                }
2548        } else {
2549                /*
2550                 * If a 64 bit DMA device is detached from a VM, the device
2551                 * is put into si_domain for identity mapping.
2552                 */
2553                if (iommu_should_identity_map(pdev, 0)) {
2554                        int ret;
2555                        ret = domain_add_dev_info(si_domain, pdev,
2556                                                  hw_pass_through ?
2557                                                  CONTEXT_TT_PASS_THROUGH :
2558                                                  CONTEXT_TT_MULTI_LEVEL);
2559                        if (!ret) {
2560                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2561                                       pci_name(pdev));
2562                                return 1;
2563                        }
2564                }
2565        }
2566
2567        return 0;
2568}
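
/*
 * iommu_no_mapping() returns 1 when DMA from this device should bypass
 * translation entirely and 0 when it must go through the remapping path.
 * As a side effect it migrates devices into or out of si_domain when their
 * dma_mask no longer matches the identity mapping policy, as seen in the
 * two branches above.
 */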
2569
2570static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2571                                     size_t size, int dir, u64 dma_mask)
2572{
2573        struct pci_dev *pdev = to_pci_dev(hwdev);
2574        struct dmar_domain *domain;
2575        phys_addr_t start_paddr;
2576        struct iova *iova;
2577        int prot = 0;
2578        int ret;
2579        struct intel_iommu *iommu;
2580        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2581
2582        BUG_ON(dir == DMA_NONE);
2583
2584        if (iommu_no_mapping(hwdev))
2585                return paddr;
2586
2587        domain = get_valid_domain_for_dev(pdev);
2588        if (!domain)
2589                return 0;
2590
2591        iommu = domain_get_iommu(domain);
2592        size = aligned_nrpages(paddr, size);
2593
2594        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2595                                pdev->dma_mask);
2596        if (!iova)
2597                goto error;
2598
2599        /*
2600         * Check if DMAR supports zero-length reads on write-only
2601         * mappings.
2602         */
2603        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2604                        !cap_zlr(iommu->cap))
2605                prot |= DMA_PTE_READ;
2606        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2607                prot |= DMA_PTE_WRITE;
2608        /*
2609         * The range paddr .. paddr + size might include partial pages, so we
2610         * should map whole pages.  Note: if two parts of one page are mapped
2611         * separately, we might have two guest_addr mappings to the same host
2612         * paddr, but this is not a big problem
2613         */
2614        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2615                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2616        if (ret)
2617                goto error;
2618
2619        /* it's a non-present to present mapping. Only flush if caching mode */
2620        if (cap_caching_mode(iommu->cap))
2621                iommu_flush_iotlb_psi(iommu, 0, mm_to_dma_pfn(iova->pfn_lo), size);
2622        else
2623                iommu_flush_write_buffer(iommu);
2624
2625        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2626        start_paddr += paddr & ~PAGE_MASK;
2627        return start_paddr;
2628
2629error:
2630        if (iova)
2631                __free_iova(&domain->iovad, iova);
2632        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2633                pci_name(pdev), size, (unsigned long long)paddr, dir);
2634        return 0;
2635}
2636
2637static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2638                                 unsigned long offset, size_t size,
2639                                 enum dma_data_direction dir,
2640                                 struct dma_attrs *attrs)
2641{
2642        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2643                                  dir, to_pci_dev(dev)->dma_mask);
2644}
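
/*
 * How a driver reaches this code (illustrative sketch using the generic DMA
 * API rather than anything specific to this file):
 *
 *	dma_addr_t handle = dma_map_page(&pdev->dev, page, 0, len,
 *					 DMA_TO_DEVICE);
 *
 * is dispatched through intel_dma_ops.map_page (see the struct dma_map_ops
 * further down) to intel_map_page() and __intel_map_single(), which allocate
 * an iova, install the PTEs and return the bus address the device should use.
 */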
2645
2646static void flush_unmaps(void)
2647{
2648        int i, j;
2649
2650        timer_on = 0;
2651
2652        /* just flush them all */
2653        for (i = 0; i < g_num_of_iommus; i++) {
2654                struct intel_iommu *iommu = g_iommus[i];
2655                if (!iommu)
2656                        continue;
2657
2658                if (!deferred_flush[i].next)
2659                        continue;
2660
2661                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2662                                         DMA_TLB_GLOBAL_FLUSH);
2663                for (j = 0; j < deferred_flush[i].next; j++) {
2664                        unsigned long mask;
2665                        struct iova *iova = deferred_flush[i].iova[j];
2666
2667                        mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2668                        iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2669                                        (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2670                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2671                }
2672                deferred_flush[i].next = 0;
2673        }
2674
2675        list_size = 0;
2676}
2677
2678static void flush_unmaps_timeout(unsigned long data)
2679{
2680        unsigned long flags;
2681
2682        spin_lock_irqsave(&async_umap_flush_lock, flags);
2683        flush_unmaps();
2684        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2685}
2686
2687static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2688{
2689        unsigned long flags;
2690        int next, iommu_id;
2691        struct intel_iommu *iommu;
2692
2693        spin_lock_irqsave(&async_umap_flush_lock, flags);
2694        if (list_size == HIGH_WATER_MARK)
2695                flush_unmaps();
2696
2697        iommu = domain_get_iommu(dom);
2698        iommu_id = iommu->seq_id;
2699
2700        next = deferred_flush[iommu_id].next;
2701        deferred_flush[iommu_id].domain[next] = dom;
2702        deferred_flush[iommu_id].iova[next] = iova;
2703        deferred_flush[iommu_id].next++;
2704
2705        if (!timer_on) {
2706                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2707                timer_on = 1;
2708        }
2709        list_size++;
2710        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2711}
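
/*
 * The deferred-unmap machinery above batches IOTLB invalidations: add_unmap()
 * queues the iova per IOMMU, and either the 10ms unmap_timer or hitting
 * HIGH_WATER_MARK triggers flush_unmaps(), which issues one global IOTLB
 * flush per IOMMU (plus per-entry device-IOTLB flushes) and only then frees
 * the queued iovas for reuse.
 */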
2712
2713static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2714                             size_t size, enum dma_data_direction dir,
2715                             struct dma_attrs *attrs)
2716{
2717        struct pci_dev *pdev = to_pci_dev(dev);
2718        struct dmar_domain *domain;
2719        unsigned long start_pfn, last_pfn;
2720        struct iova *iova;
2721        struct intel_iommu *iommu;
2722
2723        if (iommu_no_mapping(dev))
2724                return;
2725
2726        domain = find_domain(pdev);
2727        BUG_ON(!domain);
2728
2729        iommu = domain_get_iommu(domain);
2730
2731        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2732        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2733                      (unsigned long long)dev_addr))
2734                return;
2735
2736        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2737        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2738
2739        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2740                 pci_name(pdev), start_pfn, last_pfn);
2741
2742        /*  clear the whole page */
2743        dma_pte_clear_range(domain, start_pfn, last_pfn);
2744
2745        /* free page tables */
2746        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2747
2748        if (intel_iommu_strict) {
2749                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2750                                      last_pfn - start_pfn + 1);
2751                /* free iova */
2752                __free_iova(&domain->iovad, iova);
2753        } else {
2754                add_unmap(domain, iova);
2755                /*
2756                 * queue up the release of the unmap to save the 1/6th of the
2757                 * cpu time used up by the iotlb flush operation...
2758                 */
2759        }
2760}
2761
2762static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2763                                  dma_addr_t *dma_handle, gfp_t flags)
2764{
2765        void *vaddr;
2766        int order;
2767
2768        size = PAGE_ALIGN(size);
2769        order = get_order(size);
2770
2771        if (!iommu_no_mapping(hwdev))
2772                flags &= ~(GFP_DMA | GFP_DMA32);
2773        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2774                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2775                        flags |= GFP_DMA;
2776                else
2777                        flags |= GFP_DMA32;
2778        }
2779
2780        vaddr = (void *)__get_free_pages(flags, order);
2781        if (!vaddr)
2782                return NULL;
2783        memset(vaddr, 0, size);
2784
2785        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2786                                         DMA_BIDIRECTIONAL,
2787                                         hwdev->coherent_dma_mask);
2788        if (*dma_handle)
2789                return vaddr;
2790        free_pages((unsigned long)vaddr, order);
2791        return NULL;
2792}
2793
2794static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2795                                dma_addr_t dma_handle)
2796{
2797        int order;
2798
2799        size = PAGE_ALIGN(size);
2800        order = get_order(size);
2801
2802        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2803        free_pages((unsigned long)vaddr, order);
2804}
2805
2806static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2807                           int nelems, enum dma_data_direction dir,
2808                           struct dma_attrs *attrs)
2809{
2810        struct pci_dev *pdev = to_pci_dev(hwdev);
2811        struct dmar_domain *domain;
2812        unsigned long start_pfn, last_pfn;
2813        struct iova *iova;
2814        struct intel_iommu *iommu;
2815
2816        if (iommu_no_mapping(hwdev))
2817                return;
2818
2819        domain = find_domain(pdev);
2820        BUG_ON(!domain);
2821
2822        iommu = domain_get_iommu(domain);
2823
2824        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2825        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2826                      (unsigned long long)sglist[0].dma_address))
2827                return;
2828
2829        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2830        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2831
2832        /*  clear the whole page */
2833        dma_pte_clear_range(domain, start_pfn, last_pfn);
2834
2835        /* free page tables */
2836        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2837
2838        if (intel_iommu_strict) {
2839                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2840                                      last_pfn - start_pfn + 1);
2841                /* free iova */
2842                __free_iova(&domain->iovad, iova);
2843        } else {
2844                add_unmap(domain, iova);
2845                /*
2846                 * queue up the release of the unmap to save the 1/6th of the
2847                 * cpu time used up by the iotlb flush operation...
2848                 */
2849        }
2850}
2851
2852static int intel_nontranslate_map_sg(struct device *hddev,
2853        struct scatterlist *sglist, int nelems, int dir)
2854{
2855        int i;
2856        struct scatterlist *sg;
2857
2858        for_each_sg(sglist, sg, nelems, i) {
2859                BUG_ON(!sg_page(sg));
2860                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2861                sg->dma_length = sg->length;
2862        }
2863        return nelems;
2864}
2865
2866static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2867                        enum dma_data_direction dir, struct dma_attrs *attrs)
2868{
2869        int i;
2870        struct pci_dev *pdev = to_pci_dev(hwdev);
2871        struct dmar_domain *domain;
2872        size_t size = 0;
2873        int prot = 0;
2875        struct iova *iova = NULL;
2876        int ret;
2877        struct scatterlist *sg;
2878        unsigned long start_vpfn;
2879        struct intel_iommu *iommu;
2880
2881        BUG_ON(dir == DMA_NONE);
2882        if (iommu_no_mapping(hwdev))
2883                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2884
2885        domain = get_valid_domain_for_dev(pdev);
2886        if (!domain)
2887                return 0;
2888
2889        iommu = domain_get_iommu(domain);
2890
2891        for_each_sg(sglist, sg, nelems, i)
2892                size += aligned_nrpages(sg->offset, sg->length);
2893
2894        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2895                                pdev->dma_mask);
2896        if (!iova) {
2897                sglist->dma_length = 0;
2898                return 0;
2899        }
2900
2901        /*
2902         * Check if DMAR supports zero-length reads on write-only
2903         * mappings.
2904         */
2905        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2906                        !cap_zlr(iommu->cap))
2907                prot |= DMA_PTE_READ;
2908        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2909                prot |= DMA_PTE_WRITE;
2910
2911        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2912
2913        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2914        if (unlikely(ret)) {
2915                /*  clear the page */
2916                dma_pte_clear_range(domain, start_vpfn,
2917                                    start_vpfn + size - 1);
2918                /* free page tables */
2919                dma_pte_free_pagetable(domain, start_vpfn,
2920                                       start_vpfn + size - 1);
2921                /* free iova */
2922                __free_iova(&domain->iovad, iova);
2923                return 0;
2924        }
2925
2926        /* it's a non-present to present mapping. Only flush if caching mode */
2927        if (cap_caching_mode(iommu->cap))
2928                iommu_flush_iotlb_psi(iommu, 0, start_vpfn, size);
2929        else
2930                iommu_flush_write_buffer(iommu);
2931
2932        return nelems;
2933}
2934
2935static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2936{
2937        return !dma_addr;
2938}
2939
2940struct dma_map_ops intel_dma_ops = {
2941        .alloc_coherent = intel_alloc_coherent,
2942        .free_coherent = intel_free_coherent,
2943        .map_sg = intel_map_sg,
2944        .unmap_sg = intel_unmap_sg,
2945        .map_page = intel_map_page,
2946        .unmap_page = intel_unmap_page,
2947        .mapping_error = intel_mapping_error,
2948};
2949
2950static inline int iommu_domain_cache_init(void)
2951{
2952        int ret = 0;
2953
2954        iommu_domain_cache = kmem_cache_create("iommu_domain",
2955                                         sizeof(struct dmar_domain),
2956                                         0,
2957                                         SLAB_HWCACHE_ALIGN,
2958                                         NULL);
2960        if (!iommu_domain_cache) {
2961                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2962                ret = -ENOMEM;
2963        }
2964
2965        return ret;
2966}
2967
2968static inline int iommu_devinfo_cache_init(void)
2969{
2970        int ret = 0;
2971
2972        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2973                                         sizeof(struct device_domain_info),
2974                                         0,
2975                                         SLAB_HWCACHE_ALIGN,
2976                                         NULL);
2977        if (!iommu_devinfo_cache) {
2978                printk(KERN_ERR "Couldn't create devinfo cache\n");
2979                ret = -ENOMEM;
2980        }
2981
2982        return ret;
2983}
2984
2985static inline int iommu_iova_cache_init(void)
2986{
2987        int ret = 0;
2988
2989        iommu_iova_cache = kmem_cache_create("iommu_iova",
2990                                         sizeof(struct iova),
2991                                         0,
2992                                         SLAB_HWCACHE_ALIGN,
2993                                         NULL);
2994        if (!iommu_iova_cache) {
2995                printk(KERN_ERR "Couldn't create iova cache\n");
2996                ret = -ENOMEM;
2997        }
2998
2999        return ret;
3000}
3001
3002static int __init iommu_init_mempool(void)
3003{
3004        int ret;
3005        ret = iommu_iova_cache_init();
3006        if (ret)
3007                return ret;
3008
3009        ret = iommu_domain_cache_init();
3010        if (ret)
3011                goto domain_error;
3012
3013        ret = iommu_devinfo_cache_init();
3014        if (!ret)
3015                return ret;
3016
3017        kmem_cache_destroy(iommu_domain_cache);
3018domain_error:
3019        kmem_cache_destroy(iommu_iova_cache);
3020
3021        return -ENOMEM;
3022}
3023
3024static void __init iommu_exit_mempool(void)
3025{
3026        kmem_cache_destroy(iommu_devinfo_cache);
3027        kmem_cache_destroy(iommu_domain_cache);
3028        kmem_cache_destroy(iommu_iova_cache);
3029
3030}
3031
3032static void __init init_no_remapping_devices(void)
3033{
3034        struct dmar_drhd_unit *drhd;
3035
3036        for_each_drhd_unit(drhd) {
3037                if (!drhd->include_all) {
3038                        int i;
3039                        for (i = 0; i < drhd->devices_cnt; i++)
3040                                if (drhd->devices[i] != NULL)
3041                                        break;
3042                        /* ignore DMAR unit if no pci devices exist */
3043                        if (i == drhd->devices_cnt)
3044                                drhd->ignored = 1;
3045                }
3046        }
3047
3048        if (dmar_map_gfx)
3049                return;
3050
3051        for_each_drhd_unit(drhd) {
3052                int i;
3053                if (drhd->ignored || drhd->include_all)
3054                        continue;
3055
3056                for (i = 0; i < drhd->devices_cnt; i++)
3057                        if (drhd->devices[i] &&
3058                                !IS_GFX_DEVICE(drhd->devices[i]))
3059                                break;
3060
3061                if (i < drhd->devices_cnt)
3062                        continue;
3063
3064                /* bypass IOMMU if it is just for gfx devices */
3065                drhd->ignored = 1;
3066                for (i = 0; i < drhd->devices_cnt; i++) {
3067                        if (!drhd->devices[i])
3068                                continue;
3069                        drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3070                }
3071        }
3072}
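
/*
 * Summary of the above: a DRHD unit that governs no PCI devices at all is
 * ignored, and when dmar_map_gfx is clear a unit covering only graphics
 * devices is ignored too, with its devices marked DUMMY_DEVICE_DOMAIN_INFO
 * so that the DMA API bypasses translation for them.
 */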
3073
3074#ifdef CONFIG_SUSPEND
3075static int init_iommu_hw(void)
3076{
3077        struct dmar_drhd_unit *drhd;
3078        struct intel_iommu *iommu = NULL;
3079
3080        for_each_active_iommu(iommu, drhd)
3081                if (iommu->qi)
3082                        dmar_reenable_qi(iommu);
3083
3084        for_each_active_iommu(iommu, drhd) {
3085                iommu_flush_write_buffer(iommu);
3086
3087                iommu_set_root_entry(iommu);
3088
3089                iommu->flush.flush_context(iommu, 0, 0, 0,
3090                                           DMA_CCMD_GLOBAL_INVL);
3091                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3092                                         DMA_TLB_GLOBAL_FLUSH);
3093                iommu_enable_translation(iommu);
3094                iommu_disable_protect_mem_regions(iommu);
3095        }
3096
3097        return 0;
3098}
3099
3100static void iommu_flush_all(void)
3101{
3102        struct dmar_drhd_unit *drhd;
3103        struct intel_iommu *iommu;
3104
3105        for_each_active_iommu(iommu, drhd) {
3106                iommu->flush.flush_context(iommu, 0, 0, 0,
3107                                           DMA_CCMD_GLOBAL_INVL);
3108                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3109                                         DMA_TLB_GLOBAL_FLUSH);
3110        }
3111}
3112
3113static int iommu_suspend(struct sys_device *dev, pm_message_t state)
3114{
3115        struct dmar_drhd_unit *drhd;
3116        struct intel_iommu *iommu = NULL;
3117        unsigned long flag;
3118
3119        for_each_active_iommu(iommu, drhd) {
3120                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3121                                                 GFP_ATOMIC);
3122                if (!iommu->iommu_state)
3123                        goto nomem;
3124        }
3125
3126        iommu_flush_all();
3127
3128        for_each_active_iommu(iommu, drhd) {
3129                iommu_disable_translation(iommu);
3130
3131                spin_lock_irqsave(&iommu->register_lock, flag);
3132
3133                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3134                        readl(iommu->reg + DMAR_FECTL_REG);
3135                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3136                        readl(iommu->reg + DMAR_FEDATA_REG);
3137                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3138                        readl(iommu->reg + DMAR_FEADDR_REG);
3139                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3140                        readl(iommu->reg + DMAR_FEUADDR_REG);
3141
3142                spin_unlock_irqrestore(&iommu->register_lock, flag);
3143        }
3144        return 0;
3145
3146nomem:
3147        for_each_active_iommu(iommu, drhd)
3148                kfree(iommu->iommu_state);
3149
3150        return -ENOMEM;
3151}
3152
3153static int iommu_resume(struct sys_device *dev)
3154{
3155        struct dmar_drhd_unit *drhd;
3156        struct intel_iommu *iommu = NULL;
3157        unsigned long flag;
3158
3159        if (init_iommu_hw()) {
3160                WARN(1, "IOMMU setup failed, DMAR cannot resume!\n");
3161                return -EIO;
3162        }
3163
3164        for_each_active_iommu(iommu, drhd) {
3165
3166                spin_lock_irqsave(&iommu->register_lock, flag);
3167
3168                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3169                        iommu->reg + DMAR_FECTL_REG);
3170                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3171                        iommu->reg + DMAR_FEDATA_REG);
3172                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3173                        iommu->reg + DMAR_FEADDR_REG);
3174                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3175                        iommu->reg + DMAR_FEUADDR_REG);
3176
3177                spin_unlock_irqrestore(&iommu->register_lock, flag);
3178        }
3179
3180        for_each_active_iommu(iommu, drhd)
3181                kfree(iommu->iommu_state);
3182
3183        return 0;
3184}
3185
3186static struct sysdev_class iommu_sysclass = {
3187        .name           = "iommu",
3188        .resume         = iommu_resume,
3189        .suspend        = iommu_suspend,
3190};
3191
3192static struct sys_device device_iommu = {
3193        .cls    = &iommu_sysclass,
3194};
3195
3196static int __init init_iommu_sysfs(void)
3197{
3198        int error;
3199
3200        error = sysdev_class_register(&iommu_sysclass);
3201        if (error)
3202                return error;
3203
3204        error = sysdev_register(&device_iommu);
3205        if (error)
3206                sysdev_class_unregister(&iommu_sysclass);
3207
3208        return error;
3209}
3210
3211#else
3212static int __init init_iommu_sysfs(void)
3213{
3214        return 0;
3215}
3216#endif  /* CONFIG_SUSPEND */
3217
3218/*
3219 * Here we only respond to the driver-unbind action.
3220 *
3221 * A newly added device is not attached to its DMAR domain here yet; that
3222 * happens when the device is first mapped to an iova.
3223 */
3224static int device_notifier(struct notifier_block *nb,
3225                                  unsigned long action, void *data)
3226{
3227        struct device *dev = data;
3228        struct pci_dev *pdev = to_pci_dev(dev);
3229        struct dmar_domain *domain;
3230
3231        domain = find_domain(pdev);
3232        if (!domain)
3233                return 0;
3234
3235        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through)
3236                domain_remove_one_dev_info(domain, pdev);
3237
3238        return 0;
3239}
3240
3241static struct notifier_block device_nb = {
3242        .notifier_call = device_notifier,
3243};
3244
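/*
 * Top-level VT-d initialization: parse the DMAR table and device scopes
 * (panicking if a tboot launch demanded VT-d and they fail), bail out when
 * remapping is disabled or swiotlb is in use, set up the mempools and
 * reserved IOVA ranges, filter out DRHDs that need no remapping, program
 * the hardware via init_dmars(), and finally install intel_dma_ops, the
 * generic IOMMU ops and the PCI bus notifier.
 */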
3245int __init intel_iommu_init(void)
3246{
3247        int ret = 0;
3248        int force_on = 0;
3249
3250        /* VT-d is required for a TXT/tboot launch, so enforce that */
3251        force_on = tboot_force_iommu();
3252
3253        if (dmar_table_init()) {
3254                if (force_on)
3255                        panic("tboot: Failed to initialize DMAR table\n");
3256                return  -ENODEV;
3257        }
3258
3259        if (dmar_dev_scope_init()) {
3260                if (force_on)
3261                        panic("tboot: Failed to initialize DMAR device scope\n");
3262                return  -ENODEV;
3263        }
3264
3265        /*
3266         * Check whether DMA-remapping initialization is needed now.
3267         * The initialization above is also used by interrupt remapping.
3268         */
3269        if (no_iommu || swiotlb || dmar_disabled)
3270                return -ENODEV;
3271
3272        iommu_init_mempool();
3273        dmar_init_reserved_ranges();
3274
3275        init_no_remapping_devices();
3276
3277        ret = init_dmars();
3278        if (ret) {
3279                if (force_on)
3280                        panic("tboot: Failed to initialize DMARs\n");
3281                printk(KERN_ERR "IOMMU: dmar init failed\n");
3282                put_iova_domain(&reserved_iova_list);
3283                iommu_exit_mempool();
3284                return ret;
3285        }
3286        printk(KERN_INFO
3287        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3288
3289        init_timer(&unmap_timer);
3290        force_iommu = 1;
3291        dma_ops = &intel_dma_ops;
3292
3293        init_iommu_sysfs();
3294
3295        register_iommu(&intel_iommu_ops);
3296
3297        bus_register_notifier(&pci_bus_type, &device_nb);
3298
3299        return 0;
3300}
3301
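/*
 * When a device sits behind (a chain of) PCIe-to-PCI bridges, context
 * entries were also set up for those bridges at attach time; tear them
 * down as well, walking from the device's parent bus up to the upstream
 * PCIe bridge.
 */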
3302static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3303                                           struct pci_dev *pdev)
3304{
3305        struct pci_dev *tmp, *parent;
3306
3307        if (!iommu || !pdev)
3308                return;
3309
3310        /* dependent device detach */
3311        tmp = pci_find_upstream_pcie_bridge(pdev);
3312        /* Secondary interface's bus number and devfn 0 */
3313        if (tmp) {
3314                parent = pdev->bus->self;
3315                while (parent != tmp) {
3316                        iommu_detach_dev(iommu, parent->bus->number,
3317                                         parent->devfn);
3318                        parent = parent->bus->self;
3319                }
3320                if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
3321                        iommu_detach_dev(iommu,
3322                                tmp->subordinate->number, 0);
3323                else /* this is a legacy PCI bridge */
3324                        iommu_detach_dev(iommu, tmp->bus->number,
3325                                         tmp->devfn);
3326        }
3327}
3328
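/*
 * Detach one PCI device from the domain: unlink its device_domain_info,
 * clear its archdata, disable its device-IOTLB and tear down its context
 * entries (including dependent bridges).  If no other device on the same
 * IOMMU is left in the domain, drop that IOMMU from iommu_bmp and
 * recompute the domain's iommu count and capabilities.
 */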
3329static void domain_remove_one_dev_info(struct dmar_domain *domain,
3330                                          struct pci_dev *pdev)
3331{
3332        struct device_domain_info *info;
3333        struct intel_iommu *iommu;
3334        unsigned long flags;
3335        int found = 0;
3336        struct list_head *entry, *tmp;
3337
3338        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3339                                pdev->devfn);
3340        if (!iommu)
3341                return;
3342
3343        spin_lock_irqsave(&device_domain_lock, flags);
3344        list_for_each_safe(entry, tmp, &domain->devices) {
3345                info = list_entry(entry, struct device_domain_info, link);
3346                /* No need to compare PCI domain; it has to be the same */
3347                if (info->bus == pdev->bus->number &&
3348                    info->devfn == pdev->devfn) {
3349                        list_del(&info->link);
3350                        list_del(&info->global);
3351                        if (info->dev)
3352                                info->dev->dev.archdata.iommu = NULL;
3353                        spin_unlock_irqrestore(&device_domain_lock, flags);
3354
3355                        iommu_disable_dev_iotlb(info);
3356                        iommu_detach_dev(iommu, info->bus, info->devfn);
3357                        iommu_detach_dependent_devices(iommu, pdev);
3358                        free_devinfo_mem(info);
3359
3360                        spin_lock_irqsave(&device_domain_lock, flags);
3361
3362                        if (found)
3363                                break;
3364                        else
3365                                continue;
3366                }
3367
3368                /* If there are no other devices under the same iommu
3369                 * owned by this domain, clear this iommu in iommu_bmp and
3370                 * update the iommu count and coherency.
3371                 */
3372                if (iommu == device_to_iommu(info->segment, info->bus,
3373                                            info->devfn))
3374                        found = 1;
3375        }
3376
3377        if (found == 0) {
3378                unsigned long tmp_flags;
3379                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3380                clear_bit(iommu->seq_id, &domain->iommu_bmp);
3381                domain->iommu_count--;
3382                domain_update_iommu_cap(domain);
3383                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3384        }
3385
3386        spin_unlock_irqrestore(&device_domain_lock, flags);
3387}
3388
3389static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3390{
3391        struct device_domain_info *info;
3392        struct intel_iommu *iommu;
3393        unsigned long flags1, flags2;
3394
3395        spin_lock_irqsave(&device_domain_lock, flags1);
3396        while (!list_empty(&domain->devices)) {
3397                info = list_entry(domain->devices.next,
3398                        struct device_domain_info, link);
3399                list_del(&info->link);
3400                list_del(&info->global);
3401                if (info->dev)
3402                        info->dev->dev.archdata.iommu = NULL;
3403
3404                spin_unlock_irqrestore(&device_domain_lock, flags1);
3405
3406                iommu_disable_dev_iotlb(info);
3407                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3408                iommu_detach_dev(iommu, info->bus, info->devfn);
3409                iommu_detach_dependent_devices(iommu, info->dev);
3410
3411                /* clear this iommu in iommu_bmp, update iommu count
3412                 * and capabilities
3413                 */
3414                spin_lock_irqsave(&domain->iommu_lock, flags2);
3415                if (test_and_clear_bit(iommu->seq_id,
3416                                       &domain->iommu_bmp)) {
3417                        domain->iommu_count--;
3418                        domain_update_iommu_cap(domain);
3419                }
3420                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3421
3422                free_devinfo_mem(info);
3423                spin_lock_irqsave(&device_domain_lock, flags1);
3424        }
3425        spin_unlock_irqrestore(&device_domain_lock, flags1);
3426}
3427
3428/* domain id for virtual machine, it won't be set in context */
3429static unsigned long vm_domid;
3430
3431static int vm_domain_min_agaw(struct dmar_domain *domain)
3432{
3433        int i;
3434        int min_agaw = domain->agaw;
3435
3436        i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
3437        for (; i < g_num_of_iommus; ) {
3438                if (min_agaw > g_iommus[i]->agaw)
3439                        min_agaw = g_iommus[i]->agaw;
3440
3441                i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
3442        }
3443
3444        return min_agaw;
3445}
3446
3447static struct dmar_domain *iommu_alloc_vm_domain(void)
3448{
3449        struct dmar_domain *domain;
3450
3451        domain = alloc_domain_mem();
3452        if (!domain)
3453                return NULL;
3454
3455        domain->id = vm_domid++;
3456        memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3457        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3458
3459        return domain;
3460}
3461
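/*
 * Initialize a VM domain created by iommu_alloc_vm_domain(): set up its
 * IOVA allocator and reserved ranges, derive the adjusted agaw from the
 * requested guest address width, and allocate the top-level page directory.
 */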
3462static int md_domain_init(struct dmar_domain *domain, int guest_width)
3463{
3464        int adjust_width;
3465
3466        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3467        spin_lock_init(&domain->iommu_lock);
3468
3469        domain_reserve_special_ranges(domain);
3470
3471        /* calculate AGAW */
3472        domain->gaw = guest_width;
3473        adjust_width = guestwidth_to_adjustwidth(guest_width);
3474        domain->agaw = width_to_agaw(adjust_width);
3475
3476        INIT_LIST_HEAD(&domain->devices);
3477
3478        domain->iommu_count = 0;
3479        domain->iommu_coherency = 0;
3480        domain->iommu_snooping = 0;
3481        domain->max_addr = 0;
3482
3483        /* always allocate the top pgd */
3484        domain->pgd = (struct dma_pte *)alloc_pgtable_page();
3485        if (!domain->pgd)
3486                return -ENOMEM;
3487        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3488        return 0;
3489}
3490
3491static void iommu_free_vm_domain(struct dmar_domain *domain)
3492{
3493        unsigned long flags;
3494        struct dmar_drhd_unit *drhd;
3495        struct intel_iommu *iommu;
3496        unsigned long i;
3497        unsigned long ndomains;
3498
3499        for_each_drhd_unit(drhd) {
3500                if (drhd->ignored)
3501                        continue;
3502                iommu = drhd->iommu;
3503
3504                ndomains = cap_ndoms(iommu->cap);
3505                i = find_first_bit(iommu->domain_ids, ndomains);
3506                for (; i < ndomains; ) {
3507                        if (iommu->domains[i] == domain) {
3508                                spin_lock_irqsave(&iommu->lock, flags);
3509                                clear_bit(i, iommu->domain_ids);
3510                                iommu->domains[i] = NULL;
3511                                spin_unlock_irqrestore(&iommu->lock, flags);
3512                                break;
3513                        }
3514                        i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3515                }
3516        }
3517}
3518
3519static void vm_domain_exit(struct dmar_domain *domain)
3520{
3521        /* Domain 0 is reserved, so don't process it */
3522        if (!domain)
3523                return;
3524
3525        vm_domain_remove_all_dev_info(domain);
3526        /* destroy iovas */
3527        put_iova_domain(&domain->iovad);
3528
3529        /* clear ptes */
3530        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3531
3532        /* free page tables */
3533        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3534
3535        iommu_free_vm_domain(domain);
3536        free_domain_mem(domain);
3537}
3538
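/*
 * The functions below implement the generic struct iommu_ops interface
 * (see intel_iommu_ops near the end of this file), which is how clients
 * such as KVM device assignment drive VT-d without going through the
 * DMA-API path above.
 */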
3539static int intel_iommu_domain_init(struct iommu_domain *domain)
3540{
3541        struct dmar_domain *dmar_domain;
3542
3543        dmar_domain = iommu_alloc_vm_domain();
3544        if (!dmar_domain) {
3545                printk(KERN_ERR
3546                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3547                return -ENOMEM;
3548        }
3549        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3550                printk(KERN_ERR
3551                        "intel_iommu_domain_init() failed\n");
3552                vm_domain_exit(dmar_domain);
3553                return -ENOMEM;
3554        }
3555        domain->priv = dmar_domain;
3556
3557        return 0;
3558}
3559
3560static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3561{
3562        struct dmar_domain *dmar_domain = domain->priv;
3563
3564        domain->priv = NULL;
3565        vm_domain_exit(dmar_domain);
3566}
3567
3568static int intel_iommu_attach_device(struct iommu_domain *domain,
3569                                     struct device *dev)
3570{
3571        struct dmar_domain *dmar_domain = domain->priv;
3572        struct pci_dev *pdev = to_pci_dev(dev);
3573        struct intel_iommu *iommu;
3574        int addr_width;
3575        u64 end;
3576
3577        /* normally pdev is not mapped */
3578        if (unlikely(domain_context_mapped(pdev))) {
3579                struct dmar_domain *old_domain;
3580
3581                old_domain = find_domain(pdev);
3582                if (old_domain) {
3583                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3584                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3585                                domain_remove_one_dev_info(old_domain, pdev);
3586                        else
3587                                domain_remove_dev_info(old_domain);
3588                }
3589        }
3590
3591        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3592                                pdev->devfn);
3593        if (!iommu)
3594                return -ENODEV;
3595
3596        /* check if this iommu agaw is sufficient for max mapped address */
3597        addr_width = agaw_to_width(iommu->agaw);
3598        end = DOMAIN_MAX_ADDR(addr_width);
3599        end = end & VTD_PAGE_MASK;
3600        if (end < dmar_domain->max_addr) {
3601                printk(KERN_ERR "%s: iommu agaw (%d) is not "
3602                       "sufficient for the mapped address (%llx)\n",
3603                       __func__, iommu->agaw, dmar_domain->max_addr);
3604                return -EFAULT;
3605        }
3606
3607        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3608}
3609
3610static void intel_iommu_detach_device(struct iommu_domain *domain,
3611                                      struct device *dev)
3612{
3613        struct dmar_domain *dmar_domain = domain->priv;
3614        struct pci_dev *pdev = to_pci_dev(dev);
3615
3616        domain_remove_one_dev_info(dmar_domain, pdev);
3617}
3618
3619static int intel_iommu_map_range(struct iommu_domain *domain,
3620                                 unsigned long iova, phys_addr_t hpa,
3621                                 size_t size, int iommu_prot)
3622{
3623        struct dmar_domain *dmar_domain = domain->priv;
3624        u64 max_addr;
3625        int addr_width;
3626        int prot = 0;
3627        int ret;
3628
3629        if (iommu_prot & IOMMU_READ)
3630                prot |= DMA_PTE_READ;
3631        if (iommu_prot & IOMMU_WRITE)
3632                prot |= DMA_PTE_WRITE;
3633        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3634                prot |= DMA_PTE_SNP;
3635
3636        max_addr = iova + size;
3637        if (dmar_domain->max_addr < max_addr) {
3638                int min_agaw;
3639                u64 end;
3640
3641                /* check if minimum agaw is sufficient for mapped address */
3642                min_agaw = vm_domain_min_agaw(dmar_domain);
3643                addr_width = agaw_to_width(min_agaw);
3644                end = DOMAIN_MAX_ADDR(addr_width);
3645                end = end & VTD_PAGE_MASK;
3646                if (end < max_addr) {
3647                        printk(KERN_ERR "%s: iommu agaw (%d) is not "
3648                               "sufficient for the mapped address (%llx)\n",
3649                               __func__, min_agaw, max_addr);
3650                        return -EFAULT;
3651                }
3652                dmar_domain->max_addr = max_addr;
3653        }
3654        /* Round up size to next multiple of PAGE_SIZE, if it and
3655           the low bits of hpa would take us onto the next page */
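        /* Illustrative example: with 4KiB VT-d pages, hpa low bits of 0x800
           and size 0x1000 cover two pages, so two PTEs must be written. */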
3656        size = aligned_nrpages(hpa, size);
3657        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3658                                 hpa >> VTD_PAGE_SHIFT, size, prot);
3659        return ret;
3660}
3661
3662static void intel_iommu_unmap_range(struct iommu_domain *domain,
3663                                    unsigned long iova, size_t size)
3664{
3665        struct dmar_domain *dmar_domain = domain->priv;
3666
3667        if (!size)
3668                return;
3669
3670        dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3671                            (iova + size - 1) >> VTD_PAGE_SHIFT);
3672
3673        if (dmar_domain->max_addr == iova + size)
3674                dmar_domain->max_addr = iova;
3675}
3676
3677static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3678                                            unsigned long iova)
3679{
3680        struct dmar_domain *dmar_domain = domain->priv;
3681        struct dma_pte *pte;
3682        u64 phys = 0;
3683
3684        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
3685        if (pte)
3686                phys = dma_pte_addr(pte);
3687
3688        return phys;
3689}
3690
3691static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3692                                      unsigned long cap)
3693{
3694        struct dmar_domain *dmar_domain = domain->priv;
3695
3696        if (cap == IOMMU_CAP_CACHE_COHERENCY)
3697                return dmar_domain->iommu_snooping;
3698
3699        return 0;
3700}
3701
3702static struct iommu_ops intel_iommu_ops = {
3703        .domain_init    = intel_iommu_domain_init,
3704        .domain_destroy = intel_iommu_domain_destroy,
3705        .attach_dev     = intel_iommu_attach_device,
3706        .detach_dev     = intel_iommu_detach_device,
3707        .map            = intel_iommu_map_range,
3708        .unmap          = intel_iommu_unmap_range,
3709        .iova_to_phys   = intel_iommu_iova_to_phys,
3710        .domain_has_cap = intel_iommu_domain_has_cap,
3711};
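/*
 * Rough usage sketch (assuming the generic IOMMU API wrappers of this
 * kernel generation, declared in include/linux/iommu.h, as used e.g. by
 * KVM device assignment):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	if (dom && !iommu_attach_device(dom, &pdev->dev))
 *		iommu_map_range(dom, iova, hpa, size,
 *				IOMMU_READ | IOMMU_WRITE);
 *
 * which reaches intel_iommu_domain_init(), intel_iommu_attach_device()
 * and intel_iommu_map_range() above through these ops.
 */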
3712
3713static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3714{
3715        /*
3716         * Mobile 4 Series Chipset neglects to set RWBF capability,
3717         * but needs it:
3718         */
3719        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3720        rwbf_quirk = 1;
3721}
3722
3723DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3724
3725/* On Tylersburg chipsets, some BIOSes have been known to enable the
3726   ISOCH DMAR unit for the Azalia sound device, but not give it any
3727   TLB entries, which causes it to deadlock. Check for that.  We do
3728   this in a function called from init_dmars(), instead of in a PCI
3729   quirk, because we don't want to print the obnoxious "BIOS broken"
3730   message if VT-d is actually disabled.
3731*/
3732static void __init check_tylersburg_isoch(void)
3733{
3734        struct pci_dev *pdev;
3735        uint32_t vtisochctrl;
3736
3737        /* If there's no Azalia in the system anyway, forget it. */
3738        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3739        if (!pdev)
3740                return;
3741        pci_dev_put(pdev);
3742
3743        /* System Management Registers. Might be hidden, in which case
3744           we can't do the sanity check. But that's OK, because the
3745           known-broken BIOSes _don't_ actually hide it, so far. */
3746        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3747        if (!pdev)
3748                return;
3749
3750        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
3751                pci_dev_put(pdev);
3752                return;
3753        }
3754
3755        pci_dev_put(pdev);
3756
3757        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
3758        if (vtisochctrl & 1)
3759                return;
3760
3761        /* Drop all bits other than the number of TLB entries */
3762        vtisochctrl &= 0x1c;
3763
3764        /* If we have the recommended number of TLB entries (16), fine. */
3765        if (vtisochctrl == 0x10)
3766                return;
3767
3768        /* Zero TLB entries? You get to ride the short bus to school. */
3769        if (!vtisochctrl) {
3770                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
3771                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3772                     dmi_get_system_info(DMI_BIOS_VENDOR),
3773                     dmi_get_system_info(DMI_BIOS_VERSION),
3774                     dmi_get_system_info(DMI_PRODUCT_VERSION));
3775                iommu_identity_mapping |= IDENTMAP_AZALIA;
3776                return;
3777        }
3778
3779        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
3780               vtisochctrl);
3781}
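/*
 * Note on the check above: VTISOCHCTRL is read from config offset 0x188 of
 * the System Management Registers device; after masking with 0x1c the code
 * interprets the remaining value directly as the ISOCH unit's TLB-entry
 * count, so 0x10 == 16 entries is the recommended value and 0 forces the
 * Azalia device onto an identity mapping (IDENTMAP_AZALIA).
 */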
3782