linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright © 2006-2014 Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * Authors: David Woodhouse <dwmw2@infradead.org>,
  14 *          Ashok Raj <ashok.raj@intel.com>,
  15 *          Shaohua Li <shaohua.li@intel.com>,
  16 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17 *          Fenghua Yu <fenghua.yu@intel.com>
  18 */
  19
  20#include <linux/init.h>
  21#include <linux/bitmap.h>
  22#include <linux/debugfs.h>
  23#include <linux/export.h>
  24#include <linux/slab.h>
  25#include <linux/irq.h>
  26#include <linux/interrupt.h>
  27#include <linux/spinlock.h>
  28#include <linux/pci.h>
  29#include <linux/dmar.h>
  30#include <linux/dma-mapping.h>
  31#include <linux/mempool.h>
  32#include <linux/memory.h>
  33#include <linux/timer.h>
  34#include <linux/iova.h>
  35#include <linux/iommu.h>
  36#include <linux/intel-iommu.h>
  37#include <linux/syscore_ops.h>
  38#include <linux/tboot.h>
  39#include <linux/dmi.h>
  40#include <linux/pci-ats.h>
  41#include <linux/memblock.h>
  42#include <linux/dma-contiguous.h>
  43#include <asm/irq_remapping.h>
  44#include <asm/cacheflush.h>
  45#include <asm/iommu.h>
  46
  47#include "irq_remapping.h"
  48#include "pci.h"
  49
  50#define ROOT_SIZE               VTD_PAGE_SIZE
  51#define CONTEXT_SIZE            VTD_PAGE_SIZE
  52
  53#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  54#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  55#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  56
  57#define IOAPIC_RANGE_START      (0xfee00000)
  58#define IOAPIC_RANGE_END        (0xfeefffff)
  59#define IOVA_START_ADDR         (0x1000)
  60
  61#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  62
  63#define MAX_AGAW_WIDTH 64
  64#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  65
  66#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  67#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  68
  69/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  70   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  71#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  72                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  73#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  74
  75#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  76#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  77#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  78
  79/* page table handling */
  80#define LEVEL_STRIDE            (9)
  81#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  82
  83/*
   84 * This bitmap is used to advertise the page sizes our hardware supports
  85 * to the IOMMU core, which will then use this information to split
  86 * physically contiguous memory regions it is mapping into page sizes
  87 * that we support.
  88 *
  89 * Traditionally the IOMMU core just handed us the mappings directly,
   90 * after making sure the size is a power-of-two multiple of 4KiB and that
   91 * the mapping has natural alignment.
  92 *
  93 * To retain this behavior, we currently advertise that we support
   94 * all page sizes that are power-of-two multiples of 4KiB.
  95 *
  96 * If at some point we'd like to utilize the IOMMU core's new behavior,
  97 * we could change this to advertise the real page sizes we support.
  98 */
  99#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
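     /*
      * In the IOMMU core's pgsize convention, bit N of this bitmap stands
      * for support of page size 1UL << N; ~0xFFFUL (every bit from 12
      * upwards) therefore advertises all power-of-two sizes from 4KiB up.
      */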
 100
 101static inline int agaw_to_level(int agaw)
 102{
 103        return agaw + 2;
 104}
 105
 106static inline int agaw_to_width(int agaw)
 107{
 108        return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 109}
 110
 111static inline int width_to_agaw(int width)
 112{
 113        return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 114}
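     /*
      * For example, the default 48-bit domain width gives agaw 2, i.e. a
      * 4-level page table (agaw_to_level(2) == 4), while a 39-bit width
      * gives agaw 1 and a 3-level table.
      */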
 115
 116static inline unsigned int level_to_offset_bits(int level)
 117{
 118        return (level - 1) * LEVEL_STRIDE;
 119}
 120
 121static inline int pfn_level_offset(unsigned long pfn, int level)
 122{
 123        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 124}
 125
 126static inline unsigned long level_mask(int level)
 127{
 128        return -1UL << level_to_offset_bits(level);
 129}
 130
 131static inline unsigned long level_size(int level)
 132{
 133        return 1UL << level_to_offset_bits(level);
 134}
 135
 136static inline unsigned long align_to_level(unsigned long pfn, int level)
 137{
 138        return (pfn + level_size(level) - 1) & level_mask(level);
 139}
 140
 141static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 142{
 143        return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 144}
 145
  146/* VT-d pages must never be _larger_ than MM pages. Otherwise things
  147   are never going to work. */
 148static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 149{
 150        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 151}
 152
 153static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 154{
 155        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 156}
 157static inline unsigned long page_to_dma_pfn(struct page *pg)
 158{
 159        return mm_to_dma_pfn(page_to_pfn(pg));
 160}
 161static inline unsigned long virt_to_dma_pfn(void *p)
 162{
 163        return page_to_dma_pfn(virt_to_page(p));
 164}
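     /*
      * With 4KiB kernel pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12) these
      * conversions are the identity; on a kernel using 64KiB pages each
      * mm pfn covers 16 consecutive VT-d (dma) pfns.
      */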
 165
 166/* global iommu list, set NULL for ignored DMAR units */
 167static struct intel_iommu **g_iommus;
 168
 169static void __init check_tylersburg_isoch(void);
 170static int rwbf_quirk;
 171
 172/*
  173 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
 174 * (used when kernel is launched w/ TXT)
 175 */
 176static int force_on = 0;
 177
 178/*
 179 * 0: Present
 180 * 1-11: Reserved
 181 * 12-63: Context Ptr (12 - (haw-1))
 182 * 64-127: Reserved
 183 */
 184struct root_entry {
 185        u64     val;
 186        u64     rsvd1;
 187};
 188#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
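     /*
      * With the usual 4KiB VTD_PAGE_SIZE this gives 256 root entries, one
      * per PCI bus; each present root entry points to a context table of
      * 256 context entries indexed by devfn.  Context tables are allocated
      * lazily by device_to_context_entry() below; the root table itself
      * comes from iommu_alloc_root_entry().
      */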
 189static inline bool root_present(struct root_entry *root)
 190{
 191        return (root->val & 1);
 192}
 193static inline void set_root_present(struct root_entry *root)
 194{
 195        root->val |= 1;
 196}
 197static inline void set_root_value(struct root_entry *root, unsigned long value)
 198{
 199        root->val |= value & VTD_PAGE_MASK;
 200}
 201
 202static inline struct context_entry *
 203get_context_addr_from_root(struct root_entry *root)
 204{
 205        return (struct context_entry *)
 206                (root_present(root)?phys_to_virt(
 207                root->val & VTD_PAGE_MASK) :
 208                NULL);
 209}
 210
 211/*
 212 * low 64 bits:
 213 * 0: present
 214 * 1: fault processing disable
 215 * 2-3: translation type
 216 * 12-63: address space root
 217 * high 64 bits:
 218 * 0-2: address width
 219 * 3-6: aval
 220 * 8-23: domain id
 221 */
 222struct context_entry {
 223        u64 lo;
 224        u64 hi;
 225};
 226
 227static inline bool context_present(struct context_entry *context)
 228{
 229        return (context->lo & 1);
 230}
 231static inline void context_set_present(struct context_entry *context)
 232{
 233        context->lo |= 1;
 234}
 235
 236static inline void context_set_fault_enable(struct context_entry *context)
 237{
 238        context->lo &= (((u64)-1) << 2) | 1;
 239}
 240
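     /*
      * The translation-type field set below selects how DMA requests are
      * handled: 0 translates through the page tables referenced by the
      * address-space root, 2 passes requests through untranslated; the
      * driver's CONTEXT_TT_* constants encode these values.
      */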
 241static inline void context_set_translation_type(struct context_entry *context,
 242                                                unsigned long value)
 243{
 244        context->lo &= (((u64)-1) << 4) | 3;
 245        context->lo |= (value & 3) << 2;
 246}
 247
 248static inline void context_set_address_root(struct context_entry *context,
 249                                            unsigned long value)
 250{
 251        context->lo |= value & VTD_PAGE_MASK;
 252}
 253
 254static inline void context_set_address_width(struct context_entry *context,
 255                                             unsigned long value)
 256{
 257        context->hi |= value & 7;
 258}
 259
 260static inline void context_set_domain_id(struct context_entry *context,
 261                                         unsigned long value)
 262{
 263        context->hi |= (value & ((1 << 16) - 1)) << 8;
 264}
 265
 266static inline void context_clear_entry(struct context_entry *context)
 267{
 268        context->lo = 0;
 269        context->hi = 0;
 270}
 271
 272/*
 273 * 0: readable
 274 * 1: writable
 275 * 2-6: reserved
 276 * 7: super page
 277 * 8-10: available
 278 * 11: snoop behavior
  279 * 12-63: Host physical address
 280 */
 281struct dma_pte {
 282        u64 val;
 283};
 284
 285static inline void dma_clear_pte(struct dma_pte *pte)
 286{
 287        pte->val = 0;
 288}
 289
 290static inline u64 dma_pte_addr(struct dma_pte *pte)
 291{
 292#ifdef CONFIG_64BIT
 293        return pte->val & VTD_PAGE_MASK;
 294#else
 295        /* Must have a full atomic 64-bit read */
 296        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 297#endif
 298}
 299
 300static inline bool dma_pte_present(struct dma_pte *pte)
 301{
 302        return (pte->val & 3) != 0;
 303}
 304
 305static inline bool dma_pte_superpage(struct dma_pte *pte)
 306{
 307        return (pte->val & (1 << 7));
 308}
 309
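     /*
      * A page-table page holds VTD_PAGE_SIZE / sizeof(struct dma_pte) ==
      * 512 entries; this returns true for the first entry of such a page,
      * which the range-walking loops below use to notice when they cross
      * into the next page-table page.
      */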
 310static inline int first_pte_in_page(struct dma_pte *pte)
 311{
 312        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 313}
 314
 315/*
 316 * This domain is a statically identity mapping domain.
  317 *      1. This domain creates a static 1:1 mapping to all usable memory.
 318 *      2. It maps to each iommu if successful.
  319 *      3. Each iommu maps to this domain if successful.
 320 */
 321static struct dmar_domain *si_domain;
 322static int hw_pass_through = 1;
 323
 324/* devices under the same p2p bridge are owned in one domain */
 325#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 326
  327/* domain represents a virtual machine, more than one device
 328 * across iommus may be owned in one domain, e.g. kvm guest.
 329 */
 330#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 331
  332/* si_domain contains multiple devices */
 333#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 334
 335/* define the limit of IOMMUs supported in each domain */
 336#ifdef  CONFIG_X86
 337# define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
 338#else
 339# define        IOMMU_UNITS_SUPPORTED   64
 340#endif
 341
 342struct dmar_domain {
 343        int     id;                     /* domain id */
 344        int     nid;                    /* node id */
 345        DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
 346                                        /* bitmap of iommus this domain uses*/
 347
 348        struct list_head devices;       /* all devices' list */
 349        struct iova_domain iovad;       /* iova's that belong to this domain */
 350
 351        struct dma_pte  *pgd;           /* virtual address */
 352        int             gaw;            /* max guest address width */
 353
 354        /* adjusted guest address width, 0 is level 2 30-bit */
 355        int             agaw;
 356
 357        int             flags;          /* flags to find out type of domain */
 358
 359        int             iommu_coherency;/* indicate coherency of iommu access */
 360        int             iommu_snooping; /* indicate snooping control feature*/
 361        int             iommu_count;    /* reference count of iommu */
 362        int             iommu_superpage;/* Level of superpages supported:
 363                                           0 == 4KiB (no superpages), 1 == 2MiB,
  364                                           2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
 365        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 366        u64             max_addr;       /* maximum mapped address */
 367};
 368
 369/* PCI domain-device relationship */
 370struct device_domain_info {
 371        struct list_head link;  /* link to domain siblings */
 372        struct list_head global; /* link to global list */
 373        u8 bus;                 /* PCI bus number */
 374        u8 devfn;               /* PCI devfn number */
 375        struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
 376        struct intel_iommu *iommu; /* IOMMU used by this device */
 377        struct dmar_domain *domain; /* pointer to domain */
 378};
 379
 380struct dmar_rmrr_unit {
 381        struct list_head list;          /* list of rmrr units   */
 382        struct acpi_dmar_header *hdr;   /* ACPI header          */
 383        u64     base_address;           /* reserved base address*/
 384        u64     end_address;            /* reserved end address */
 385        struct dmar_dev_scope *devices; /* target devices */
 386        int     devices_cnt;            /* target device count */
 387};
 388
 389struct dmar_atsr_unit {
 390        struct list_head list;          /* list of ATSR units */
 391        struct acpi_dmar_header *hdr;   /* ACPI header */
 392        struct dmar_dev_scope *devices; /* target devices */
 393        int devices_cnt;                /* target device count */
 394        u8 include_all:1;               /* include all ports */
 395};
 396
 397static LIST_HEAD(dmar_atsr_units);
 398static LIST_HEAD(dmar_rmrr_units);
 399
 400#define for_each_rmrr_units(rmrr) \
 401        list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 402
 403static void flush_unmaps_timeout(unsigned long data);
 404
 405static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 406
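     /*
      * IOVAs freed by the DMA unmap path are queued in these deferred
      * flush tables and only released from flush_unmaps_timeout(), once
      * the timer fires or HIGH_WATER_MARK entries pile up, so that IOTLB
      * flushes can be batched; intel_iommu_strict disables the batching.
      */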
 407#define HIGH_WATER_MARK 250
 408struct deferred_flush_tables {
 409        int next;
 410        struct iova *iova[HIGH_WATER_MARK];
 411        struct dmar_domain *domain[HIGH_WATER_MARK];
 412        struct page *freelist[HIGH_WATER_MARK];
 413};
 414
 415static struct deferred_flush_tables *deferred_flush;
 416
 417/* bitmap for indexing intel_iommus */
 418static int g_num_of_iommus;
 419
 420static DEFINE_SPINLOCK(async_umap_flush_lock);
 421static LIST_HEAD(unmaps_to_do);
 422
 423static int timer_on;
 424static long list_size;
 425
 426static void domain_exit(struct dmar_domain *domain);
 427static void domain_remove_dev_info(struct dmar_domain *domain);
 428static void domain_remove_one_dev_info(struct dmar_domain *domain,
 429                                       struct device *dev);
 430static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
 431                                           struct device *dev);
 432
 433#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 434int dmar_disabled = 0;
 435#else
 436int dmar_disabled = 1;
 437#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 438
 439int intel_iommu_enabled = 0;
 440EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 441
 442static int dmar_map_gfx = 1;
 443static int dmar_forcedac;
 444static int intel_iommu_strict;
 445static int intel_iommu_superpage = 1;
 446
 447int intel_iommu_gfx_mapped;
 448EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 449
 450#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 451static DEFINE_SPINLOCK(device_domain_lock);
 452static LIST_HEAD(device_domain_list);
 453
 454static struct iommu_ops intel_iommu_ops;
 455
 456static int __init intel_iommu_setup(char *str)
 457{
 458        if (!str)
 459                return -EINVAL;
 460        while (*str) {
 461                if (!strncmp(str, "on", 2)) {
 462                        dmar_disabled = 0;
 463                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 464                } else if (!strncmp(str, "off", 3)) {
 465                        dmar_disabled = 1;
 466                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 467                } else if (!strncmp(str, "igfx_off", 8)) {
 468                        dmar_map_gfx = 0;
 469                        printk(KERN_INFO
 470                                "Intel-IOMMU: disable GFX device mapping\n");
 471                } else if (!strncmp(str, "forcedac", 8)) {
 472                        printk(KERN_INFO
 473                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 474                        dmar_forcedac = 1;
 475                } else if (!strncmp(str, "strict", 6)) {
 476                        printk(KERN_INFO
 477                                "Intel-IOMMU: disable batched IOTLB flush\n");
 478                        intel_iommu_strict = 1;
 479                } else if (!strncmp(str, "sp_off", 6)) {
 480                        printk(KERN_INFO
 481                                "Intel-IOMMU: disable supported super page\n");
 482                        intel_iommu_superpage = 0;
 483                }
 484
 485                str += strcspn(str, ",");
 486                while (*str == ',')
 487                        str++;
 488        }
 489        return 0;
 490}
 491__setup("intel_iommu=", intel_iommu_setup);
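     /*
      * Example (kernel command line): options may be combined with
      * commas, e.g. "intel_iommu=on,strict,igfx_off" enables the IOMMU,
      * disables batched IOTLB flushing and leaves the integrated
      * graphics device unmapped.
      */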
 492
 493static struct kmem_cache *iommu_domain_cache;
 494static struct kmem_cache *iommu_devinfo_cache;
 495static struct kmem_cache *iommu_iova_cache;
 496
 497static inline void *alloc_pgtable_page(int node)
 498{
 499        struct page *page;
 500        void *vaddr = NULL;
 501
 502        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 503        if (page)
 504                vaddr = page_address(page);
 505        return vaddr;
 506}
 507
 508static inline void free_pgtable_page(void *vaddr)
 509{
 510        free_page((unsigned long)vaddr);
 511}
 512
 513static inline void *alloc_domain_mem(void)
 514{
 515        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 516}
 517
 518static void free_domain_mem(void *vaddr)
 519{
 520        kmem_cache_free(iommu_domain_cache, vaddr);
 521}
 522
 523static inline void * alloc_devinfo_mem(void)
 524{
 525        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 526}
 527
 528static inline void free_devinfo_mem(void *vaddr)
 529{
 530        kmem_cache_free(iommu_devinfo_cache, vaddr);
 531}
 532
 533struct iova *alloc_iova_mem(void)
 534{
 535        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 536}
 537
 538void free_iova_mem(struct iova *iova)
 539{
 540        kmem_cache_free(iommu_iova_cache, iova);
 541}
 542
 543
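     /*
      * Pick the largest AGAW this iommu supports that does not exceed the
      * requested guest address width.  For example, max_gaw == 48 starts
      * the search at agaw 2 (a 4-level table) and walks down until a set
      * bit is found in the SAGAW capability field; -1 means no fit.
      */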
 544static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 545{
 546        unsigned long sagaw;
 547        int agaw = -1;
 548
 549        sagaw = cap_sagaw(iommu->cap);
 550        for (agaw = width_to_agaw(max_gaw);
 551             agaw >= 0; agaw--) {
 552                if (test_bit(agaw, &sagaw))
 553                        break;
 554        }
 555
 556        return agaw;
 557}
 558
 559/*
 560 * Calculate max SAGAW for each iommu.
 561 */
 562int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 563{
 564        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 565}
 566
 567/*
 568 * calculate agaw for each iommu.
 569 * "SAGAW" may be different across iommus, use a default agaw, and
  570 * get a smaller supported agaw for iommus that don't support the default agaw.
 571 */
 572int iommu_calculate_agaw(struct intel_iommu *iommu)
 573{
 574        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 575}
 576
  577/* This function only returns a single iommu in a domain */
 578static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 579{
 580        int iommu_id;
 581
 582        /* si_domain and vm domain should not get here. */
 583        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 584        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 585
 586        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 587        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 588                return NULL;
 589
 590        return g_iommus[iommu_id];
 591}
 592
 593static void domain_update_iommu_coherency(struct dmar_domain *domain)
 594{
 595        struct dmar_drhd_unit *drhd;
 596        struct intel_iommu *iommu;
 597        int i, found = 0;
 598
 599        domain->iommu_coherency = 1;
 600
 601        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 602                found = 1;
 603                if (!ecap_coherent(g_iommus[i]->ecap)) {
 604                        domain->iommu_coherency = 0;
 605                        break;
 606                }
 607        }
 608        if (found)
 609                return;
 610
 611        /* No hardware attached; use lowest common denominator */
 612        rcu_read_lock();
 613        for_each_active_iommu(iommu, drhd) {
 614                if (!ecap_coherent(iommu->ecap)) {
 615                        domain->iommu_coherency = 0;
 616                        break;
 617                }
 618        }
 619        rcu_read_unlock();
 620}
 621
 622static void domain_update_iommu_snooping(struct dmar_domain *domain)
 623{
 624        int i;
 625
 626        domain->iommu_snooping = 1;
 627
 628        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 629                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 630                        domain->iommu_snooping = 0;
 631                        break;
 632                }
 633        }
 634}
 635
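     /*
      * cap_super_page_val() reports a bitmap of super-page sizes the
      * hardware can use (bit 0 == 2MiB, bit 1 == 1GiB, ...).  ANDing it
      * across all active iommus and taking fls() yields the largest level
      * every unit supports, in the iommu_superpage encoding used by
      * struct dmar_domain.
      */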
 636static void domain_update_iommu_superpage(struct dmar_domain *domain)
 637{
 638        struct dmar_drhd_unit *drhd;
 639        struct intel_iommu *iommu = NULL;
 640        int mask = 0xf;
 641
 642        if (!intel_iommu_superpage) {
 643                domain->iommu_superpage = 0;
 644                return;
 645        }
 646
 647        /* set iommu_superpage to the smallest common denominator */
 648        rcu_read_lock();
 649        for_each_active_iommu(iommu, drhd) {
 650                mask &= cap_super_page_val(iommu->cap);
 651                if (!mask) {
 652                        break;
 653                }
 654        }
 655        rcu_read_unlock();
 656
 657        domain->iommu_superpage = fls(mask);
 658}
 659
 660/* Some capabilities may be different across iommus */
 661static void domain_update_iommu_cap(struct dmar_domain *domain)
 662{
 663        domain_update_iommu_coherency(domain);
 664        domain_update_iommu_snooping(domain);
 665        domain_update_iommu_superpage(domain);
 666}
 667
 668static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 669{
 670        struct dmar_drhd_unit *drhd = NULL;
 671        struct intel_iommu *iommu;
 672        struct device *tmp;
 673        struct pci_dev *ptmp, *pdev = NULL;
 674        u16 segment;
 675        int i;
 676
 677        if (dev_is_pci(dev)) {
 678                pdev = to_pci_dev(dev);
 679                segment = pci_domain_nr(pdev->bus);
 680        } else if (ACPI_COMPANION(dev))
 681                dev = &ACPI_COMPANION(dev)->dev;
 682
 683        rcu_read_lock();
 684        for_each_active_iommu(iommu, drhd) {
 685                if (pdev && segment != drhd->segment)
 686                        continue;
 687
 688                for_each_active_dev_scope(drhd->devices,
 689                                          drhd->devices_cnt, i, tmp) {
 690                        if (tmp == dev) {
 691                                *bus = drhd->devices[i].bus;
 692                                *devfn = drhd->devices[i].devfn;
 693                                goto out;
 694                        }
 695
 696                        if (!pdev || !dev_is_pci(tmp))
 697                                continue;
 698
 699                        ptmp = to_pci_dev(tmp);
 700                        if (ptmp->subordinate &&
 701                            ptmp->subordinate->number <= pdev->bus->number &&
 702                            ptmp->subordinate->busn_res.end >= pdev->bus->number)
 703                                goto got_pdev;
 704                }
 705
 706                if (pdev && drhd->include_all) {
 707                got_pdev:
 708                        *bus = pdev->bus->number;
 709                        *devfn = pdev->devfn;
 710                        goto out;
 711                }
 712        }
 713        iommu = NULL;
 714 out:
 715        rcu_read_unlock();
 716
 717        return iommu;
 718}
 719
 720static void domain_flush_cache(struct dmar_domain *domain,
 721                               void *addr, int size)
 722{
 723        if (!domain->iommu_coherency)
 724                clflush_cache_range(addr, size);
 725}
 726
 727/* Gets context entry for a given bus and devfn */
 728static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 729                u8 bus, u8 devfn)
 730{
 731        struct root_entry *root;
 732        struct context_entry *context;
 733        unsigned long phy_addr;
 734        unsigned long flags;
 735
 736        spin_lock_irqsave(&iommu->lock, flags);
 737        root = &iommu->root_entry[bus];
 738        context = get_context_addr_from_root(root);
 739        if (!context) {
 740                context = (struct context_entry *)
 741                                alloc_pgtable_page(iommu->node);
 742                if (!context) {
 743                        spin_unlock_irqrestore(&iommu->lock, flags);
 744                        return NULL;
 745                }
 746                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 747                phy_addr = virt_to_phys((void *)context);
 748                set_root_value(root, phy_addr);
 749                set_root_present(root);
 750                __iommu_flush_cache(iommu, root, sizeof(*root));
 751        }
 752        spin_unlock_irqrestore(&iommu->lock, flags);
 753        return &context[devfn];
 754}
 755
 756static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 757{
 758        struct root_entry *root;
 759        struct context_entry *context;
 760        int ret;
 761        unsigned long flags;
 762
 763        spin_lock_irqsave(&iommu->lock, flags);
 764        root = &iommu->root_entry[bus];
 765        context = get_context_addr_from_root(root);
 766        if (!context) {
 767                ret = 0;
 768                goto out;
 769        }
 770        ret = context_present(&context[devfn]);
 771out:
 772        spin_unlock_irqrestore(&iommu->lock, flags);
 773        return ret;
 774}
 775
 776static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 777{
 778        struct root_entry *root;
 779        struct context_entry *context;
 780        unsigned long flags;
 781
 782        spin_lock_irqsave(&iommu->lock, flags);
 783        root = &iommu->root_entry[bus];
 784        context = get_context_addr_from_root(root);
 785        if (context) {
 786                context_clear_entry(&context[devfn]);
 787                __iommu_flush_cache(iommu, &context[devfn], \
 788                        sizeof(*context));
 789        }
 790        spin_unlock_irqrestore(&iommu->lock, flags);
 791}
 792
 793static void free_context_table(struct intel_iommu *iommu)
 794{
 795        struct root_entry *root;
 796        int i;
 797        unsigned long flags;
 798        struct context_entry *context;
 799
 800        spin_lock_irqsave(&iommu->lock, flags);
 801        if (!iommu->root_entry) {
 802                goto out;
 803        }
 804        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 805                root = &iommu->root_entry[i];
 806                context = get_context_addr_from_root(root);
 807                if (context)
 808                        free_pgtable_page(context);
 809        }
 810        free_pgtable_page(iommu->root_entry);
 811        iommu->root_entry = NULL;
 812out:
 813        spin_unlock_irqrestore(&iommu->lock, flags);
 814}
 815
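     /*
      * Return the PTE used to map @pfn, allocating intermediate
      * page-table pages as needed.  Passing *target_level == 0 stops the
      * walk at the first superpage or non-present entry and reports the
      * level reached back through *target_level; otherwise the walk stops
      * at the requested level.  The cmpxchg64() lets a racing walker's
      * freshly installed table be reused instead of leaked.  Returns NULL
      * if @pfn lies beyond the domain's address width or an allocation
      * fails.
      */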
 816static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 817                                      unsigned long pfn, int *target_level)
 818{
 819        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 820        struct dma_pte *parent, *pte = NULL;
 821        int level = agaw_to_level(domain->agaw);
 822        int offset;
 823
 824        BUG_ON(!domain->pgd);
 825
 826        if (addr_width < BITS_PER_LONG && pfn >> addr_width)
 827                /* Address beyond IOMMU's addressing capabilities. */
 828                return NULL;
 829
 830        parent = domain->pgd;
 831
 832        while (1) {
 833                void *tmp_page;
 834
 835                offset = pfn_level_offset(pfn, level);
 836                pte = &parent[offset];
 837                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 838                        break;
 839                if (level == *target_level)
 840                        break;
 841
 842                if (!dma_pte_present(pte)) {
 843                        uint64_t pteval;
 844
 845                        tmp_page = alloc_pgtable_page(domain->nid);
 846
 847                        if (!tmp_page)
 848                                return NULL;
 849
 850                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 851                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 852                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 853                                /* Someone else set it while we were thinking; use theirs. */
 854                                free_pgtable_page(tmp_page);
 855                        } else {
 856                                dma_pte_addr(pte);
 857                                domain_flush_cache(domain, pte, sizeof(*pte));
 858                        }
 859                }
 860                if (level == 1)
 861                        break;
 862
 863                parent = phys_to_virt(dma_pte_addr(pte));
 864                level--;
 865        }
 866
 867        if (!*target_level)
 868                *target_level = level;
 869
 870        return pte;
 871}
 872
 873
 874/* return address's pte at specific level */
 875static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 876                                         unsigned long pfn,
 877                                         int level, int *large_page)
 878{
 879        struct dma_pte *parent, *pte = NULL;
 880        int total = agaw_to_level(domain->agaw);
 881        int offset;
 882
 883        parent = domain->pgd;
 884        while (level <= total) {
 885                offset = pfn_level_offset(pfn, total);
 886                pte = &parent[offset];
 887                if (level == total)
 888                        return pte;
 889
 890                if (!dma_pte_present(pte)) {
 891                        *large_page = total;
 892                        break;
 893                }
 894
 895                if (pte->val & DMA_PTE_LARGE_PAGE) {
 896                        *large_page = total;
 897                        return pte;
 898                }
 899
 900                parent = phys_to_virt(dma_pte_addr(pte));
 901                total--;
 902        }
 903        return NULL;
 904}
 905
 906/* clear last level pte, a tlb flush should be followed */
 907static void dma_pte_clear_range(struct dmar_domain *domain,
 908                                unsigned long start_pfn,
 909                                unsigned long last_pfn)
 910{
 911        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 912        unsigned int large_page = 1;
 913        struct dma_pte *first_pte, *pte;
 914
 915        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 916        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 917        BUG_ON(start_pfn > last_pfn);
 918
 919        /* we don't need lock here; nobody else touches the iova range */
 920        do {
 921                large_page = 1;
 922                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 923                if (!pte) {
 924                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 925                        continue;
 926                }
 927                do {
 928                        dma_clear_pte(pte);
 929                        start_pfn += lvl_to_nr_pages(large_page);
 930                        pte++;
 931                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 932
 933                domain_flush_cache(domain, first_pte,
 934                                   (void *)pte - (void *)first_pte);
 935
 936        } while (start_pfn && start_pfn <= last_pfn);
 937}
 938
 939static void dma_pte_free_level(struct dmar_domain *domain, int level,
 940                               struct dma_pte *pte, unsigned long pfn,
 941                               unsigned long start_pfn, unsigned long last_pfn)
 942{
 943        pfn = max(start_pfn, pfn);
 944        pte = &pte[pfn_level_offset(pfn, level)];
 945
 946        do {
 947                unsigned long level_pfn;
 948                struct dma_pte *level_pte;
 949
 950                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
 951                        goto next;
 952
 953                level_pfn = pfn & level_mask(level - 1);
 954                level_pte = phys_to_virt(dma_pte_addr(pte));
 955
 956                if (level > 2)
 957                        dma_pte_free_level(domain, level - 1, level_pte,
 958                                           level_pfn, start_pfn, last_pfn);
 959
 960                /* If range covers entire pagetable, free it */
 961                if (!(start_pfn > level_pfn ||
 962                      last_pfn < level_pfn + level_size(level) - 1)) {
 963                        dma_clear_pte(pte);
 964                        domain_flush_cache(domain, pte, sizeof(*pte));
 965                        free_pgtable_page(level_pte);
 966                }
 967next:
 968                pfn += level_size(level);
 969        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
 970}
 971
 972/* free page table pages. last level pte should already be cleared */
 973static void dma_pte_free_pagetable(struct dmar_domain *domain,
 974                                   unsigned long start_pfn,
 975                                   unsigned long last_pfn)
 976{
 977        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 978
 979        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 980        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 981        BUG_ON(start_pfn > last_pfn);
 982
 983        /* We don't need lock here; nobody else touches the iova range */
 984        dma_pte_free_level(domain, agaw_to_level(domain->agaw),
 985                           domain->pgd, 0, start_pfn, last_pfn);
 986
 987        /* free pgd */
 988        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 989                free_pgtable_page(domain->pgd);
 990                domain->pgd = NULL;
 991        }
 992}
 993
 994/* When a page at a given level is being unlinked from its parent, we don't
 995   need to *modify* it at all. All we need to do is make a list of all the
 996   pages which can be freed just as soon as we've flushed the IOTLB and we
 997   know the hardware page-walk will no longer touch them.
 998   The 'pte' argument is the *parent* PTE, pointing to the page that is to
 999   be freed. */
1000static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1001                                            int level, struct dma_pte *pte,
1002                                            struct page *freelist)
1003{
1004        struct page *pg;
1005
1006        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1007        pg->freelist = freelist;
1008        freelist = pg;
1009
1010        if (level == 1)
1011                return freelist;
1012
1013        pte = page_address(pg);
1014        do {
1015                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1016                        freelist = dma_pte_list_pagetables(domain, level - 1,
1017                                                           pte, freelist);
1018                pte++;
1019        } while (!first_pte_in_page(pte));
1020
1021        return freelist;
1022}
1023
1024static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1025                                        struct dma_pte *pte, unsigned long pfn,
1026                                        unsigned long start_pfn,
1027                                        unsigned long last_pfn,
1028                                        struct page *freelist)
1029{
1030        struct dma_pte *first_pte = NULL, *last_pte = NULL;
1031
1032        pfn = max(start_pfn, pfn);
1033        pte = &pte[pfn_level_offset(pfn, level)];
1034
1035        do {
1036                unsigned long level_pfn;
1037
1038                if (!dma_pte_present(pte))
1039                        goto next;
1040
1041                level_pfn = pfn & level_mask(level);
1042
1043                /* If range covers entire pagetable, free it */
1044                if (start_pfn <= level_pfn &&
1045                    last_pfn >= level_pfn + level_size(level) - 1) {
 1046                        /* These subordinate page tables are going away entirely. Don't
1047                           bother to clear them; we're just going to *free* them. */
1048                        if (level > 1 && !dma_pte_superpage(pte))
1049                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1050
1051                        dma_clear_pte(pte);
1052                        if (!first_pte)
1053                                first_pte = pte;
1054                        last_pte = pte;
1055                } else if (level > 1) {
1056                        /* Recurse down into a level that isn't *entirely* obsolete */
1057                        freelist = dma_pte_clear_level(domain, level - 1,
1058                                                       phys_to_virt(dma_pte_addr(pte)),
1059                                                       level_pfn, start_pfn, last_pfn,
1060                                                       freelist);
1061                }
1062next:
1063                pfn += level_size(level);
1064        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1065
1066        if (first_pte)
1067                domain_flush_cache(domain, first_pte,
1068                                   (void *)++last_pte - (void *)first_pte);
1069
1070        return freelist;
1071}
1072
1073/* We can't just free the pages because the IOMMU may still be walking
1074   the page tables, and may have cached the intermediate levels. The
1075   pages can only be freed after the IOTLB flush has been done. */
1076struct page *domain_unmap(struct dmar_domain *domain,
1077                          unsigned long start_pfn,
1078                          unsigned long last_pfn)
1079{
1080        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1081        struct page *freelist = NULL;
1082
1083        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
1084        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
1085        BUG_ON(start_pfn > last_pfn);
1086
1087        /* we don't need lock here; nobody else touches the iova range */
1088        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1089                                       domain->pgd, 0, start_pfn, last_pfn, NULL);
1090
1091        /* free pgd */
1092        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1093                struct page *pgd_page = virt_to_page(domain->pgd);
1094                pgd_page->freelist = freelist;
1095                freelist = pgd_page;
1096
1097                domain->pgd = NULL;
1098        }
1099
1100        return freelist;
1101}
1102
1103void dma_free_pagelist(struct page *freelist)
1104{
1105        struct page *pg;
1106
1107        while ((pg = freelist)) {
1108                freelist = pg->freelist;
1109                free_pgtable_page(page_address(pg));
1110        }
1111}
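     /*
      * Typical usage, per the comment above domain_unmap():
      *
      *      freelist = domain_unmap(domain, start_pfn, last_pfn);
      *      ... flush the IOTLB for [start_pfn, last_pfn] ...
      *      dma_free_pagelist(freelist);
      */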
1112
1113/* iommu handling */
1114static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1115{
1116        struct root_entry *root;
1117        unsigned long flags;
1118
1119        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1120        if (!root)
1121                return -ENOMEM;
1122
1123        __iommu_flush_cache(iommu, root, ROOT_SIZE);
1124
1125        spin_lock_irqsave(&iommu->lock, flags);
1126        iommu->root_entry = root;
1127        spin_unlock_irqrestore(&iommu->lock, flags);
1128
1129        return 0;
1130}
1131
1132static void iommu_set_root_entry(struct intel_iommu *iommu)
1133{
1134        void *addr;
1135        u32 sts;
1136        unsigned long flag;
1137
1138        addr = iommu->root_entry;
1139
1140        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1141        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
1142
1143        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1144
1145        /* Make sure hardware complete it */
1146        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1147                      readl, (sts & DMA_GSTS_RTPS), sts);
1148
1149        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1150}
1151
1152static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1153{
1154        u32 val;
1155        unsigned long flag;
1156
1157        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1158                return;
1159
1160        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1161        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1162
1163        /* Make sure hardware complete it */
1164        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1165                      readl, (!(val & DMA_GSTS_WBFS)), val);
1166
1167        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1168}
1169
 1170/* return value determines if we need a write buffer flush */
1171static void __iommu_flush_context(struct intel_iommu *iommu,
1172                                  u16 did, u16 source_id, u8 function_mask,
1173                                  u64 type)
1174{
1175        u64 val = 0;
1176        unsigned long flag;
1177
1178        switch (type) {
1179        case DMA_CCMD_GLOBAL_INVL:
1180                val = DMA_CCMD_GLOBAL_INVL;
1181                break;
1182        case DMA_CCMD_DOMAIN_INVL:
1183                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1184                break;
1185        case DMA_CCMD_DEVICE_INVL:
1186                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1187                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1188                break;
1189        default:
1190                BUG();
1191        }
1192        val |= DMA_CCMD_ICC;
1193
1194        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1195        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1196
1197        /* Make sure hardware complete it */
1198        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1199                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1200
1201        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1202}
1203
 1204/* return value determines if we need a write buffer flush */
1205static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1206                                u64 addr, unsigned int size_order, u64 type)
1207{
1208        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1209        u64 val = 0, val_iva = 0;
1210        unsigned long flag;
1211
1212        switch (type) {
1213        case DMA_TLB_GLOBAL_FLUSH:
1214                /* global flush doesn't need set IVA_REG */
1215                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1216                break;
1217        case DMA_TLB_DSI_FLUSH:
1218                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1219                break;
1220        case DMA_TLB_PSI_FLUSH:
1221                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1222                /* IH bit is passed in as part of address */
1223                val_iva = size_order | addr;
1224                break;
1225        default:
1226                BUG();
1227        }
1228        /* Note: set drain read/write */
1229#if 0
1230        /*
 1231         * This is probably meant to be extra safe. Looks like we can
1232         * ignore it without any impact.
1233         */
1234        if (cap_read_drain(iommu->cap))
1235                val |= DMA_TLB_READ_DRAIN;
1236#endif
1237        if (cap_write_drain(iommu->cap))
1238                val |= DMA_TLB_WRITE_DRAIN;
1239
1240        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1241        /* Note: Only uses first TLB reg currently */
1242        if (val_iva)
1243                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1244        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1245
1246        /* Make sure hardware complete it */
1247        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1248                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1249
1250        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1251
1252        /* check IOTLB invalidation granularity */
1253        if (DMA_TLB_IAIG(val) == 0)
1254                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1255        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1256                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1257                        (unsigned long long)DMA_TLB_IIRG(type),
1258                        (unsigned long long)DMA_TLB_IAIG(val));
1259}
1260
1261static struct device_domain_info *
1262iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1263                         u8 bus, u8 devfn)
1264{
1265        int found = 0;
1266        unsigned long flags;
1267        struct device_domain_info *info;
1268        struct pci_dev *pdev;
1269
1270        if (!ecap_dev_iotlb_support(iommu->ecap))
1271                return NULL;
1272
1273        if (!iommu->qi)
1274                return NULL;
1275
1276        spin_lock_irqsave(&device_domain_lock, flags);
1277        list_for_each_entry(info, &domain->devices, link)
1278                if (info->bus == bus && info->devfn == devfn) {
1279                        found = 1;
1280                        break;
1281                }
1282        spin_unlock_irqrestore(&device_domain_lock, flags);
1283
1284        if (!found || !info->dev || !dev_is_pci(info->dev))
1285                return NULL;
1286
1287        pdev = to_pci_dev(info->dev);
1288
1289        if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1290                return NULL;
1291
1292        if (!dmar_find_matched_atsr_unit(pdev))
1293                return NULL;
1294
1295        return info;
1296}
1297
1298static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1299{
1300        if (!info || !dev_is_pci(info->dev))
1301                return;
1302
1303        pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1304}
1305
1306static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1307{
1308        if (!info->dev || !dev_is_pci(info->dev) ||
1309            !pci_ats_enabled(to_pci_dev(info->dev)))
1310                return;
1311
1312        pci_disable_ats(to_pci_dev(info->dev));
1313}
1314
1315static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1316                                  u64 addr, unsigned mask)
1317{
1318        u16 sid, qdep;
1319        unsigned long flags;
1320        struct device_domain_info *info;
1321
1322        spin_lock_irqsave(&device_domain_lock, flags);
1323        list_for_each_entry(info, &domain->devices, link) {
1324                struct pci_dev *pdev;
1325                if (!info->dev || !dev_is_pci(info->dev))
1326                        continue;
1327
1328                pdev = to_pci_dev(info->dev);
1329                if (!pci_ats_enabled(pdev))
1330                        continue;
1331
1332                sid = info->bus << 8 | info->devfn;
1333                qdep = pci_ats_queue_depth(pdev);
1334                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1335        }
1336        spin_unlock_irqrestore(&device_domain_lock, flags);
1337}
1338
1339static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1340                                  unsigned long pfn, unsigned int pages, int ih, int map)
1341{
1342        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1343        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1344
1345        BUG_ON(pages == 0);
1346
1347        if (ih)
1348                ih = 1 << 6;
1349        /*
1350         * Fallback to domain selective flush if no PSI support or the size is
1351         * too big.
1352         * PSI requires page size to be 2 ^ x, and the base address is naturally
1353         * aligned to the size
1354         */
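             /*
              * For example, a 3-page request is rounded up to 4 pages
              * (mask == 2) and the hardware invalidates the naturally
              * aligned 4-page region containing addr.
              */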
1355        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1356                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1357                                                DMA_TLB_DSI_FLUSH);
1358        else
1359                iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1360                                                DMA_TLB_PSI_FLUSH);
1361
1362        /*
1363         * In caching mode, changes of pages from non-present to present require
1364         * flush. However, device IOTLB doesn't need to be flushed in this case.
1365         */
1366        if (!cap_caching_mode(iommu->cap) || !map)
1367                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1368}
1369
1370static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1371{
1372        u32 pmen;
1373        unsigned long flags;
1374
1375        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1376        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1377        pmen &= ~DMA_PMEN_EPM;
1378        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1379
1380        /* wait for the protected region status bit to clear */
1381        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1382                readl, !(pmen & DMA_PMEN_PRS), pmen);
1383
1384        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1385}
1386
1387static int iommu_enable_translation(struct intel_iommu *iommu)
1388{
1389        u32 sts;
1390        unsigned long flags;
1391
1392        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1393        iommu->gcmd |= DMA_GCMD_TE;
1394        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1395
1396        /* Make sure hardware complete it */
1397        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1398                      readl, (sts & DMA_GSTS_TES), sts);
1399
1400        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1401        return 0;
1402}
1403
1404static int iommu_disable_translation(struct intel_iommu *iommu)
1405{
1406        u32 sts;
1407        unsigned long flag;
1408
1409        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1410        iommu->gcmd &= ~DMA_GCMD_TE;
1411        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1412
1413        /* Make sure hardware complete it */
1414        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1415                      readl, (!(sts & DMA_GSTS_TES)), sts);
1416
1417        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1418        return 0;
1419}
1420
1421
1422static int iommu_init_domains(struct intel_iommu *iommu)
1423{
1424        unsigned long ndomains;
1425        unsigned long nlongs;
1426
1427        ndomains = cap_ndoms(iommu->cap);
1428        pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1429                 iommu->seq_id, ndomains);
1430        nlongs = BITS_TO_LONGS(ndomains);
1431
1432        spin_lock_init(&iommu->lock);
1433
1434        /* TBD: there might be 64K domains,
1435         * consider other allocation for future chip
1436         */
1437        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1438        if (!iommu->domain_ids) {
1439                pr_err("IOMMU%d: allocating domain id array failed\n",
1440                       iommu->seq_id);
1441                return -ENOMEM;
1442        }
1443        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1444                        GFP_KERNEL);
1445        if (!iommu->domains) {
1446                pr_err("IOMMU%d: allocating domain array failed\n",
1447                       iommu->seq_id);
1448                kfree(iommu->domain_ids);
1449                iommu->domain_ids = NULL;
1450                return -ENOMEM;
1451        }
1452
1453        /*
1454         * if Caching mode is set, then invalid translations are tagged
1455         * with domainid 0. Hence we need to pre-allocate it.
1456         */
1457        if (cap_caching_mode(iommu->cap))
1458                set_bit(0, iommu->domain_ids);
1459        return 0;
1460}
1461
1462static void free_dmar_iommu(struct intel_iommu *iommu)
1463{
1464        struct dmar_domain *domain;
1465        int i, count;
1466        unsigned long flags;
1467
1468        if ((iommu->domains) && (iommu->domain_ids)) {
1469                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1470                        /*
1471                         * Domain id 0 is reserved for invalid translation
1472                         * if hardware supports caching mode.
1473                         */
1474                        if (cap_caching_mode(iommu->cap) && i == 0)
1475                                continue;
1476
1477                        domain = iommu->domains[i];
1478                        clear_bit(i, iommu->domain_ids);
1479
1480                        spin_lock_irqsave(&domain->iommu_lock, flags);
1481                        count = --domain->iommu_count;
1482                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1483                        if (count == 0)
1484                                domain_exit(domain);
1485                }
1486        }
1487
1488        if (iommu->gcmd & DMA_GCMD_TE)
1489                iommu_disable_translation(iommu);
1490
1491        kfree(iommu->domains);
1492        kfree(iommu->domain_ids);
1493        iommu->domains = NULL;
1494        iommu->domain_ids = NULL;
1495
1496        g_iommus[iommu->seq_id] = NULL;
1497
1498        /* free context mapping */
1499        free_context_table(iommu);
1500}
1501
1502static struct dmar_domain *alloc_domain(bool vm)
1503{
1504        /* domain id for virtual machine, it won't be set in context */
1505        static atomic_t vm_domid = ATOMIC_INIT(0);
1506        struct dmar_domain *domain;
1507
1508        domain = alloc_domain_mem();
1509        if (!domain)
1510                return NULL;
1511
1512        domain->nid = -1;
1513        domain->iommu_count = 0;
1514        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1515        domain->flags = 0;
1516        spin_lock_init(&domain->iommu_lock);
1517        INIT_LIST_HEAD(&domain->devices);
1518        if (vm) {
1519                domain->id = atomic_inc_return(&vm_domid);
1520                domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
1521        }
1522
1523        return domain;
1524}
1525
1526static int iommu_attach_domain(struct dmar_domain *domain,
1527                               struct intel_iommu *iommu)
1528{
1529        int num;
1530        unsigned long ndomains;
1531        unsigned long flags;
1532
1533        ndomains = cap_ndoms(iommu->cap);
1534
1535        spin_lock_irqsave(&iommu->lock, flags);
1536
1537        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1538        if (num >= ndomains) {
1539                spin_unlock_irqrestore(&iommu->lock, flags);
1540                printk(KERN_ERR "IOMMU: no free domain ids\n");
1541                return -ENOMEM;
1542        }
1543
1544        domain->id = num;
1545        domain->iommu_count++;
1546        set_bit(num, iommu->domain_ids);
1547        set_bit(iommu->seq_id, domain->iommu_bmp);
1548        iommu->domains[num] = domain;
1549        spin_unlock_irqrestore(&iommu->lock, flags);
1550
1551        return 0;
1552}
1553
1554static void iommu_detach_domain(struct dmar_domain *domain,
1555                                struct intel_iommu *iommu)
1556{
1557        unsigned long flags;
1558        int num, ndomains;
1559
1560        spin_lock_irqsave(&iommu->lock, flags);
1561        ndomains = cap_ndoms(iommu->cap);
1562        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1563                if (iommu->domains[num] == domain) {
1564                        clear_bit(num, iommu->domain_ids);
1565                        iommu->domains[num] = NULL;
1566                        break;
1567                }
1568        }
1569        spin_unlock_irqrestore(&iommu->lock, flags);
1570}
1571
1572static struct iova_domain reserved_iova_list;
1573static struct lock_class_key reserved_rbtree_key;
1574
1575static int dmar_init_reserved_ranges(void)
1576{
1577        struct pci_dev *pdev = NULL;
1578        struct iova *iova;
1579        int i;
1580
1581        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1582
1583        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1584                &reserved_rbtree_key);
1585
1586        /* IOAPIC ranges shouldn't be accessed by DMA */
1587        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1588                IOVA_PFN(IOAPIC_RANGE_END));
1589        if (!iova) {
1590                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1591                return -ENODEV;
1592        }
1593
1594        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1595        for_each_pci_dev(pdev) {
1596                struct resource *r;
1597
1598                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1599                        r = &pdev->resource[i];
1600                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1601                                continue;
1602                        iova = reserve_iova(&reserved_iova_list,
1603                                            IOVA_PFN(r->start),
1604                                            IOVA_PFN(r->end));
1605                        if (!iova) {
1606                                printk(KERN_ERR "Reserve iova failed\n");
1607                                return -ENODEV;
1608                        }
1609                }
1610        }
1611        return 0;
1612}
1613
1614static void domain_reserve_special_ranges(struct dmar_domain *domain)
1615{
1616        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1617}
1618
1619static inline int guestwidth_to_adjustwidth(int gaw)
1620{
1621        int agaw;
1622        int r = (gaw - 12) % 9;
1623
1624        if (r == 0)
1625                agaw = gaw;
1626        else
1627                agaw = gaw + 9 - r;
1628        if (agaw > 64)
1629                agaw = 64;
1630        return agaw;
1631}
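    /*
     * Worked example: each page-table level translates 9 bits above the
     * 12-bit page offset, so the adjusted width is gaw rounded up to
     * 12 + n*9.  gaw = 48 gives r = (48 - 12) % 9 = 0, agaw = 48;
     * gaw = 36 gives r = 24 % 9 = 6, agaw = 36 + 9 - 6 = 39.
     */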
1632
1633static int domain_init(struct dmar_domain *domain, int guest_width)
1634{
1635        struct intel_iommu *iommu;
1636        int adjust_width, agaw;
1637        unsigned long sagaw;
1638
1639        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1640        domain_reserve_special_ranges(domain);
1641
1642        /* calculate AGAW */
1643        iommu = domain_get_iommu(domain);
1644        if (guest_width > cap_mgaw(iommu->cap))
1645                guest_width = cap_mgaw(iommu->cap);
1646        domain->gaw = guest_width;
1647        adjust_width = guestwidth_to_adjustwidth(guest_width);
1648        agaw = width_to_agaw(adjust_width);
1649        sagaw = cap_sagaw(iommu->cap);
1650        if (!test_bit(agaw, &sagaw)) {
1651                /* hardware doesn't support it, choose a bigger one */
1652                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1653                agaw = find_next_bit(&sagaw, 5, agaw);
1654                if (agaw >= 5)
1655                        return -ENODEV;
1656        }
1657        domain->agaw = agaw;
1658
1659        if (ecap_coherent(iommu->ecap))
1660                domain->iommu_coherency = 1;
1661        else
1662                domain->iommu_coherency = 0;
1663
1664        if (ecap_sc_support(iommu->ecap))
1665                domain->iommu_snooping = 1;
1666        else
1667                domain->iommu_snooping = 0;
1668
1669        if (intel_iommu_superpage)
1670                domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1671        else
1672                domain->iommu_superpage = 0;
1673
1674        domain->nid = iommu->node;
1675
1676        /* always allocate the top pgd */
1677        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1678        if (!domain->pgd)
1679                return -ENOMEM;
1680        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1681        return 0;
1682}
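    /*
     * Rough sketch of the agaw selection above, assuming the usual
     * width_to_agaw() encoding of roughly (width - 30) / 9: a 48-bit
     * adjusted width maps to agaw 2 (a 4-level table).  If bit 2 of
     * cap_sagaw() is clear, find_next_bit() falls back to the next
     * larger supported level, and -ENODEV is returned only when none
     * of the 5 sagaw bits fit.
     */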
1683
1684static void domain_exit(struct dmar_domain *domain)
1685{
1686        struct dmar_drhd_unit *drhd;
1687        struct intel_iommu *iommu;
1688        struct page *freelist = NULL;
1689
1690        /* Domain 0 is reserved, so don't process it */
1691        if (!domain)
1692                return;
1693
1694        /* Flush any lazy unmaps that may reference this domain */
1695        if (!intel_iommu_strict)
1696                flush_unmaps_timeout(0);
1697
1698        /* remove associated devices */
1699        domain_remove_dev_info(domain);
1700
1701        /* destroy iovas */
1702        put_iova_domain(&domain->iovad);
1703
1704        freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1705
1706        /* clear attached or cached domains */
1707        rcu_read_lock();
1708        for_each_active_iommu(iommu, drhd)
1709                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1710                    test_bit(iommu->seq_id, domain->iommu_bmp))
1711                        iommu_detach_domain(domain, iommu);
1712        rcu_read_unlock();
1713
1714        dma_free_pagelist(freelist);
1715
1716        free_domain_mem(domain);
1717}
1718
1719static int domain_context_mapping_one(struct dmar_domain *domain,
1720                                      struct intel_iommu *iommu,
1721                                      u8 bus, u8 devfn, int translation)
1722{
1723        struct context_entry *context;
1724        unsigned long flags;
1725        struct dma_pte *pgd;
1726        unsigned long num;
1727        unsigned long ndomains;
1728        int id;
1729        int agaw;
1730        struct device_domain_info *info = NULL;
1731
1732        pr_debug("Set context mapping for %02x:%02x.%d\n",
1733                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1734
1735        BUG_ON(!domain->pgd);
1736        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1737               translation != CONTEXT_TT_MULTI_LEVEL);
1738
1739        context = device_to_context_entry(iommu, bus, devfn);
1740        if (!context)
1741                return -ENOMEM;
1742        spin_lock_irqsave(&iommu->lock, flags);
1743        if (context_present(context)) {
1744                spin_unlock_irqrestore(&iommu->lock, flags);
1745                return 0;
1746        }
1747
1748        id = domain->id;
1749        pgd = domain->pgd;
1750
1751        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1752            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1753                int found = 0;
1754
1755                /* find an available domain id for this domain in this iommu */
1756                ndomains = cap_ndoms(iommu->cap);
1757                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1758                        if (iommu->domains[num] == domain) {
1759                                id = num;
1760                                found = 1;
1761                                break;
1762                        }
1763                }
1764
1765                if (found == 0) {
1766                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1767                        if (num >= ndomains) {
1768                                spin_unlock_irqrestore(&iommu->lock, flags);
1769                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1770                                return -EFAULT;
1771                        }
1772
1773                        set_bit(num, iommu->domain_ids);
1774                        iommu->domains[num] = domain;
1775                        id = num;
1776                }
1777
1778                /* Skip the top levels of the page tables for an
1779                 * iommu which has a smaller agaw than the default.
1780                 * Unnecessary for PT mode.
1781                 */
1782                if (translation != CONTEXT_TT_PASS_THROUGH) {
1783                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1784                                pgd = phys_to_virt(dma_pte_addr(pgd));
1785                                if (!dma_pte_present(pgd)) {
1786                                        spin_unlock_irqrestore(&iommu->lock, flags);
1787                                        return -ENOMEM;
1788                                }
1789                        }
1790                }
1791        }
1792
1793        context_set_domain_id(context, id);
1794
1795        if (translation != CONTEXT_TT_PASS_THROUGH) {
1796                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1797                translation = info ? CONTEXT_TT_DEV_IOTLB :
1798                                     CONTEXT_TT_MULTI_LEVEL;
1799        }
1800        /*
1801         * In pass through mode, AW must be programmed to indicate the largest
1802         * AGAW value supported by hardware, and the ASR is ignored by hardware.
1803         */
1804        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1805                context_set_address_width(context, iommu->msagaw);
1806        else {
1807                context_set_address_root(context, virt_to_phys(pgd));
1808                context_set_address_width(context, iommu->agaw);
1809        }
1810
1811        context_set_translation_type(context, translation);
1812        context_set_fault_enable(context);
1813        context_set_present(context);
1814        domain_flush_cache(domain, context, sizeof(*context));
1815
1816        /*
1817         * It's a non-present to present mapping. If hardware doesn't cache
1818         * non-present entries we only need to flush the write-buffer. If the
1819         * hardware _does_ cache non-present entries, then it does so in the
1820         * special domain #0, which we have to flush:
1821         */
1822        if (cap_caching_mode(iommu->cap)) {
1823                iommu->flush.flush_context(iommu, 0,
1824                                           (((u16)bus) << 8) | devfn,
1825                                           DMA_CCMD_MASK_NOBIT,
1826                                           DMA_CCMD_DEVICE_INVL);
1827                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1828        } else {
1829                iommu_flush_write_buffer(iommu);
1830        }
1831        iommu_enable_dev_iotlb(info);
1832        spin_unlock_irqrestore(&iommu->lock, flags);
1833
1834        spin_lock_irqsave(&domain->iommu_lock, flags);
1835        if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1836                domain->iommu_count++;
1837                if (domain->iommu_count == 1)
1838                        domain->nid = iommu->node;
1839                domain_update_iommu_cap(domain);
1840        }
1841        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1842        return 0;
1843}
1844
1845static int
1846domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1847                       int translation)
1848{
1849        int ret;
1850        struct pci_dev *pdev, *tmp, *parent;
1851        struct intel_iommu *iommu;
1852        u8 bus, devfn;
1853
1854        iommu = device_to_iommu(dev, &bus, &devfn);
1855        if (!iommu)
1856                return -ENODEV;
1857
1858        ret = domain_context_mapping_one(domain, iommu, bus, devfn,
1859                                         translation);
1860        if (ret || !dev_is_pci(dev))
1861                return ret;
1862
1863        /* dependent device mapping */
1864        pdev = to_pci_dev(dev);
1865        tmp = pci_find_upstream_pcie_bridge(pdev);
1866        if (!tmp)
1867                return 0;
1868        /* Secondary interface's bus number and devfn 0 */
1869        parent = pdev->bus->self;
1870        while (parent != tmp) {
1871                ret = domain_context_mapping_one(domain, iommu,
1872                                                 parent->bus->number,
1873                                                 parent->devfn, translation);
1874                if (ret)
1875                        return ret;
1876                parent = parent->bus->self;
1877        }
1878        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1879                return domain_context_mapping_one(domain, iommu,
1880                                        tmp->subordinate->number, 0,
1881                                        translation);
1882        else /* this is a legacy PCI bridge */
1883                return domain_context_mapping_one(domain, iommu,
1884                                                  tmp->bus->number,
1885                                                  tmp->devfn,
1886                                                  translation);
1887}
1888
1889static int domain_context_mapped(struct device *dev)
1890{
1891        int ret;
1892        struct pci_dev *pdev, *tmp, *parent;
1893        struct intel_iommu *iommu;
1894        u8 bus, devfn;
1895
1896        iommu = device_to_iommu(dev, &bus, &devfn);
1897        if (!iommu)
1898                return -ENODEV;
1899
1900        ret = device_context_mapped(iommu, bus, devfn);
1901        if (!ret || !dev_is_pci(dev))
1902                return ret;
1903
1904        /* dependent device mapping */
1905        pdev = to_pci_dev(dev);
1906        tmp = pci_find_upstream_pcie_bridge(pdev);
1907        if (!tmp)
1908                return ret;
1909        /* Secondary interface's bus number and devfn 0 */
1910        parent = pdev->bus->self;
1911        while (parent != tmp) {
1912                ret = device_context_mapped(iommu, parent->bus->number,
1913                                            parent->devfn);
1914                if (!ret)
1915                        return ret;
1916                parent = parent->bus->self;
1917        }
1918        if (pci_is_pcie(tmp))
1919                return device_context_mapped(iommu, tmp->subordinate->number,
1920                                             0);
1921        else
1922                return device_context_mapped(iommu, tmp->bus->number,
1923                                             tmp->devfn);
1924}
1925
1926/* Return the number of VTD pages, but aligned to the MM page size */
1927static inline unsigned long aligned_nrpages(unsigned long host_addr,
1928                                            size_t size)
1929{
1930        host_addr &= ~PAGE_MASK;
1931        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1932}
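    /*
     * Example, assuming 4KiB pages for both the MM and VTD page size:
     * host_addr = 0x1234 and size = 0x2000 leave an offset of 0x234 in
     * the first page, and PAGE_ALIGN(0x234 + 0x2000) = 0x3000, so three
     * VTD pages are needed even though the size itself is two pages.
     */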
1933
1934/* Return the largest possible superpage level for a given mapping */
1935static inline int hardware_largepage_caps(struct dmar_domain *domain,
1936                                          unsigned long iov_pfn,
1937                                          unsigned long phy_pfn,
1938                                          unsigned long pages)
1939{
1940        int support, level = 1;
1941        unsigned long pfnmerge;
1942
1943        support = domain->iommu_superpage;
1944
1945        /* To use a large page, the virtual *and* physical addresses
1946           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1947           of them will mean we have to use smaller pages. So just
1948           merge them and check both at once. */
1949        pfnmerge = iov_pfn | phy_pfn;
1950
1951        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1952                pages >>= VTD_STRIDE_SHIFT;
1953                if (!pages)
1954                        break;
1955                pfnmerge >>= VTD_STRIDE_SHIFT;
1956                level++;
1957                support--;
1958        }
1959        return level;
1960}
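    /*
     * For illustration: with domain->iommu_superpage == 2 (2MiB and 1GiB
     * supported), iov_pfn = 0x200, phy_pfn = 0xa00 and pages = 512, both
     * pfns are 512-aligned and one full stride of pages is available, so
     * the loop runs once and level 2 is returned (a single 2MiB page).
     */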
1961
1962static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1963                            struct scatterlist *sg, unsigned long phys_pfn,
1964                            unsigned long nr_pages, int prot)
1965{
1966        struct dma_pte *first_pte = NULL, *pte = NULL;
1967        phys_addr_t uninitialized_var(pteval);
1968        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1969        unsigned long sg_res;
1970        unsigned int largepage_lvl = 0;
1971        unsigned long lvl_pages = 0;
1972
1973        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1974
1975        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1976                return -EINVAL;
1977
1978        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1979
1980        if (sg)
1981                sg_res = 0;
1982        else {
1983                sg_res = nr_pages + 1;
1984                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1985        }
1986
1987        while (nr_pages > 0) {
1988                uint64_t tmp;
1989
1990                if (!sg_res) {
1991                        sg_res = aligned_nrpages(sg->offset, sg->length);
1992                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1993                        sg->dma_length = sg->length;
1994                        pteval = page_to_phys(sg_page(sg)) | prot;
1995                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1996                }
1997
1998                if (!pte) {
1999                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2000
2001                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2002                        if (!pte)
2003                                return -ENOMEM;
2004                        /* It is a large page */
2005                        if (largepage_lvl > 1) {
2006                                pteval |= DMA_PTE_LARGE_PAGE;
2007                                /* Ensure that old small page tables are removed to make room
2008                                   for superpage, if they exist. */
2009                                dma_pte_clear_range(domain, iov_pfn,
2010                                                    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
2011                                dma_pte_free_pagetable(domain, iov_pfn,
2012                                                       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
2013                        } else {
2014                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2015                        }
2016
2017                }
2018                /* We don't need a lock here; nobody else
2019                 * touches this iova range
2020                 */
2021                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2022                if (tmp) {
2023                        static int dumps = 5;
2024                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2025                               iov_pfn, tmp, (unsigned long long)pteval);
2026                        if (dumps) {
2027                                dumps--;
2028                                debug_dma_dump_mappings(NULL);
2029                        }
2030                        WARN_ON(1);
2031                }
2032
2033                lvl_pages = lvl_to_nr_pages(largepage_lvl);
2034
2035                BUG_ON(nr_pages < lvl_pages);
2036                BUG_ON(sg_res < lvl_pages);
2037
2038                nr_pages -= lvl_pages;
2039                iov_pfn += lvl_pages;
2040                phys_pfn += lvl_pages;
2041                pteval += lvl_pages * VTD_PAGE_SIZE;
2042                sg_res -= lvl_pages;
2043
2044                /* If the next PTE would be the first in a new page, then we
2045                   need to flush the cache on the entries we've just written.
2046                   And then we'll need to recalculate 'pte', so clear it and
2047                   let it get set again in the if (!pte) block above.
2048
2049                   If we're done (!nr_pages) we need to flush the cache too.
2050
2051                   Also if we've been setting superpages, we may need to
2052                   recalculate 'pte' and switch back to smaller pages for the
2053                   end of the mapping, if the trailing size is not enough to
2054                   use another superpage (i.e. sg_res < lvl_pages). */
2055                pte++;
2056                if (!nr_pages || first_pte_in_page(pte) ||
2057                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2058                        domain_flush_cache(domain, first_pte,
2059                                           (void *)pte - (void *)first_pte);
2060                        pte = NULL;
2061                }
2062
2063                if (!sg_res && nr_pages)
2064                        sg = sg_next(sg);
2065        }
2066        return 0;
2067}
2068
2069static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2070                                    struct scatterlist *sg, unsigned long nr_pages,
2071                                    int prot)
2072{
2073        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2074}
2075
2076static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2077                                     unsigned long phys_pfn, unsigned long nr_pages,
2078                                     int prot)
2079{
2080        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2081}
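    /*
     * Hypothetical usage sketch (mirroring __intel_map_single() below):
     * a physically contiguous 16KiB buffer mapped 1:1 would be
     * domain_pfn_mapping(domain, pfn, pfn, 4, DMA_PTE_READ | DMA_PTE_WRITE),
     * while scatterlists go through domain_sg_mapping() instead.
     */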
2082
2083static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2084{
2085        if (!iommu)
2086                return;
2087
2088        clear_context_table(iommu, bus, devfn);
2089        iommu->flush.flush_context(iommu, 0, 0, 0,
2090                                           DMA_CCMD_GLOBAL_INVL);
2091        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2092}
2093
2094static inline void unlink_domain_info(struct device_domain_info *info)
2095{
2096        assert_spin_locked(&device_domain_lock);
2097        list_del(&info->link);
2098        list_del(&info->global);
2099        if (info->dev)
2100                info->dev->archdata.iommu = NULL;
2101}
2102
2103static void domain_remove_dev_info(struct dmar_domain *domain)
2104{
2105        struct device_domain_info *info;
2106        unsigned long flags, flags2;
2107
2108        spin_lock_irqsave(&device_domain_lock, flags);
2109        while (!list_empty(&domain->devices)) {
2110                info = list_entry(domain->devices.next,
2111                        struct device_domain_info, link);
2112                unlink_domain_info(info);
2113                spin_unlock_irqrestore(&device_domain_lock, flags);
2114
2115                iommu_disable_dev_iotlb(info);
2116                iommu_detach_dev(info->iommu, info->bus, info->devfn);
2117
2118                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
2119                        iommu_detach_dependent_devices(info->iommu, info->dev);
2120                        /* clear this iommu in iommu_bmp, update iommu count
2121                         * and capabilities
2122                         */
2123                        spin_lock_irqsave(&domain->iommu_lock, flags2);
2124                        if (test_and_clear_bit(info->iommu->seq_id,
2125                                               domain->iommu_bmp)) {
2126                                domain->iommu_count--;
2127                                domain_update_iommu_cap(domain);
2128                        }
2129                        spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2130                }
2131
2132                free_devinfo_mem(info);
2133                spin_lock_irqsave(&device_domain_lock, flags);
2134        }
2135        spin_unlock_irqrestore(&device_domain_lock, flags);
2136}
2137
2138/*
2139 * find_domain
2140 * Note: we use struct device->archdata.iommu to store the info
2141 */
2142static struct dmar_domain *find_domain(struct device *dev)
2143{
2144        struct device_domain_info *info;
2145
2146        /* No lock here; we assume no domain exits in the normal case */
2147        info = dev->archdata.iommu;
2148        if (info)
2149                return info->domain;
2150        return NULL;
2151}
2152
2153static inline struct device_domain_info *
2154dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2155{
2156        struct device_domain_info *info;
2157
2158        list_for_each_entry(info, &device_domain_list, global)
2159                if (info->iommu->segment == segment && info->bus == bus &&
2160                    info->devfn == devfn)
2161                        return info;
2162
2163        return NULL;
2164}
2165
2166static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2167                                                int bus, int devfn,
2168                                                struct device *dev,
2169                                                struct dmar_domain *domain)
2170{
2171        struct dmar_domain *found = NULL;
2172        struct device_domain_info *info;
2173        unsigned long flags;
2174
2175        info = alloc_devinfo_mem();
2176        if (!info)
2177                return NULL;
2178
2179        info->bus = bus;
2180        info->devfn = devfn;
2181        info->dev = dev;
2182        info->domain = domain;
2183        info->iommu = iommu;
2184        if (!dev)
2185                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2186
2187        spin_lock_irqsave(&device_domain_lock, flags);
2188        if (dev)
2189                found = find_domain(dev);
2190        else {
2191                struct device_domain_info *info2;
2192                info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2193                if (info2)
2194                        found = info2->domain;
2195        }
2196        if (found) {
2197                spin_unlock_irqrestore(&device_domain_lock, flags);
2198                free_devinfo_mem(info);
2199                /* Caller must free the original domain */
2200                return found;
2201        }
2202
2203        list_add(&info->link, &domain->devices);
2204        list_add(&info->global, &device_domain_list);
2205        if (dev)
2206                dev->archdata.iommu = info;
2207        spin_unlock_irqrestore(&device_domain_lock, flags);
2208
2209        return domain;
2210}
2211
2212/* domain is initialized */
2213static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2214{
2215        struct dmar_domain *domain, *free = NULL;
2216        struct intel_iommu *iommu = NULL;
2217        struct device_domain_info *info;
2218        struct pci_dev *dev_tmp = NULL;
2219        unsigned long flags;
2220        u8 bus, devfn, bridge_bus, bridge_devfn;
2221
2222        domain = find_domain(dev);
2223        if (domain)
2224                return domain;
2225
2226        if (dev_is_pci(dev)) {
2227                struct pci_dev *pdev = to_pci_dev(dev);
2228                u16 segment;
2229
2230                segment = pci_domain_nr(pdev->bus);
2231                dev_tmp = pci_find_upstream_pcie_bridge(pdev);
2232                if (dev_tmp) {
2233                        if (pci_is_pcie(dev_tmp)) {
2234                                bridge_bus = dev_tmp->subordinate->number;
2235                                bridge_devfn = 0;
2236                        } else {
2237                                bridge_bus = dev_tmp->bus->number;
2238                                bridge_devfn = dev_tmp->devfn;
2239                        }
2240                        spin_lock_irqsave(&device_domain_lock, flags);
2241                        info = dmar_search_domain_by_dev_info(segment,
2242                                                              bridge_bus,
2243                                                              bridge_devfn);
2244                        if (info) {
2245                                iommu = info->iommu;
2246                                domain = info->domain;
2247                        }
2248                        spin_unlock_irqrestore(&device_domain_lock, flags);
2249                        /* pcie-pci bridge already has a domain, use it */
2250                        if (info)
2251                                goto found_domain;
2252                }
2253        }
2254
2255        iommu = device_to_iommu(dev, &bus, &devfn);
2256        if (!iommu)
2257                goto error;
2258
2259        /* Allocate and initialize new domain for the device */
2260        domain = alloc_domain(false);
2261        if (!domain)
2262                goto error;
2263        if (iommu_attach_domain(domain, iommu)) {
2264                free_domain_mem(domain);
2265                domain = NULL;
2266                goto error;
2267        }
2268        free = domain;
2269        if (domain_init(domain, gaw))
2270                goto error;
2271
2272        /* register the pcie-to-pci bridge */
2273        if (dev_tmp) {
2274                domain = dmar_insert_dev_info(iommu, bridge_bus, bridge_devfn,
2275                                              NULL, domain);
2276                if (!domain)
2277                        goto error;
2278        }
2279
2280found_domain:
2281        domain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2282error:
2283        if (free != domain)
2284                domain_exit(free);
2285
2286        return domain;
2287}
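    /*
     * Note on the bridge handling above: devices behind a PCIe-to-PCI
     * bridge all present the bridge's source-id, so the domain is looked
     * up and registered under the bridge (secondary bus and devfn 0 for a
     * PCIe bridge, its own bus/devfn for a legacy PCI bridge) rather than
     * under the device itself.
     */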
2288
2289static int iommu_identity_mapping;
2290#define IDENTMAP_ALL            1
2291#define IDENTMAP_GFX            2
2292#define IDENTMAP_AZALIA         4
2293
2294static int iommu_domain_identity_map(struct dmar_domain *domain,
2295                                     unsigned long long start,
2296                                     unsigned long long end)
2297{
2298        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2299        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2300
2301        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2302                          dma_to_mm_pfn(last_vpfn))) {
2303                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2304                return -ENOMEM;
2305        }
2306
2307        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2308                 start, end, domain->id);
2309        /*
2310         * The RMRR range might overlap with the physical memory range,
2311         * so clear it first
2312         */
2313        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2314
2315        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2316                                  last_vpfn - first_vpfn + 1,
2317                                  DMA_PTE_READ|DMA_PTE_WRITE);
2318}
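    /*
     * Worked example: for the 0-16MiB ISA range mapped below, start = 0
     * and end = 0xffffff give first_vpfn = 0 and last_vpfn = 0xfff, so
     * 4096 pages are reserved in the iova tree and mapped 1:1 with read
     * and write permission.
     */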
2319
2320static int iommu_prepare_identity_map(struct device *dev,
2321                                      unsigned long long start,
2322                                      unsigned long long end)
2323{
2324        struct dmar_domain *domain;
2325        int ret;
2326
2327        domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2328        if (!domain)
2329                return -ENOMEM;
2330
2331        /* For _hardware_ passthrough, don't bother. But for software
2332           passthrough, we do it anyway -- it may indicate a memory
2333           range which is reserved in E820, and so didn't get set
2334           up in si_domain to start with */
2335        if (domain == si_domain && hw_pass_through) {
2336                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2337                       dev_name(dev), start, end);
2338                return 0;
2339        }
2340
2341        printk(KERN_INFO
2342               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2343               dev_name(dev), start, end);
2344
2345        if (end < start) {
2346                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2347                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2348                        dmi_get_system_info(DMI_BIOS_VENDOR),
2349                        dmi_get_system_info(DMI_BIOS_VERSION),
2350                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2351                ret = -EIO;
2352                goto error;
2353        }
2354
2355        if (end >> agaw_to_width(domain->agaw)) {
2356                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2357                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2358                     agaw_to_width(domain->agaw),
2359                     dmi_get_system_info(DMI_BIOS_VENDOR),
2360                     dmi_get_system_info(DMI_BIOS_VERSION),
2361                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2362                ret = -EIO;
2363                goto error;
2364        }
2365
2366        ret = iommu_domain_identity_map(domain, start, end);
2367        if (ret)
2368                goto error;
2369
2370        /* context entry init */
2371        ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2372        if (ret)
2373                goto error;
2374
2375        return 0;
2376
2377 error:
2378        domain_exit(domain);
2379        return ret;
2380}
2381
2382static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2383                                         struct device *dev)
2384{
2385        if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2386                return 0;
2387        return iommu_prepare_identity_map(dev, rmrr->base_address,
2388                                          rmrr->end_address);
2389}
2390
2391#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2392static inline void iommu_prepare_isa(void)
2393{
2394        struct pci_dev *pdev;
2395        int ret;
2396
2397        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2398        if (!pdev)
2399                return;
2400
2401        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2402        ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2403
2404        if (ret)
2405                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2406                       "floppy might not work\n");
2407
2408}
2409#else
2410static inline void iommu_prepare_isa(void)
2411{
2412        return;
2413}
2414#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2415
2416static int md_domain_init(struct dmar_domain *domain, int guest_width);
2417
2418static int __init si_domain_init(int hw)
2419{
2420        struct dmar_drhd_unit *drhd;
2421        struct intel_iommu *iommu;
2422        int nid, ret = 0;
2423
2424        si_domain = alloc_domain(false);
2425        if (!si_domain)
2426                return -EFAULT;
2427
2428        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2429
2430        for_each_active_iommu(iommu, drhd) {
2431                ret = iommu_attach_domain(si_domain, iommu);
2432                if (ret) {
2433                        domain_exit(si_domain);
2434                        return -EFAULT;
2435                }
2436        }
2437
2438        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2439                domain_exit(si_domain);
2440                return -EFAULT;
2441        }
2442
2443        pr_debug("IOMMU: identity mapping domain is domain %d\n",
2444                 si_domain->id);
2445
2446        if (hw)
2447                return 0;
2448
2449        for_each_online_node(nid) {
2450                unsigned long start_pfn, end_pfn;
2451                int i;
2452
2453                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2454                        ret = iommu_domain_identity_map(si_domain,
2455                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2456                        if (ret)
2457                                return ret;
2458                }
2459        }
2460
2461        return 0;
2462}
2463
2464static int identity_mapping(struct device *dev)
2465{
2466        struct device_domain_info *info;
2467
2468        if (likely(!iommu_identity_mapping))
2469                return 0;
2470
2471        info = dev->archdata.iommu;
2472        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2473                return (info->domain == si_domain);
2474
2475        return 0;
2476}
2477
2478static int domain_add_dev_info(struct dmar_domain *domain,
2479                               struct device *dev, int translation)
2480{
2481        struct dmar_domain *ndomain;
2482        struct intel_iommu *iommu;
2483        u8 bus, devfn;
2484        int ret;
2485
2486        iommu = device_to_iommu(dev, &bus, &devfn);
2487        if (!iommu)
2488                return -ENODEV;
2489
2490        ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2491        if (ndomain != domain)
2492                return -EBUSY;
2493
2494        ret = domain_context_mapping(domain, dev, translation);
2495        if (ret) {
2496                domain_remove_one_dev_info(domain, dev);
2497                return ret;
2498        }
2499
2500        return 0;
2501}
2502
2503static bool device_has_rmrr(struct device *dev)
2504{
2505        struct dmar_rmrr_unit *rmrr;
2506        struct device *tmp;
2507        int i;
2508
2509        rcu_read_lock();
2510        for_each_rmrr_units(rmrr) {
2511                /*
2512                 * Return TRUE if this RMRR contains the device that
2513                 * is passed in.
2514                 */
2515                for_each_active_dev_scope(rmrr->devices,
2516                                          rmrr->devices_cnt, i, tmp)
2517                        if (tmp == dev) {
2518                                rcu_read_unlock();
2519                                return true;
2520                        }
2521        }
2522        rcu_read_unlock();
2523        return false;
2524}
2525
2526static int iommu_should_identity_map(struct device *dev, int startup)
2527{
2528
2529        if (dev_is_pci(dev)) {
2530                struct pci_dev *pdev = to_pci_dev(dev);
2531
2532                /*
2533                 * We want to prevent any device associated with an RMRR from
2534                 * getting placed into the SI Domain. This is done because
2535                 * problems exist when devices are moved in and out of domains
2536                 * and their respective RMRR info is lost. We exempt USB devices
2537                 * from this process due to their usage of RMRRs that are known
2538                 * to not be needed after BIOS hand-off to OS.
2539                 */
2540                if (device_has_rmrr(dev) &&
2541                    (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2542                        return 0;
2543
2544                if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2545                        return 1;
2546
2547                if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2548                        return 1;
2549
2550                if (!(iommu_identity_mapping & IDENTMAP_ALL))
2551                        return 0;
2552
2553                /*
2554                 * We want to start off with all devices in the 1:1 domain, and
2555                 * take them out later if we find they can't access all of memory.
2556                 *
2557                 * However, we can't do this for PCI devices behind bridges,
2558                 * because all PCI devices behind the same bridge will end up
2559                 * with the same source-id on their transactions.
2560                 *
2561                 * Practically speaking, we can't change things around for these
2562                 * devices at run-time, because we can't be sure there'll be no
2563                 * DMA transactions in flight for any of their siblings.
2564                 *
2565                 * So PCI devices (unless they're on the root bus) as well as
2566                 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2567                 * the 1:1 domain, just in _case_ one of their siblings turns out
2568                 * not to be able to map all of memory.
2569                 */
2570                if (!pci_is_pcie(pdev)) {
2571                        if (!pci_is_root_bus(pdev->bus))
2572                                return 0;
2573                        if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2574                                return 0;
2575                } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2576                        return 0;
2577        } else {
2578                if (device_has_rmrr(dev))
2579                        return 0;
2580        }
2581
2582        /*
2583         * At boot time, we don't yet know if devices will be 64-bit capable.
2584         * Assume that they will — if they turn out not to be, then we can
2585         * take them out of the 1:1 domain later.
2586         */
2587        if (!startup) {
2588                /*
2589                 * If the device's dma_mask is less than the system's memory
2590                 * size then this is not a candidate for identity mapping.
2591                 */
2592                u64 dma_mask = *dev->dma_mask;
2593
2594                if (dev->coherent_dma_mask &&
2595                    dev->coherent_dma_mask < dma_mask)
2596                        dma_mask = dev->coherent_dma_mask;
2597
2598                return dma_mask >= dma_get_required_mask(dev);
2599        }
2600
2601        return 1;
2602}
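    /*
     * Example of the late (startup == 0) check above: a device limited to
     * 32-bit DMA on a machine with memory above 4GiB has
     * dma_mask < dma_get_required_mask(), so this returns 0 and
     * iommu_no_mapping() below will move the device out of the 1:1
     * domain into a normal remapped one.
     */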
2603
2604static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2605{
2606        int ret;
2607
2608        if (!iommu_should_identity_map(dev, 1))
2609                return 0;
2610
2611        ret = domain_add_dev_info(si_domain, dev,
2612                                  hw ? CONTEXT_TT_PASS_THROUGH :
2613                                       CONTEXT_TT_MULTI_LEVEL);
2614        if (!ret)
2615                pr_info("IOMMU: %s identity mapping for device %s\n",
2616                        hw ? "hardware" : "software", dev_name(dev));
2617        else if (ret == -ENODEV)
2618                /* device not associated with an iommu */
2619                ret = 0;
2620
2621        return ret;
2622}
2623
2624
2625static int __init iommu_prepare_static_identity_mapping(int hw)
2626{
2627        struct pci_dev *pdev = NULL;
2628        struct dmar_drhd_unit *drhd;
2629        struct intel_iommu *iommu;
2630        struct device *dev;
2631        int i;
2632        int ret = 0;
2633
2634        ret = si_domain_init(hw);
2635        if (ret)
2636                return -EFAULT;
2637
2638        for_each_pci_dev(pdev) {
2639                ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2640                if (ret)
2641                        return ret;
2642        }
2643
2644        for_each_active_iommu(iommu, drhd)
2645                for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2646                        struct acpi_device_physical_node *pn;
2647                        struct acpi_device *adev;
2648
2649                        if (dev->bus != &acpi_bus_type)
2650                                continue;
2651
2652                        adev = to_acpi_device(dev);
2653                        mutex_lock(&adev->physical_node_lock);
2654                        list_for_each_entry(pn, &adev->physical_node_list, node) {
2655                                ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2656                                if (ret)
2657                                        break;
2658                        }
2659                        mutex_unlock(&adev->physical_node_lock);
2660                        if (ret)
2661                                return ret;
2662                }
2663
2664        return 0;
2665}
2666
2667static int __init init_dmars(void)
2668{
2669        struct dmar_drhd_unit *drhd;
2670        struct dmar_rmrr_unit *rmrr;
2671        struct device *dev;
2672        struct intel_iommu *iommu;
2673        int i, ret;
2674
2675        /*
2676         * for each drhd
2677         *    allocate root
2678         *    initialize and program root entry to not present
2679         * endfor
2680         */
2681        for_each_drhd_unit(drhd) {
2682                /*
2683                 * lock not needed as this is only incremented in the single
2684                 * threaded kernel __init code path; all other accesses are
2685                 * read only
2686                 */
2687                if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2688                        g_num_of_iommus++;
2689                        continue;
2690                }
2691                printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2692                          IOMMU_UNITS_SUPPORTED);
2693        }
2694
2695        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2696                        GFP_KERNEL);
2697        if (!g_iommus) {
2698                printk(KERN_ERR "Allocating global iommu array failed\n");
2699                ret = -ENOMEM;
2700                goto error;
2701        }
2702
2703        deferred_flush = kzalloc(g_num_of_iommus *
2704                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2705        if (!deferred_flush) {
2706                ret = -ENOMEM;
2707                goto free_g_iommus;
2708        }
2709
2710        for_each_active_iommu(iommu, drhd) {
2711                g_iommus[iommu->seq_id] = iommu;
2712
2713                ret = iommu_init_domains(iommu);
2714                if (ret)
2715                        goto free_iommu;
2716
2717                /*
2718                 * TBD:
2719                 * we could share the same root & context tables
2720                 * among all IOMMUs. Need to split it later.
2721                 */
2722                ret = iommu_alloc_root_entry(iommu);
2723                if (ret) {
2724                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2725                        goto free_iommu;
2726                }
2727                if (!ecap_pass_through(iommu->ecap))
2728                        hw_pass_through = 0;
2729        }
2730
2731        /*
2732         * Start from a sane iommu hardware state.
2733         */
2734        for_each_active_iommu(iommu, drhd) {
2735                /*
2736                 * If the queued invalidation is already initialized by us
2737                 * (for example, while enabling interrupt-remapping) then
2738                 * things are already rolling from a sane state.
2739                 */
2740                if (iommu->qi)
2741                        continue;
2742
2743                /*
2744                 * Clear any previous faults.
2745                 */
2746                dmar_fault(-1, iommu);
2747                /*
2748                 * Disable queued invalidation if supported and already enabled
2749                 * before OS handover.
2750                 */
2751                dmar_disable_qi(iommu);
2752        }
2753
2754        for_each_active_iommu(iommu, drhd) {
2755                if (dmar_enable_qi(iommu)) {
2756                        /*
2757                         * Queued Invalidate not enabled, use Register Based
2758                         * Invalidate
2759                         */
2760                        iommu->flush.flush_context = __iommu_flush_context;
2761                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2762                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2763                               "invalidation\n",
2764                                iommu->seq_id,
2765                               (unsigned long long)drhd->reg_base_addr);
2766                } else {
2767                        iommu->flush.flush_context = qi_flush_context;
2768                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2769                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2770                               "invalidation\n",
2771                                iommu->seq_id,
2772                               (unsigned long long)drhd->reg_base_addr);
2773                }
2774        }
2775
2776        if (iommu_pass_through)
2777                iommu_identity_mapping |= IDENTMAP_ALL;
2778
2779#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2780        iommu_identity_mapping |= IDENTMAP_GFX;
2781#endif
2782
2783        check_tylersburg_isoch();
2784
2785        /*
2786         * If any form of identity mapping was requested (hardware
2787         * pass-through or one of the IDENTMAP_* options), set up the
2788         * static identity (si) domain and its context entries now.
2789         */
2790        if (iommu_identity_mapping) {
2791                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2792                if (ret) {
2793                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2794                        goto free_iommu;
2795                }
2796        }
2797        /*
2798         * For each rmrr
2799         *   for each dev attached to rmrr
2800         *   do
2801         *     locate drhd for dev, alloc domain for dev
2802         *     allocate free domain
2803         *     allocate page table entries for rmrr
2804         *     if context not allocated for bus
2805         *           allocate and init context
2806         *           set present in root table for this bus
2807         *     init context with domain, translation etc
2808         *    endfor
2809         * endfor
2810         */
2811        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2812        for_each_rmrr_units(rmrr) {
2813                /* some BIOSes list non-existent devices in the DMAR table. */
2814                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2815                                          i, dev) {
2816                        ret = iommu_prepare_rmrr_dev(rmrr, dev);
2817                        if (ret)
2818                                printk(KERN_ERR
2819                                       "IOMMU: mapping reserved region failed\n");
2820                }
2821        }
2822
2823        iommu_prepare_isa();
2824
2825        /*
2826         * for each drhd
2827         *   enable fault log
2828         *   global invalidate context cache
2829         *   global invalidate iotlb
2830         *   enable translation
2831         */
2832        for_each_iommu(iommu, drhd) {
2833                if (drhd->ignored) {
2834                        /*
2835                         * we always have to disable PMRs or DMA may fail on
2836                         * this device
2837                         */
2838                        if (force_on)
2839                                iommu_disable_protect_mem_regions(iommu);
2840                        continue;
2841                }
2842
2843                iommu_flush_write_buffer(iommu);
2844
2845                ret = dmar_set_interrupt(iommu);
2846                if (ret)
2847                        goto free_iommu;
2848
2849                iommu_set_root_entry(iommu);
2850
2851                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2852                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2853
2854                ret = iommu_enable_translation(iommu);
2855                if (ret)
2856                        goto free_iommu;
2857
2858                iommu_disable_protect_mem_regions(iommu);
2859        }
2860
2861        return 0;
2862
2863free_iommu:
2864        for_each_active_iommu(iommu, drhd)
2865                free_dmar_iommu(iommu);
2866        kfree(deferred_flush);
2867free_g_iommus:
2868        kfree(g_iommus);
2869error:
2870        return ret;
2871}
2872
2873/* This takes a number of _MM_ pages, not VTD pages */
2874static struct iova *intel_alloc_iova(struct device *dev,
2875                                     struct dmar_domain *domain,
2876                                     unsigned long nrpages, uint64_t dma_mask)
2877{
2878        struct iova *iova = NULL;
2879
2880        /* Restrict dma_mask to the width that the iommu can handle */
2881        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2882
2883        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2884                /*
2885                 * First try to allocate an io virtual address in
2886                 * DMA_BIT_MASK(32) and if that fails then try allocating
2887                 * from the higher range
2888                 */
2889                iova = alloc_iova(&domain->iovad, nrpages,
2890                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2891                if (iova)
2892                        return iova;
2893        }
2894        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2895        if (unlikely(!iova)) {
2896                printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2897                       nrpages, dev_name(dev));
2898                return NULL;
2899        }
2900
2901        return iova;
2902}
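    /*
     * Illustration: for a device with a 64-bit dma_mask (and forcedac not
     * set), the first alloc_iova() call tries to keep the address below
     * 4GiB; only if that 32-bit space is exhausted does the second call
     * hand out an iova from anywhere under the full mask.
     */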
2903
2904static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2905{
2906        struct dmar_domain *domain;
2907        int ret;
2908
2909        domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2910        if (!domain) {
2911                printk(KERN_ERR "Allocating domain for %s failed\n",
2912                       dev_name(dev));
2913                return NULL;
2914        }
2915
2916        /* make sure context mapping is ok */
2917        if (unlikely(!domain_context_mapped(dev))) {
2918                ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2919                if (ret) {
2920                        printk(KERN_ERR "Domain context map for %s failed\n",
2921                               dev_name(dev));
2922                        return NULL;
2923                }
2924        }
2925
2926        return domain;
2927}
2928
2929static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2930{
2931        struct device_domain_info *info;
2932
2933        /* No lock here; we assume no domain exits in the normal case */
2934        info = dev->archdata.iommu;
2935        if (likely(info))
2936                return info->domain;
2937
2938        return __get_valid_domain_for_dev(dev);
2939}
2940
2941static int iommu_dummy(struct device *dev)
2942{
2943        return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2944}
2945
2946/* Check if the dev needs to go through the non-identity map and unmap process. */
2947static int iommu_no_mapping(struct device *dev)
2948{
2949        int found;
2950
2951        if (iommu_dummy(dev))
2952                return 1;
2953
2954        if (!iommu_identity_mapping)
2955                return 0;
2956
2957        found = identity_mapping(dev);
2958        if (found) {
2959                if (iommu_should_identity_map(dev, 0))
2960                        return 1;
2961                else {
2962                        /*
2963                         * The 32 bit DMA device is removed from si_domain and
2964                         * falls back to non-identity mapping.
2965                         */
2966                        domain_remove_one_dev_info(si_domain, dev);
2967                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2968                               dev_name(dev));
2969                        return 0;
2970                }
2971        } else {
2972                /*
2973                 * In case a 64 bit DMA device is detached from a VM, the device
2974                 * is put back into si_domain for identity mapping.
2975                 */
2976                if (iommu_should_identity_map(dev, 0)) {
2977                        int ret;
2978                        ret = domain_add_dev_info(si_domain, dev,
2979                                                  hw_pass_through ?
2980                                                  CONTEXT_TT_PASS_THROUGH :
2981                                                  CONTEXT_TT_MULTI_LEVEL);
2982                        if (!ret) {
2983                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2984                                       dev_name(dev));
2985                                return 1;
2986                        }
2987                }
2988        }
2989
2990        return 0;
2991}
2992
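/*
 * Common helper for the DMA-API map paths: find the device's domain,
 * allocate an IOVA range below the DMA mask, install the page table
 * entries with the appropriate read/write protection, and flush the
 * IOTLB (caching mode) or the write buffer as required.  Returns the
 * bus address on success, 0 on failure.
 */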
2993static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
2994                                     size_t size, int dir, u64 dma_mask)
2995{
2996        struct dmar_domain *domain;
2997        phys_addr_t start_paddr;
2998        struct iova *iova;
2999        int prot = 0;
3000        int ret;
3001        struct intel_iommu *iommu;
3002        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3003
3004        BUG_ON(dir == DMA_NONE);
3005
3006        if (iommu_no_mapping(dev))
3007                return paddr;
3008
3009        domain = get_valid_domain_for_dev(dev);
3010        if (!domain)
3011                return 0;
3012
3013        iommu = domain_get_iommu(domain);
3014        size = aligned_nrpages(paddr, size);
3015
3016        iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3017        if (!iova)
3018                goto error;
3019
3020        /*
3021         * Check if DMAR supports zero-length reads on write only
3022         * mappings.
3023         */
3024        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3025                        !cap_zlr(iommu->cap))
3026                prot |= DMA_PTE_READ;
3027        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3028                prot |= DMA_PTE_WRITE;
3029        /*
3030         * The range paddr to (paddr + size) might cover a partial page, so we
3031         * should map the whole page.  Note: if two parts of one page are mapped
3032         * separately, we might have two guest addresses mapping to the same host
3033         * paddr, but this is not a big problem.
3034         */
3035        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3036                                 mm_to_dma_pfn(paddr_pfn), size, prot);
3037        if (ret)
3038                goto error;
3039
3040        /* it's a non-present to present mapping. Only flush if caching mode */
3041        if (cap_caching_mode(iommu->cap))
3042                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3043        else
3044                iommu_flush_write_buffer(iommu);
3045
3046        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3047        start_paddr += paddr & ~PAGE_MASK;
3048        return start_paddr;
3049
3050error:
3051        if (iova)
3052                __free_iova(&domain->iovad, iova);
3053        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3054                dev_name(dev), size, (unsigned long long)paddr, dir);
3055        return 0;
3056}
3057
3058static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3059                                 unsigned long offset, size_t size,
3060                                 enum dma_data_direction dir,
3061                                 struct dma_attrs *attrs)
3062{
3063        return __intel_map_single(dev, page_to_phys(page) + offset, size,
3064                                  dir, *dev->dma_mask);
3065}
3066
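/*
 * Drain the deferred-unmap queues: invalidate the IOTLB for each queued
 * IOVA range (a per-IOMMU global flush on real hardware, page-selective
 * flushes in caching mode), then free the IOVAs and their page lists.
 * Called with async_umap_flush_lock held.
 */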
3067static void flush_unmaps(void)
3068{
3069        int i, j;
3070
3071        timer_on = 0;
3072
3073        /* just flush them all */
3074        for (i = 0; i < g_num_of_iommus; i++) {
3075                struct intel_iommu *iommu = g_iommus[i];
3076                if (!iommu)
3077                        continue;
3078
3079                if (!deferred_flush[i].next)
3080                        continue;
3081
3082                /* In caching mode, global flushes make emulation expensive */
3083                if (!cap_caching_mode(iommu->cap))
3084                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3085                                         DMA_TLB_GLOBAL_FLUSH);
3086                for (j = 0; j < deferred_flush[i].next; j++) {
3087                        unsigned long mask;
3088                        struct iova *iova = deferred_flush[i].iova[j];
3089                        struct dmar_domain *domain = deferred_flush[i].domain[j];
3090
3091                        /* On real hardware multiple invalidations are expensive */
3092                        if (cap_caching_mode(iommu->cap))
3093                                iommu_flush_iotlb_psi(iommu, domain->id,
3094                                        iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
3095                                        !deferred_flush[i].freelist[j], 0);
3096                        else {
3097                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
3098                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3099                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3100                        }
3101                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3102                        if (deferred_flush[i].freelist[j])
3103                                dma_free_pagelist(deferred_flush[i].freelist[j]);
3104                }
3105                deferred_flush[i].next = 0;
3106        }
3107
3108        list_size = 0;
3109}
3110
3111static void flush_unmaps_timeout(unsigned long data)
3112{
3113        unsigned long flags;
3114
3115        spin_lock_irqsave(&async_umap_flush_lock, flags);
3116        flush_unmaps();
3117        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3118}
3119
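/*
 * Queue an IOVA range (and the page-table pages freed with it) for
 * deferred invalidation instead of flushing synchronously.  The queue is
 * drained when it reaches HIGH_WATER_MARK or when unmap_timer fires.
 */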
3120static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3121{
3122        unsigned long flags;
3123        int next, iommu_id;
3124        struct intel_iommu *iommu;
3125
3126        spin_lock_irqsave(&async_umap_flush_lock, flags);
3127        if (list_size == HIGH_WATER_MARK)
3128                flush_unmaps();
3129
3130        iommu = domain_get_iommu(dom);
3131        iommu_id = iommu->seq_id;
3132
3133        next = deferred_flush[iommu_id].next;
3134        deferred_flush[iommu_id].domain[next] = dom;
3135        deferred_flush[iommu_id].iova[next] = iova;
3136        deferred_flush[iommu_id].freelist[next] = freelist;
3137        deferred_flush[iommu_id].next++;
3138
3139        if (!timer_on) {
3140                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3141                timer_on = 1;
3142        }
3143        list_size++;
3144        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3145}
3146
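/*
 * DMA-API unmap of a single mapping: tear down the page-table entries,
 * then either flush the IOTLB immediately (intel_iommu_strict) or hand
 * the IOVA to the deferred-flush machinery.
 */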
3147static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3148                             size_t size, enum dma_data_direction dir,
3149                             struct dma_attrs *attrs)
3150{
3151        struct dmar_domain *domain;
3152        unsigned long start_pfn, last_pfn;
3153        struct iova *iova;
3154        struct intel_iommu *iommu;
3155        struct page *freelist;
3156
3157        if (iommu_no_mapping(dev))
3158                return;
3159
3160        domain = find_domain(dev);
3161        BUG_ON(!domain);
3162
3163        iommu = domain_get_iommu(domain);
3164
3165        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3166        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3167                      (unsigned long long)dev_addr))
3168                return;
3169
3170        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3171        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3172
3173        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3174                 dev_name(dev), start_pfn, last_pfn);
3175
3176        freelist = domain_unmap(domain, start_pfn, last_pfn);
3177
3178        if (intel_iommu_strict) {
3179                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3180                                      last_pfn - start_pfn + 1, !freelist, 0);
3181                /* free iova */
3182                __free_iova(&domain->iovad, iova);
3183                dma_free_pagelist(freelist);
3184        } else {
3185                add_unmap(domain, iova, freelist);
3186                /*
3187                 * queue up the release of the unmap to save roughly 1/6th of
3188                 * the CPU time used up by the iotlb flush operation...
3189                 */
3190        }
3191}
3192
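/*
 * DMA-API coherent allocation: get zeroed pages (preferring CMA when the
 * caller may sleep), honouring GFP_DMA/GFP_DMA32 only when the device
 * bypasses translation, and map them bidirectionally via
 * __intel_map_single().
 */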
3193static void *intel_alloc_coherent(struct device *dev, size_t size,
3194                                  dma_addr_t *dma_handle, gfp_t flags,
3195                                  struct dma_attrs *attrs)
3196{
3197        struct page *page = NULL;
3198        int order;
3199
3200        size = PAGE_ALIGN(size);
3201        order = get_order(size);
3202
3203        if (!iommu_no_mapping(dev))
3204                flags &= ~(GFP_DMA | GFP_DMA32);
3205        else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3206                if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3207                        flags |= GFP_DMA;
3208                else
3209                        flags |= GFP_DMA32;
3210        }
3211
3212        if (flags & __GFP_WAIT) {
3213                unsigned int count = size >> PAGE_SHIFT;
3214
3215                page = dma_alloc_from_contiguous(dev, count, order);
3216                if (page && iommu_no_mapping(dev) &&
3217                    page_to_phys(page) + size > dev->coherent_dma_mask) {
3218                        dma_release_from_contiguous(dev, page, count);
3219                        page = NULL;
3220                }
3221        }
3222
3223        if (!page)
3224                page = alloc_pages(flags, order);
3225        if (!page)
3226                return NULL;
3227        memset(page_address(page), 0, size);
3228
3229        *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3230                                         DMA_BIDIRECTIONAL,
3231                                         dev->coherent_dma_mask);
3232        if (*dma_handle)
3233                return page_address(page);
3234        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3235                __free_pages(page, order);
3236
3237        return NULL;
3238}
3239
3240static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3241                                dma_addr_t dma_handle, struct dma_attrs *attrs)
3242{
3243        int order;
3244        struct page *page = virt_to_page(vaddr);
3245
3246        size = PAGE_ALIGN(size);
3247        order = get_order(size);
3248
3249        intel_unmap_page(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3250        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3251                __free_pages(page, order);
3252}
3253
3254static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3255                           int nelems, enum dma_data_direction dir,
3256                           struct dma_attrs *attrs)
3257{
3258        struct dmar_domain *domain;
3259        unsigned long start_pfn, last_pfn;
3260        struct iova *iova;
3261        struct intel_iommu *iommu;
3262        struct page *freelist;
3263
3264        if (iommu_no_mapping(dev))
3265                return;
3266
3267        domain = find_domain(dev);
3268        BUG_ON(!domain);
3269
3270        iommu = domain_get_iommu(domain);
3271
3272        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3273        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3274                      (unsigned long long)sglist[0].dma_address))
3275                return;
3276
3277        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3278        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3279
3280        freelist = domain_unmap(domain, start_pfn, last_pfn);
3281
3282        if (intel_iommu_strict) {
3283                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3284                                      last_pfn - start_pfn + 1, !freelist, 0);
3285                /* free iova */
3286                __free_iova(&domain->iovad, iova);
3287                dma_free_pagelist(freelist);
3288        } else {
3289                add_unmap(domain, iova, freelist);
3290                /*
3291                 * queue up the release of the unmap to save roughly 1/6th of
3292                 * the CPU time used up by the iotlb flush operation...
3293                 */
3294        }
3295}
3296
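/* Identity-mapped scatterlist path: bus address equals physical address. */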
3297static int intel_nontranslate_map_sg(struct device *hddev,
3298        struct scatterlist *sglist, int nelems, int dir)
3299{
3300        int i;
3301        struct scatterlist *sg;
3302
3303        for_each_sg(sglist, sg, nelems, i) {
3304                BUG_ON(!sg_page(sg));
3305                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3306                sg->dma_length = sg->length;
3307        }
3308        return nelems;
3309}
3310
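/*
 * DMA-API scatter-gather map: allocate a single IOVA range large enough
 * for the whole list, map every segment into it with domain_sg_mapping(),
 * and flush only if the IOMMU is in caching mode.
 */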
3311static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3312                        enum dma_data_direction dir, struct dma_attrs *attrs)
3313{
3314        int i;
3315        struct dmar_domain *domain;
3316        size_t size = 0;
3317        int prot = 0;
3318        struct iova *iova = NULL;
3319        int ret;
3320        struct scatterlist *sg;
3321        unsigned long start_vpfn;
3322        struct intel_iommu *iommu;
3323
3324        BUG_ON(dir == DMA_NONE);
3325        if (iommu_no_mapping(dev))
3326                return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3327
3328        domain = get_valid_domain_for_dev(dev);
3329        if (!domain)
3330                return 0;
3331
3332        iommu = domain_get_iommu(domain);
3333
3334        for_each_sg(sglist, sg, nelems, i)
3335                size += aligned_nrpages(sg->offset, sg->length);
3336
3337        iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3338                                *dev->dma_mask);
3339        if (!iova) {
3340                sglist->dma_length = 0;
3341                return 0;
3342        }
3343
3344        /*
3345         * Check if DMAR supports zero-length reads on write only
3346         * mappings.
3347         */
3348        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3349                        !cap_zlr(iommu->cap))
3350                prot |= DMA_PTE_READ;
3351        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3352                prot |= DMA_PTE_WRITE;
3353
3354        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3355
3356        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3357        if (unlikely(ret)) {
3358                /*  clear the page */
3359                dma_pte_clear_range(domain, start_vpfn,
3360                                    start_vpfn + size - 1);
3361                /* free page tables */
3362                dma_pte_free_pagetable(domain, start_vpfn,
3363                                       start_vpfn + size - 1);
3364                /* free iova */
3365                __free_iova(&domain->iovad, iova);
3366                return 0;
3367        }
3368
3369        /* it's a non-present to present mapping. Only flush if caching mode */
3370        if (cap_caching_mode(iommu->cap))
3371                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3372        else
3373                iommu_flush_write_buffer(iommu);
3374
3375        return nelems;
3376}
3377
3378static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3379{
3380        return !dma_addr;
3381}
3382
3383struct dma_map_ops intel_dma_ops = {
3384        .alloc = intel_alloc_coherent,
3385        .free = intel_free_coherent,
3386        .map_sg = intel_map_sg,
3387        .unmap_sg = intel_unmap_sg,
3388        .map_page = intel_map_page,
3389        .unmap_page = intel_unmap_page,
3390        .mapping_error = intel_mapping_error,
3391};
3392
3393static inline int iommu_domain_cache_init(void)
3394{
3395        int ret = 0;
3396
3397        iommu_domain_cache = kmem_cache_create("iommu_domain",
3398                                         sizeof(struct dmar_domain),
3399                                         0,
3400                                         SLAB_HWCACHE_ALIGN,
3402                                         NULL);
3403        if (!iommu_domain_cache) {
3404                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3405                ret = -ENOMEM;
3406        }
3407
3408        return ret;
3409}
3410
3411static inline int iommu_devinfo_cache_init(void)
3412{
3413        int ret = 0;
3414
3415        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3416                                         sizeof(struct device_domain_info),
3417                                         0,
3418                                         SLAB_HWCACHE_ALIGN,
3419                                         NULL);
3420        if (!iommu_devinfo_cache) {
3421                printk(KERN_ERR "Couldn't create devinfo cache\n");
3422                ret = -ENOMEM;
3423        }
3424
3425        return ret;
3426}
3427
3428static inline int iommu_iova_cache_init(void)
3429{
3430        int ret = 0;
3431
3432        iommu_iova_cache = kmem_cache_create("iommu_iova",
3433                                         sizeof(struct iova),
3434                                         0,
3435                                         SLAB_HWCACHE_ALIGN,
3436                                         NULL);
3437        if (!iommu_iova_cache) {
3438                printk(KERN_ERR "Couldn't create iova cache\n");
3439                ret = -ENOMEM;
3440        }
3441
3442        return ret;
3443}
3444
3445static int __init iommu_init_mempool(void)
3446{
3447        int ret;
3448        ret = iommu_iova_cache_init();
3449        if (ret)
3450                return ret;
3451
3452        ret = iommu_domain_cache_init();
3453        if (ret)
3454                goto domain_error;
3455
3456        ret = iommu_devinfo_cache_init();
3457        if (!ret)
3458                return ret;
3459
3460        kmem_cache_destroy(iommu_domain_cache);
3461domain_error:
3462        kmem_cache_destroy(iommu_iova_cache);
3463
3464        return -ENOMEM;
3465}
3466
3467static void __init iommu_exit_mempool(void)
3468{
3469        kmem_cache_destroy(iommu_devinfo_cache);
3470        kmem_cache_destroy(iommu_domain_cache);
3471        kmem_cache_destroy(iommu_iova_cache);
3472
3473}
3474
3475static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3476{
3477        struct dmar_drhd_unit *drhd;
3478        u32 vtbar;
3479        int rc;
3480
3481        /* We know that this device on this chipset has its own IOMMU.
3482         * If we find it under a different IOMMU, then the BIOS is lying
3483         * to us. Hope that the IOMMU for this device is actually
3484         * disabled, and it needs no translation...
3485         */
3486        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3487        if (rc) {
3488                /* "can't" happen */
3489                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3490                return;
3491        }
3492        vtbar &= 0xffff0000;
3493
3494        /* we know that this iommu should be at offset 0xa000 from vtbar */
3495        drhd = dmar_find_matched_drhd_unit(pdev);
3496        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3497                            TAINT_FIRMWARE_WORKAROUND,
3498                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3499                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3500}
3501DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3502
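/*
 * Ignore DRHD units that cover no devices at all, and units that cover
 * only graphics devices when gfx mapping is disabled (dmar_map_gfx == 0);
 * devices behind an ignored unit get DUMMY_DEVICE_DOMAIN_INFO so they
 * bypass translation entirely.
 */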
3503static void __init init_no_remapping_devices(void)
3504{
3505        struct dmar_drhd_unit *drhd;
3506        struct device *dev;
3507        int i;
3508
3509        for_each_drhd_unit(drhd) {
3510                if (!drhd->include_all) {
3511                        for_each_active_dev_scope(drhd->devices,
3512                                                  drhd->devices_cnt, i, dev)
3513                                break;
3514                        /* ignore DMAR unit if no devices exist */
3515                        if (i == drhd->devices_cnt)
3516                                drhd->ignored = 1;
3517                }
3518        }
3519
3520        for_each_active_drhd_unit(drhd) {
3521                if (drhd->include_all)
3522                        continue;
3523
3524                for_each_active_dev_scope(drhd->devices,
3525                                          drhd->devices_cnt, i, dev)
3526                        if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3527                                break;
3528                if (i < drhd->devices_cnt)
3529                        continue;
3530
3531                /* This IOMMU has *only* gfx devices. Either bypass it or
3532                   set the gfx_mapped flag, as appropriate */
3533                if (dmar_map_gfx) {
3534                        intel_iommu_gfx_mapped = 1;
3535                } else {
3536                        drhd->ignored = 1;
3537                        for_each_active_dev_scope(drhd->devices,
3538                                                  drhd->devices_cnt, i, dev)
3539                                dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3540                }
3541        }
3542}
3543
3544#ifdef CONFIG_SUSPEND
3545static int init_iommu_hw(void)
3546{
3547        struct dmar_drhd_unit *drhd;
3548        struct intel_iommu *iommu = NULL;
3549
3550        for_each_active_iommu(iommu, drhd)
3551                if (iommu->qi)
3552                        dmar_reenable_qi(iommu);
3553
3554        for_each_iommu(iommu, drhd) {
3555                if (drhd->ignored) {
3556                        /*
3557                         * we always have to disable PMRs or DMA may fail on
3558                         * this device
3559                         */
3560                        if (force_on)
3561                                iommu_disable_protect_mem_regions(iommu);
3562                        continue;
3563                }
3564
3565                iommu_flush_write_buffer(iommu);
3566
3567                iommu_set_root_entry(iommu);
3568
3569                iommu->flush.flush_context(iommu, 0, 0, 0,
3570                                           DMA_CCMD_GLOBAL_INVL);
3571                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3572                                         DMA_TLB_GLOBAL_FLUSH);
3573                if (iommu_enable_translation(iommu))
3574                        return 1;
3575                iommu_disable_protect_mem_regions(iommu);
3576        }
3577
3578        return 0;
3579}
3580
3581static void iommu_flush_all(void)
3582{
3583        struct dmar_drhd_unit *drhd;
3584        struct intel_iommu *iommu;
3585
3586        for_each_active_iommu(iommu, drhd) {
3587                iommu->flush.flush_context(iommu, 0, 0, 0,
3588                                           DMA_CCMD_GLOBAL_INVL);
3589                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3590                                         DMA_TLB_GLOBAL_FLUSH);
3591        }
3592}
3593
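/*
 * Suspend: flush everything, disable translation and save each IOMMU's
 * fault-event registers so iommu_resume() can restore them.
 */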
3594static int iommu_suspend(void)
3595{
3596        struct dmar_drhd_unit *drhd;
3597        struct intel_iommu *iommu = NULL;
3598        unsigned long flag;
3599
3600        for_each_active_iommu(iommu, drhd) {
3601                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3602                                                 GFP_ATOMIC);
3603                if (!iommu->iommu_state)
3604                        goto nomem;
3605        }
3606
3607        iommu_flush_all();
3608
3609        for_each_active_iommu(iommu, drhd) {
3610                iommu_disable_translation(iommu);
3611
3612                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3613
3614                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3615                        readl(iommu->reg + DMAR_FECTL_REG);
3616                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3617                        readl(iommu->reg + DMAR_FEDATA_REG);
3618                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3619                        readl(iommu->reg + DMAR_FEADDR_REG);
3620                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3621                        readl(iommu->reg + DMAR_FEUADDR_REG);
3622
3623                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3624        }
3625        return 0;
3626
3627nomem:
3628        for_each_active_iommu(iommu, drhd)
3629                kfree(iommu->iommu_state);
3630
3631        return -ENOMEM;
3632}
3633
3634static void iommu_resume(void)
3635{
3636        struct dmar_drhd_unit *drhd;
3637        struct intel_iommu *iommu = NULL;
3638        unsigned long flag;
3639
3640        if (init_iommu_hw()) {
3641                if (force_on)
3642                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3643                else
3644                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3645                return;
3646        }
3647
3648        for_each_active_iommu(iommu, drhd) {
3649
3650                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3651
3652                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3653                        iommu->reg + DMAR_FECTL_REG);
3654                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3655                        iommu->reg + DMAR_FEDATA_REG);
3656                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3657                        iommu->reg + DMAR_FEADDR_REG);
3658                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3659                        iommu->reg + DMAR_FEUADDR_REG);
3660
3661                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3662        }
3663
3664        for_each_active_iommu(iommu, drhd)
3665                kfree(iommu->iommu_state);
3666}
3667
3668static struct syscore_ops iommu_syscore_ops = {
3669        .resume         = iommu_resume,
3670        .suspend        = iommu_suspend,
3671};
3672
3673static void __init init_iommu_pm_ops(void)
3674{
3675        register_syscore_ops(&iommu_syscore_ops);
3676}
3677
3678#else
3679static inline void init_iommu_pm_ops(void) {}
3680#endif  /* CONFIG_SUSPEND */
3681
3682
3683int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3684{
3685        struct acpi_dmar_reserved_memory *rmrr;
3686        struct dmar_rmrr_unit *rmrru;
3687
3688        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3689        if (!rmrru)
3690                return -ENOMEM;
3691
3692        rmrru->hdr = header;
3693        rmrr = (struct acpi_dmar_reserved_memory *)header;
3694        rmrru->base_address = rmrr->base_address;
3695        rmrru->end_address = rmrr->end_address;
3696        rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3697                                ((void *)rmrr) + rmrr->header.length,
3698                                &rmrru->devices_cnt);
3699        if (rmrru->devices_cnt && rmrru->devices == NULL) {
3700                kfree(rmrru);
3701                return -ENOMEM;
3702        }
3703
3704        list_add(&rmrru->list, &dmar_rmrr_units);
3705
3706        return 0;
3707}
3708
3709int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3710{
3711        struct acpi_dmar_atsr *atsr;
3712        struct dmar_atsr_unit *atsru;
3713
3714        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3715        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3716        if (!atsru)
3717                return -ENOMEM;
3718
3719        atsru->hdr = hdr;
3720        atsru->include_all = atsr->flags & 0x1;
3721        if (!atsru->include_all) {
3722                atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3723                                (void *)atsr + atsr->header.length,
3724                                &atsru->devices_cnt);
3725                if (atsru->devices_cnt && atsru->devices == NULL) {
3726                        kfree(atsru);
3727                        return -ENOMEM;
3728                }
3729        }
3730
3731        list_add_rcu(&atsru->list, &dmar_atsr_units);
3732
3733        return 0;
3734}
3735
3736static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3737{
3738        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3739        kfree(atsru);
3740}
3741
3742static void intel_iommu_free_dmars(void)
3743{
3744        struct dmar_rmrr_unit *rmrru, *rmrr_n;
3745        struct dmar_atsr_unit *atsru, *atsr_n;
3746
3747        list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3748                list_del(&rmrru->list);
3749                dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3750                kfree(rmrru);
3751        }
3752
3753        list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3754                list_del(&atsru->list);
3755                intel_iommu_free_atsr(atsru);
3756        }
3757}
3758
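/*
 * Return 1 if the PCIe root port above @dev is listed in an ATSR unit's
 * device scope (or an include_all ATSR covers its segment), i.e. ATS may
 * be used for this device; return 0 otherwise.
 */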
3759int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3760{
3761        int i, ret = 1;
3762        struct pci_bus *bus;
3763        struct pci_dev *bridge = NULL;
3764        struct device *tmp;
3765        struct acpi_dmar_atsr *atsr;
3766        struct dmar_atsr_unit *atsru;
3767
3768        dev = pci_physfn(dev);
3769        for (bus = dev->bus; bus; bus = bus->parent) {
3770                bridge = bus->self;
3771                if (!bridge || !pci_is_pcie(bridge) ||
3772                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3773                        return 0;
3774                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3775                        break;
3776        }
3777        if (!bridge)
3778                return 0;
3779
3780        rcu_read_lock();
3781        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3782                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3783                if (atsr->segment != pci_domain_nr(dev->bus))
3784                        continue;
3785
3786                for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3787                        if (tmp == &bridge->dev)
3788                                goto out;
3789
3790                if (atsru->include_all)
3791                        goto out;
3792        }
3793        ret = 0;
3794out:
3795        rcu_read_unlock();
3796
3797        return ret;
3798}
3799
3800int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3801{
3802        int ret = 0;
3803        struct dmar_rmrr_unit *rmrru;
3804        struct dmar_atsr_unit *atsru;
3805        struct acpi_dmar_atsr *atsr;
3806        struct acpi_dmar_reserved_memory *rmrr;
3807
3808        if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3809                return 0;
3810
3811        list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3812                rmrr = container_of(rmrru->hdr,
3813                                    struct acpi_dmar_reserved_memory, header);
3814                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3815                        ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3816                                ((void *)rmrr) + rmrr->header.length,
3817                                rmrr->segment, rmrru->devices,
3818                                rmrru->devices_cnt);
3819                        if (ret < 0)
3820                                return ret;
3821                } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3822                        dmar_remove_dev_scope(info, rmrr->segment,
3823                                rmrru->devices, rmrru->devices_cnt);
3824                }
3825        }
3826
3827        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3828                if (atsru->include_all)
3829                        continue;
3830
3831                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3832                if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3833                        ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3834                                        (void *)atsr + atsr->header.length,
3835                                        atsr->segment, atsru->devices,
3836                                        atsru->devices_cnt);
3837                        if (ret > 0)
3838                                break;
3839                        else if (ret < 0)
3840                                return ret;
3841                } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3842                        if (dmar_remove_dev_scope(info, atsr->segment,
3843                                        atsru->devices, atsru->devices_cnt))
3844                                break;
3845                }
3846        }
3847
3848        return 0;
3849}
3850
3851/*
3852 * Here we only respond to a device being unbound from its driver.
3853 *
3854 * A newly added device is not attached to its DMAR domain here yet. That
3855 * happens when the device is first mapped to an iova.
3856 */
3857static int device_notifier(struct notifier_block *nb,
3858                                  unsigned long action, void *data)
3859{
3860        struct device *dev = data;
3861        struct dmar_domain *domain;
3862
3863        if (iommu_dummy(dev))
3864                return 0;
3865
3866        if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3867            action != BUS_NOTIFY_DEL_DEVICE)
3868                return 0;
3869
3870        domain = find_domain(dev);
3871        if (!domain)
3872                return 0;
3873
3874        down_read(&dmar_global_lock);
3875        domain_remove_one_dev_info(domain, dev);
3876        if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3877            !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3878            list_empty(&domain->devices))
3879                domain_exit(domain);
3880        up_read(&dmar_global_lock);
3881
3882        return 0;
3883}
3884
3885static struct notifier_block device_nb = {
3886        .notifier_call = device_notifier,
3887};
3888
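/*
 * Keep the static identity map (si_domain) in sync with memory hotplug:
 * add an identity mapping when memory goes online, and unmap it (flushing
 * the IOTLB on every active IOMMU) when it goes offline again.
 */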
3889static int intel_iommu_memory_notifier(struct notifier_block *nb,
3890                                       unsigned long val, void *v)
3891{
3892        struct memory_notify *mhp = v;
3893        unsigned long long start, end;
3894        unsigned long start_vpfn, last_vpfn;
3895
3896        switch (val) {
3897        case MEM_GOING_ONLINE:
3898                start = mhp->start_pfn << PAGE_SHIFT;
3899                end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3900                if (iommu_domain_identity_map(si_domain, start, end)) {
3901                        pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3902                                start, end);
3903                        return NOTIFY_BAD;
3904                }
3905                break;
3906
3907        case MEM_OFFLINE:
3908        case MEM_CANCEL_ONLINE:
3909                start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3910                last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3911                while (start_vpfn <= last_vpfn) {
3912                        struct iova *iova;
3913                        struct dmar_drhd_unit *drhd;
3914                        struct intel_iommu *iommu;
3915                        struct page *freelist;
3916
3917                        iova = find_iova(&si_domain->iovad, start_vpfn);
3918                        if (iova == NULL) {
3919                                pr_debug("dmar: failed get IOVA for PFN %lx\n",
3920                                         start_vpfn);
3921                                break;
3922                        }
3923
3924                        iova = split_and_remove_iova(&si_domain->iovad, iova,
3925                                                     start_vpfn, last_vpfn);
3926                        if (iova == NULL) {
3927                                pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3928                                        start_vpfn, last_vpfn);
3929                                return NOTIFY_BAD;
3930                        }
3931
3932                        freelist = domain_unmap(si_domain, iova->pfn_lo,
3933                                               iova->pfn_hi);
3934
3935                        rcu_read_lock();
3936                        for_each_active_iommu(iommu, drhd)
3937                                iommu_flush_iotlb_psi(iommu, si_domain->id,
3938                                        iova->pfn_lo,
3939                                        iova->pfn_hi - iova->pfn_lo + 1,
3940                                        !freelist, 0);
3941                        rcu_read_unlock();
3942                        dma_free_pagelist(freelist);
3943
3944                        start_vpfn = iova->pfn_hi + 1;
3945                        free_iova_mem(iova);
3946                }
3947                break;
3948        }
3949
3950        return NOTIFY_OK;
3951}
3952
3953static struct notifier_block intel_iommu_memory_nb = {
3954        .notifier_call = intel_iommu_memory_notifier,
3955        .priority = 0
3956};
3957
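/*
 * Main entry point: parse the DMAR table and device scopes, set up the
 * domains and identity maps via init_dmars(), and install intel_dma_ops
 * as the DMA-API backend (disabling swiotlb when it is configured) once
 * that succeeds.
 */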
3958int __init intel_iommu_init(void)
3959{
3960        int ret = -ENODEV;
3961        struct dmar_drhd_unit *drhd;
3962        struct intel_iommu *iommu;
3963
3964        /* VT-d is required for a TXT/tboot launch, so enforce that */
3965        force_on = tboot_force_iommu();
3966
3967        if (iommu_init_mempool()) {
3968                if (force_on)
3969                        panic("tboot: Failed to initialize iommu memory\n");
3970                return -ENOMEM;
3971        }
3972
3973        down_write(&dmar_global_lock);
3974        if (dmar_table_init()) {
3975                if (force_on)
3976                        panic("tboot: Failed to initialize DMAR table\n");
3977                goto out_free_dmar;
3978        }
3979
3980        /*
3981         * Disable translation if already enabled prior to OS handover.
3982         */
3983        for_each_active_iommu(iommu, drhd)
3984                if (iommu->gcmd & DMA_GCMD_TE)
3985                        iommu_disable_translation(iommu);
3986
3987        if (dmar_dev_scope_init() < 0) {
3988                if (force_on)
3989                        panic("tboot: Failed to initialize DMAR device scope\n");
3990                goto out_free_dmar;
3991        }
3992
3993        if (no_iommu || dmar_disabled)
3994                goto out_free_dmar;
3995
3996        if (list_empty(&dmar_rmrr_units))
3997                printk(KERN_INFO "DMAR: No RMRR found\n");
3998
3999        if (list_empty(&dmar_atsr_units))
4000                printk(KERN_INFO "DMAR: No ATSR found\n");
4001
4002        if (dmar_init_reserved_ranges()) {
4003                if (force_on)
4004                        panic("tboot: Failed to reserve iommu ranges\n");
4005                goto out_free_reserved_range;
4006        }
4007
4008        init_no_remapping_devices();
4009
4010        ret = init_dmars();
4011        if (ret) {
4012                if (force_on)
4013                        panic("tboot: Failed to initialize DMARs\n");
4014                printk(KERN_ERR "IOMMU: dmar init failed\n");
4015                goto out_free_reserved_range;
4016        }
4017        up_write(&dmar_global_lock);
4018        printk(KERN_INFO
4019        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4020
4021        init_timer(&unmap_timer);
4022#ifdef CONFIG_SWIOTLB
4023        swiotlb = 0;
4024#endif
4025        dma_ops = &intel_dma_ops;
4026
4027        init_iommu_pm_ops();
4028
4029        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4030        bus_register_notifier(&pci_bus_type, &device_nb);
4031        if (si_domain && !hw_pass_through)
4032                register_memory_notifier(&intel_iommu_memory_nb);
4033
4034        intel_iommu_enabled = 1;
4035
4036        return 0;
4037
4038out_free_reserved_range:
4039        put_iova_domain(&reserved_iova_list);
4040out_free_dmar:
4041        intel_iommu_free_dmars();
4042        up_write(&dmar_global_lock);
4043        iommu_exit_mempool();
4044        return ret;
4045}
4046
4047static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4048                                           struct device *dev)
4049{
4050        struct pci_dev *tmp, *parent, *pdev;
4051
4052        if (!iommu || !dev || !dev_is_pci(dev))
4053                return;
4054
4055        pdev = to_pci_dev(dev);
4056
4057        /* dependent device detach */
4058        tmp = pci_find_upstream_pcie_bridge(pdev);
4059        /* Secondary interface's bus number and devfn 0 */
4060        if (tmp) {
4061                parent = pdev->bus->self;
4062                while (parent != tmp) {
4063                        iommu_detach_dev(iommu, parent->bus->number,
4064                                         parent->devfn);
4065                        parent = parent->bus->self;
4066                }
4067                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
4068                        iommu_detach_dev(iommu,
4069                                tmp->subordinate->number, 0);
4070                else /* this is a legacy PCI bridge */
4071                        iommu_detach_dev(iommu, tmp->bus->number,
4072                                         tmp->devfn);
4073        }
4074}
4075
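/*
 * Detach one device (and any bridges it sits behind) from @domain.  If it
 * was the last device on its IOMMU, clear that IOMMU from the domain's
 * bitmap and, for non-VM, non-identity domains, release the domain id on
 * that IOMMU as well.
 */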
4076static void domain_remove_one_dev_info(struct dmar_domain *domain,
4077                                       struct device *dev)
4078{
4079        struct device_domain_info *info, *tmp;
4080        struct intel_iommu *iommu;
4081        unsigned long flags;
4082        int found = 0;
4083        u8 bus, devfn;
4084
4085        iommu = device_to_iommu(dev, &bus, &devfn);
4086        if (!iommu)
4087                return;
4088
4089        spin_lock_irqsave(&device_domain_lock, flags);
4090        list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4091                if (info->iommu == iommu && info->bus == bus &&
4092                    info->devfn == devfn) {
4093                        unlink_domain_info(info);
4094                        spin_unlock_irqrestore(&device_domain_lock, flags);
4095
4096                        iommu_disable_dev_iotlb(info);
4097                        iommu_detach_dev(iommu, info->bus, info->devfn);
4098                        iommu_detach_dependent_devices(iommu, dev);
4099                        free_devinfo_mem(info);
4100
4101                        spin_lock_irqsave(&device_domain_lock, flags);
4102
4103                        if (found)
4104                                break;
4105                        else
4106                                continue;
4107                }
4108
4109                /* If there are no other devices under the same iommu
4110                 * owned by this domain, clear this iommu in iommu_bmp,
4111                 * update the iommu count and coherency.
4112                 */
4113                if (info->iommu == iommu)
4114                        found = 1;
4115        }
4116
4117        spin_unlock_irqrestore(&device_domain_lock, flags);
4118
4119        if (found == 0) {
4120                unsigned long tmp_flags;
4121                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
4122                clear_bit(iommu->seq_id, domain->iommu_bmp);
4123                domain->iommu_count--;
4124                domain_update_iommu_cap(domain);
4125                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
4126
4127                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
4128                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
4129                        spin_lock_irqsave(&iommu->lock, tmp_flags);
4130                        clear_bit(domain->id, iommu->domain_ids);
4131                        iommu->domains[domain->id] = NULL;
4132                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
4133                }
4134        }
4135}
4136
4137static int md_domain_init(struct dmar_domain *domain, int guest_width)
4138{
4139        int adjust_width;
4140
4141        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
4142        domain_reserve_special_ranges(domain);
4143
4144        /* calculate AGAW */
4145        domain->gaw = guest_width;
4146        adjust_width = guestwidth_to_adjustwidth(guest_width);
4147        domain->agaw = width_to_agaw(adjust_width);
4148
4149        domain->iommu_coherency = 0;
4150        domain->iommu_snooping = 0;
4151        domain->iommu_superpage = 0;
4152        domain->max_addr = 0;
4153        domain->nid = -1;
4154
4155        /* always allocate the top pgd */
4156        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4157        if (!domain->pgd)
4158                return -ENOMEM;
4159        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4160        return 0;
4161}
4162
4163static int intel_iommu_domain_init(struct iommu_domain *domain)
4164{
4165        struct dmar_domain *dmar_domain;
4166
4167        dmar_domain = alloc_domain(true);
4168        if (!dmar_domain) {
4169                printk(KERN_ERR
4170                        "intel_iommu_domain_init: dmar_domain == NULL\n");
4171                return -ENOMEM;
4172        }
4173        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4174                printk(KERN_ERR
4175                        "intel_iommu_domain_init() failed\n");
4176                domain_exit(dmar_domain);
4177                return -ENOMEM;
4178        }
4179        domain_update_iommu_cap(dmar_domain);
4180        domain->priv = dmar_domain;
4181
4182        domain->geometry.aperture_start = 0;
4183        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4184        domain->geometry.force_aperture = true;
4185
4186        return 0;
4187}
4188
4189static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4190{
4191        struct dmar_domain *dmar_domain = domain->priv;
4192
4193        domain->priv = NULL;
4194        domain_exit(dmar_domain);
4195}
4196
4197static int intel_iommu_attach_device(struct iommu_domain *domain,
4198                                     struct device *dev)
4199{
4200        struct dmar_domain *dmar_domain = domain->priv;
4201        struct intel_iommu *iommu;
4202        int addr_width;
4203        u8 bus, devfn;
4204
4205        /* normally dev is not mapped */
4206        if (unlikely(domain_context_mapped(dev))) {
4207                struct dmar_domain *old_domain;
4208
4209                old_domain = find_domain(dev);
4210                if (old_domain) {
4211                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4212                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4213                                domain_remove_one_dev_info(old_domain, dev);
4214                        else
4215                                domain_remove_dev_info(old_domain);
4216                }
4217        }
4218
4219        iommu = device_to_iommu(dev, &bus, &devfn);
4220        if (!iommu)
4221                return -ENODEV;
4222
4223        /* check if this iommu agaw is sufficient for max mapped address */
4224        addr_width = agaw_to_width(iommu->agaw);
4225        if (addr_width > cap_mgaw(iommu->cap))
4226                addr_width = cap_mgaw(iommu->cap);
4227
4228        if (dmar_domain->max_addr > (1LL << addr_width)) {
4229                printk(KERN_ERR "%s: iommu width (%d) is not "
4230                       "sufficient for the mapped address (%llx)\n",
4231                       __func__, addr_width, dmar_domain->max_addr);
4232                return -EFAULT;
4233        }
4234        dmar_domain->gaw = addr_width;
4235
4236        /*
4237         * Knock out extra levels of page tables if necessary
4238         */
4239        while (iommu->agaw < dmar_domain->agaw) {
4240                struct dma_pte *pte;
4241
4242                pte = dmar_domain->pgd;
4243                if (dma_pte_present(pte)) {
4244                        dmar_domain->pgd = (struct dma_pte *)
4245                                phys_to_virt(dma_pte_addr(pte));
4246                        free_pgtable_page(pte);
4247                }
4248                dmar_domain->agaw--;
4249        }
4250
4251        return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4252}
4253
4254static void intel_iommu_detach_device(struct iommu_domain *domain,
4255                                      struct device *dev)
4256{
4257        struct dmar_domain *dmar_domain = domain->priv;
4258
4259        domain_remove_one_dev_info(dmar_domain, dev);
4260}
4261
4262static int intel_iommu_map(struct iommu_domain *domain,
4263                           unsigned long iova, phys_addr_t hpa,
4264                           size_t size, int iommu_prot)
4265{
4266        struct dmar_domain *dmar_domain = domain->priv;
4267        u64 max_addr;
4268        int prot = 0;
4269        int ret;
4270
4271        if (iommu_prot & IOMMU_READ)
4272                prot |= DMA_PTE_READ;
4273        if (iommu_prot & IOMMU_WRITE)
4274                prot |= DMA_PTE_WRITE;
4275        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4276                prot |= DMA_PTE_SNP;
4277
4278        max_addr = iova + size;
4279        if (dmar_domain->max_addr < max_addr) {
4280                u64 end;
4281
4282                /* check if minimum agaw is sufficient for mapped address */
4283                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4284                if (end < max_addr) {
4285                        printk(KERN_ERR "%s: iommu width (%d) is not "
4286                               "sufficient for the mapped address (%llx)\n",
4287                               __func__, dmar_domain->gaw, max_addr);
4288                        return -EFAULT;
4289                }
4290                dmar_domain->max_addr = max_addr;
4291        }
4292        /* Round up size to next multiple of PAGE_SIZE, if it and
4293           the low bits of hpa would take us onto the next page */
4294        size = aligned_nrpages(hpa, size);
4295        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4296                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4297        return ret;
4298}
4299
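/*
 * IOMMU-API unmap: may unmap more than the requested size when the IOVA
 * hits a superpage mapping (the API requires this), then flush the IOTLB
 * on every IOMMU the domain is attached to and return the size actually
 * unmapped.
 */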
4300static size_t intel_iommu_unmap(struct iommu_domain *domain,
4301                                unsigned long iova, size_t size)
4302{
4303        struct dmar_domain *dmar_domain = domain->priv;
4304        struct page *freelist = NULL;
4305        struct intel_iommu *iommu;
4306        unsigned long start_pfn, last_pfn;
4307        unsigned int npages;
4308        int iommu_id, num, ndomains, level = 0;
4309
4310        /* Cope with horrid API which requires us to unmap more than the
4311           size argument if it happens to be a large-page mapping. */
4312        if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4313                BUG();
4314
4315        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4316                size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4317
4318        start_pfn = iova >> VTD_PAGE_SHIFT;
4319        last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4320
4321        freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4322
4323        npages = last_pfn - start_pfn + 1;
4324
4325        for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4326                iommu = g_iommus[iommu_id];
4327
4328                /*
4329                 * find bit position of dmar_domain
4330                 */
4331                ndomains = cap_ndoms(iommu->cap);
4332                for_each_set_bit(num, iommu->domain_ids, ndomains) {
4333                        if (iommu->domains[num] == dmar_domain)
4334                                iommu_flush_iotlb_psi(iommu, num, start_pfn,
4335                                                      npages, !freelist, 0);
4336                }
4337
4338        }
4339
4340        dma_free_pagelist(freelist);
4341
4342        if (dmar_domain->max_addr == iova + size)
4343                dmar_domain->max_addr = iova;
4344
4345        return size;
4346}
4347
4348static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4349                                            dma_addr_t iova)
4350{
4351        struct dmar_domain *dmar_domain = domain->priv;
4352        struct dma_pte *pte;
4353        int level = 0;
4354        u64 phys = 0;
4355
4356        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4357        if (pte)
4358                phys = dma_pte_addr(pte);
4359
4360        return phys;
4361}
4362
4363static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4364                                      unsigned long cap)
4365{
4366        struct dmar_domain *dmar_domain = domain->priv;
4367
4368        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4369                return dmar_domain->iommu_snooping;
4370        if (cap == IOMMU_CAP_INTR_REMAP)
4371                return irq_remapping_enabled;
4372
4373        return 0;
4374}
4375
4376#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4377
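/*
 * Work out which device actually issues DMA on behalf of @dev (upstream
 * PCIe-to-PCI bridges, quirked DMA sources, functions lacking ACS
 * isolation) and put @dev into that device's IOMMU group, allocating a
 * new group if none exists yet.
 */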
4378static int intel_iommu_add_device(struct device *dev)
4379{
4380        struct pci_dev *pdev = to_pci_dev(dev);
4381        struct pci_dev *bridge, *dma_pdev = NULL;
4382        struct iommu_group *group;
4383        int ret;
4384        u8 bus, devfn;
4385
4386        if (!device_to_iommu(dev, &bus, &devfn))
4387                return -ENODEV;
4388
4389        bridge = pci_find_upstream_pcie_bridge(pdev);
4390        if (bridge) {
4391                if (pci_is_pcie(bridge))
4392                        dma_pdev = pci_get_domain_bus_and_slot(
4393                                                pci_domain_nr(pdev->bus),
4394                                                bridge->subordinate->number, 0);
4395                if (!dma_pdev)
4396                        dma_pdev = pci_dev_get(bridge);
4397        } else
4398                dma_pdev = pci_dev_get(pdev);
4399
4400        /* Account for quirked devices */
4401        swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4402
4403        /*
4404         * If it's a multifunction device that does not support our
4405         * required ACS flags, add it to the same group as the lowest numbered
4406         * function that also does not support the required ACS flags.
4407         */
4408        if (dma_pdev->multifunction &&
4409            !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4410                u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4411
4412                for (i = 0; i < 8; i++) {
4413                        struct pci_dev *tmp;
4414
4415                        tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4416                        if (!tmp)
4417                                continue;
4418
4419                        if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4420                                swap_pci_ref(&dma_pdev, tmp);
4421                                break;
4422                        }
4423                        pci_dev_put(tmp);
4424                }
4425        }
4426
4427        /*
4428         * Devices on the root bus go through the iommu.  If that's not us,
4429         * find the next upstream device and test ACS up to the root bus.
4430         * Finding the next device may require skipping virtual buses.
4431         */
4432        while (!pci_is_root_bus(dma_pdev->bus)) {
4433                struct pci_bus *bus = dma_pdev->bus;
4434
4435                while (!bus->self) {
4436                        if (!pci_is_root_bus(bus))
4437                                bus = bus->parent;
4438                        else
4439                                goto root_bus;
4440                }
4441
4442                if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4443                        break;
4444
4445                swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4446        }
4447
4448root_bus:
4449        group = iommu_group_get(&dma_pdev->dev);
4450        pci_dev_put(dma_pdev);
4451        if (!group) {
4452                group = iommu_group_alloc();
4453                if (IS_ERR(group))
4454                        return PTR_ERR(group);
4455        }
4456
4457        ret = iommu_group_add_device(group, dev);
4458
4459        iommu_group_put(group);
4460        return ret;
4461}
4462
4463static void intel_iommu_remove_device(struct device *dev)
4464{
4465        iommu_group_remove_device(dev);
4466}
4467
4468static struct iommu_ops intel_iommu_ops = {
4469        .domain_init    = intel_iommu_domain_init,
4470        .domain_destroy = intel_iommu_domain_destroy,
4471        .attach_dev     = intel_iommu_attach_device,
4472        .detach_dev     = intel_iommu_detach_device,
4473        .map            = intel_iommu_map,
4474        .unmap          = intel_iommu_unmap,
4475        .iova_to_phys   = intel_iommu_iova_to_phys,
4476        .domain_has_cap = intel_iommu_domain_has_cap,
4477        .add_device     = intel_iommu_add_device,
4478        .remove_device  = intel_iommu_remove_device,
4479        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4480};
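
    /*
     * These ops are handed to the IOMMU core for the PCI bus from
     * intel_iommu_init(), roughly:
     *
     *        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
     *
     * after which the core calls ->add_device()/->remove_device() as PCI
     * devices are discovered and removed.
     */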
4481
4482static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4483{
4484        /* G4x/GM45 integrated gfx dmar support is totally busted. */
4485        printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4486        dmar_map_gfx = 0;
4487}
4488
4489DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4490DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4491DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4492DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4493DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4494DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4495DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
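    /*
     * Clearing dmar_map_gfx above is intended to leave the integrated
     * graphics device identity-mapped rather than translated; see the
     * IDENTMAP_GFX handling in init_dmars().
     */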
4496
4497static void quirk_iommu_rwbf(struct pci_dev *dev)
4498{
4499        /*
4500         * Mobile 4 Series Chipset neglects to set RWBF capability,
4501         * but needs it. Same seems to hold for the desktop versions.
4502         */
4503        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4504        rwbf_quirk = 1;
4505}
4506
4507DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4508DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4509DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4510DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4514
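    /*
     * GGC is the graphics control register in the host bridge's config
     * space on the chipsets quirked below.  Bits 11:8 encode how much
     * stolen memory the BIOS reserved for the GTT; encodings with
     * GGC_MEMORY_VT_ENABLED set mean that room for the VT-d shadow GTT
     * was included.  For example, a raw value of 0xb00 masked with
     * GGC_MEMORY_SIZE_MASK matches GGC_MEMORY_SIZE_4M_VT: 4M of GTT
     * stolen memory with VT space allocated.
     */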
4515#define GGC 0x52
4516#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4517#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4518#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4519#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4520#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4521#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4522#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4523#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4524
4525static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4526{
4527        unsigned short ggc;
4528
4529        if (pci_read_config_word(dev, GGC, &ggc))
4530                return;
4531
4532        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4533                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4534                dmar_map_gfx = 0;
4535        } else if (dmar_map_gfx) {
4536                /* we have to ensure the gfx device is idle before we flush */
4537                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4538                intel_iommu_strict = 1;
4539        }
4540}
4541DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4542DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4543DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4544DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4545
4546/* On Tylersburg chipsets, some BIOSes have been known to enable the
4547   ISOCH DMAR unit for the Azalia sound device, but not give it any
4548   TLB entries, which causes it to deadlock. Check for that.  We do
4549   this in a function called from init_dmars(), instead of in a PCI
4550   quirk, because we don't want to print the obnoxious "BIOS broken"
4551   message if VT-d is actually disabled.
4552*/
4553static void __init check_tylersburg_isoch(void)
4554{
4555        struct pci_dev *pdev;
4556        uint32_t vtisochctrl;
4557
4558        /* If there's no Azalia in the system anyway, forget it. */
4559        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4560        if (!pdev)
4561                return;
4562        pci_dev_put(pdev);
4563
4564        /* System Management Registers. Might be hidden, in which case
4565           we can't do the sanity check. But that's OK, because the
4566           known-broken BIOSes _don't_ actually hide it, so far. */
4567        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4568        if (!pdev)
4569                return;
4570
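            /*
             * Config offset 0x188 of this device holds the VT-d
             * isochronous control settings programmed by the BIOS,
             * which the checks below interpret.
             */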
4571        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4572                pci_dev_put(pdev);
4573                return;
4574        }
4575
4576        pci_dev_put(pdev);
4577
4578        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4579        if (vtisochctrl & 1)
4580                return;
4581
4582        /* Drop all bits other than the number of TLB entries */
4583        vtisochctrl &= 0x1c;
4584
4585        /* If we have the recommended number of TLB entries (16), fine. */
4586        if (vtisochctrl == 0x10)
4587                return;
4588
4589        /* Zero TLB entries? The BIOS is broken; warn and identity-map Azalia. */
4590        if (!vtisochctrl) {
4591                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4592                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4593                     dmi_get_system_info(DMI_BIOS_VENDOR),
4594                     dmi_get_system_info(DMI_BIOS_VERSION),
4595                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4596                iommu_identity_mapping |= IDENTMAP_AZALIA;
4597                return;
4598        }
4599
4600        printk(KERN_WARNING "DMAR: Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
4601               vtisochctrl);
4602}
4603