qemu/hw/i386/intel_iommu.c
   1/*
   2 * QEMU emulation of an Intel IOMMU (VT-d)
   3 *   (DMA Remapping device)
   4 *
   5 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com>
   6 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com>
   7 *
   8 * This program is free software; you can redistribute it and/or modify
   9 * it under the terms of the GNU General Public License as published by
  10 * the Free Software Foundation; either version 2 of the License, or
  11 * (at your option) any later version.
  12
  13 * This program is distributed in the hope that it will be useful,
  14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 * GNU General Public License for more details.
  17
  18 * You should have received a copy of the GNU General Public License along
  19 * with this program; if not, see <http://www.gnu.org/licenses/>.
  20 */
  21
  22#include "qemu/osdep.h"
  23#include "qemu/error-report.h"
  24#include "qapi/error.h"
  25#include "hw/sysbus.h"
  26#include "exec/address-spaces.h"
  27#include "intel_iommu_internal.h"
  28#include "hw/pci/pci.h"
  29#include "hw/pci/pci_bus.h"
  30#include "hw/i386/pc.h"
  31#include "hw/i386/apic-msidef.h"
  32#include "hw/boards.h"
  33#include "hw/i386/x86-iommu.h"
  34#include "hw/pci-host/q35.h"
  35#include "sysemu/kvm.h"
  36#include "hw/i386/apic_internal.h"
  37#include "kvm_i386.h"
  38#include "trace.h"
  39
  40static void vtd_address_space_refresh_all(IntelIOMMUState *s);
  41
  42static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
  43                            uint64_t wmask, uint64_t w1cmask)
  44{
  45    stq_le_p(&s->csr[addr], val);
  46    stq_le_p(&s->wmask[addr], wmask);
  47    stq_le_p(&s->w1cmask[addr], w1cmask);
  48}
  49
  50static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask)
  51{
  52    stq_le_p(&s->womask[addr], mask);
  53}
  54
  55static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val,
  56                            uint32_t wmask, uint32_t w1cmask)
  57{
  58    stl_le_p(&s->csr[addr], val);
  59    stl_le_p(&s->wmask[addr], wmask);
  60    stl_le_p(&s->w1cmask[addr], w1cmask);
  61}
  62
  63static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask)
  64{
  65    stl_le_p(&s->womask[addr], mask);
  66}
  67
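     /*
      * Register-emulation model (derived from the helpers above and below):
      * csr[] holds the current register contents, wmask[] the bits software
      * may write, w1cmask[] the write-1-to-clear bits, and womask[] the
      * write-only bits that are masked out of guest reads.
      */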
  68/* "External" get/set operations */
  69static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val)
  70{
  71    uint64_t oldval = ldq_le_p(&s->csr[addr]);
  72    uint64_t wmask = ldq_le_p(&s->wmask[addr]);
  73    uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
  74    stq_le_p(&s->csr[addr],
  75             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
  76}
  77
  78static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val)
  79{
  80    uint32_t oldval = ldl_le_p(&s->csr[addr]);
  81    uint32_t wmask = ldl_le_p(&s->wmask[addr]);
  82    uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
  83    stl_le_p(&s->csr[addr],
  84             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
  85}
  86
  87static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr)
  88{
  89    uint64_t val = ldq_le_p(&s->csr[addr]);
  90    uint64_t womask = ldq_le_p(&s->womask[addr]);
  91    return val & ~womask;
  92}
  93
  94static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr)
  95{
  96    uint32_t val = ldl_le_p(&s->csr[addr]);
  97    uint32_t womask = ldl_le_p(&s->womask[addr]);
  98    return val & ~womask;
  99}
 100
 101/* "Internal" get/set operations */
 102static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr)
 103{
 104    return ldq_le_p(&s->csr[addr]);
 105}
 106
 107static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr)
 108{
 109    return ldl_le_p(&s->csr[addr]);
 110}
 111
 112static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val)
 113{
 114    stq_le_p(&s->csr[addr], val);
 115}
 116
 117static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
 118                                        uint32_t clear, uint32_t mask)
 119{
 120    uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask;
 121    stl_le_p(&s->csr[addr], new_val);
 122    return new_val;
 123}
 124
 125static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
 126                                        uint64_t clear, uint64_t mask)
 127{
 128    uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask;
 129    stq_le_p(&s->csr[addr], new_val);
 130    return new_val;
 131}
 132
 133static inline void vtd_iommu_lock(IntelIOMMUState *s)
 134{
 135    qemu_mutex_lock(&s->iommu_lock);
 136}
 137
 138static inline void vtd_iommu_unlock(IntelIOMMUState *s)
 139{
 140    qemu_mutex_unlock(&s->iommu_lock);
 141}
 142
 143/* Whether the address space needs to notify new mappings */
 144static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
 145{
 146    return as->notifier_flags & IOMMU_NOTIFIER_MAP;
 147}
 148
 149/* GHashTable functions */
 150static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 151{
 152    return *((const uint64_t *)v1) == *((const uint64_t *)v2);
 153}
 154
 155static guint vtd_uint64_hash(gconstpointer v)
 156{
 157    return (guint)*(const uint64_t *)v;
 158}
 159
 160static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
 161                                          gpointer user_data)
 162{
 163    VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
 164    uint16_t domain_id = *(uint16_t *)user_data;
 165    return entry->domain_id == domain_id;
 166}
 167
 168/* The shift of an addr for a certain level of paging structure */
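     /* Level 1 is the 4K leaf level: assuming the usual definitions
      * VTD_PAGE_SHIFT_4K == 12 and VTD_SL_LEVEL_BITS == 9, level 1 gives a
      * shift of 12, level 2 gives 21 (2M pages) and level 3 gives 30 (1G). */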
 169static inline uint32_t vtd_slpt_level_shift(uint32_t level)
 170{
 171    assert(level != 0);
 172    return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
 173}
 174
 175static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
 176{
 177    return ~((1ULL << vtd_slpt_level_shift(level)) - 1);
 178}
 179
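     /*
      * Match an IOTLB entry against a page-selective invalidation: hit if the
      * entry's gfn falls within the invalidated range (info->mask), or if the
      * invalidated address falls within a large-page entry (entry->mask).
      */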
 180static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
 181                                        gpointer user_data)
 182{
 183    VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
 184    VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
 185    uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
 186    uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
 187    return (entry->domain_id == info->domain_id) &&
 188            (((entry->gfn & info->mask) == gfn) ||
 189             (entry->gfn == gfn_tlb));
 190}
 191
  192/* Reset context_cache_gen of all VTDAddressSpaces to zero and set that
  193 * of IntelIOMMUState to 1.  Must be called with IOMMU lock held.
  194 */
 195static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
 196{
 197    VTDAddressSpace *vtd_as;
 198    VTDBus *vtd_bus;
 199    GHashTableIter bus_it;
 200    uint32_t devfn_it;
 201
 202    trace_vtd_context_cache_reset();
 203
 204    g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr);
 205
  206    while (g_hash_table_iter_next(&bus_it, NULL, (void **)&vtd_bus)) {
 207        for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
 208            vtd_as = vtd_bus->dev_as[devfn_it];
 209            if (!vtd_as) {
 210                continue;
 211            }
 212            vtd_as->context_cache_entry.context_cache_gen = 0;
 213        }
 214    }
 215    s->context_cache_gen = 1;
 216}
 217
 218/* Must be called with IOMMU lock held. */
 219static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
 220{
 221    assert(s->iotlb);
 222    g_hash_table_remove_all(s->iotlb);
 223}
 224
 225static void vtd_reset_iotlb(IntelIOMMUState *s)
 226{
 227    vtd_iommu_lock(s);
 228    vtd_reset_iotlb_locked(s);
 229    vtd_iommu_unlock(s);
 230}
 231
 232static void vtd_reset_caches(IntelIOMMUState *s)
 233{
 234    vtd_iommu_lock(s);
 235    vtd_reset_iotlb_locked(s);
 236    vtd_reset_context_cache_locked(s);
 237    vtd_iommu_unlock(s);
 238}
 239
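     /*
      * IOTLB keys pack the gfn in the low bits with the source-id and the
      * page-table level above it (VTD_IOTLB_SID_SHIFT / VTD_IOTLB_LVL_SHIFT),
      * so a single hash table can cache translations of every page size for
      * every requester.
      */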
 240static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
 241                                  uint32_t level)
 242{
 243    return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
 244           ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
 245}
 246
 247static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
 248{
 249    return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
 250}
 251
 252/* Must be called with IOMMU lock held */
 253static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
 254                                       hwaddr addr)
 255{
 256    VTDIOTLBEntry *entry;
 257    uint64_t key;
 258    int level;
 259
 260    for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
 261        key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
 262                                source_id, level);
 263        entry = g_hash_table_lookup(s->iotlb, &key);
 264        if (entry) {
 265            goto out;
 266        }
 267    }
 268
 269out:
 270    return entry;
 271}
 272
  273/* Must be called with IOMMU lock held */
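     /* Note: once the cache reaches VTD_IOTLB_MAX_SIZE entries, the whole
      * IOTLB is flushed rather than evicting individual entries. */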
 274static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
 275                             uint16_t domain_id, hwaddr addr, uint64_t slpte,
 276                             uint8_t access_flags, uint32_t level)
 277{
 278    VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
 279    uint64_t *key = g_malloc(sizeof(*key));
 280    uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
 281
 282    trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
 283    if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
 284        trace_vtd_iotlb_reset("iotlb exceeds size limit");
 285        vtd_reset_iotlb_locked(s);
 286    }
 287
 288    entry->gfn = gfn;
 289    entry->domain_id = domain_id;
 290    entry->slpte = slpte;
 291    entry->access_flags = access_flags;
 292    entry->mask = vtd_slpt_level_page_mask(level);
 293    *key = vtd_get_iotlb_key(gfn, source_id, level);
 294    g_hash_table_replace(s->iotlb, key, entry);
 295}
 296
  297/* Given the register addresses of the message data and message address,
  298 * generate an interrupt via MSI.
  299 */
 300static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
 301                                   hwaddr mesg_data_reg)
 302{
 303    MSIMessage msi;
 304
 305    assert(mesg_data_reg < DMAR_REG_SIZE);
 306    assert(mesg_addr_reg < DMAR_REG_SIZE);
 307
 308    msi.address = vtd_get_long_raw(s, mesg_addr_reg);
 309    msi.data = vtd_get_long_raw(s, mesg_data_reg);
 310
 311    trace_vtd_irq_generate(msi.address, msi.data);
 312
 313    apic_get_class()->send_msi(&msi);
 314}
 315
  316/* Generate a fault event to software via MSI if conditions are met.
  317 * Note that the FSTS_REG value passed in should be the value before
  318 * any update.
  319 */
 320static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
 321{
 322    if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
 323        pre_fsts & VTD_FSTS_IQE) {
 324        trace_vtd_err("There are previous interrupt conditions "
 325                      "to be serviced by software, fault event "
 326                      "is not generated.");
 327        return;
 328    }
 329    vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
 330    if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
 331        trace_vtd_err("Interrupt Mask set, irq is not generated.");
 332    } else {
 333        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
 334        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
 335    }
 336}
 337
 338/* Check if the Fault (F) field of the Fault Recording Register referenced by
 339 * @index is Set.
 340 */
 341static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
 342{
 343    /* Each reg is 128-bit */
 344    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
 345    addr += 8; /* Access the high 64-bit half */
 346
 347    assert(index < DMAR_FRCD_REG_NR);
 348
 349    return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
 350}
 351
  352/* Update the PPF field of the Fault Status Register.
  353 * Should be called whenever the F field of any fault recording register
  354 * changes.
  355 */
 356static void vtd_update_fsts_ppf(IntelIOMMUState *s)
 357{
 358    uint32_t i;
 359    uint32_t ppf_mask = 0;
 360
 361    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
 362        if (vtd_is_frcd_set(s, i)) {
 363            ppf_mask = VTD_FSTS_PPF;
 364            break;
 365        }
 366    }
 367    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
 368    trace_vtd_fsts_ppf(!!ppf_mask);
 369}
 370
 371static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index)
 372{
 373    /* Each reg is 128-bit */
 374    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
 375    addr += 8; /* Access the high 64-bit half */
 376
 377    assert(index < DMAR_FRCD_REG_NR);
 378
 379    vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
 380    vtd_update_fsts_ppf(s);
 381}
 382
 383/* Must not update F field now, should be done later */
 384static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
 385                            uint16_t source_id, hwaddr addr,
 386                            VTDFaultReason fault, bool is_write)
 387{
 388    uint64_t hi = 0, lo;
 389    hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
 390
 391    assert(index < DMAR_FRCD_REG_NR);
 392
 393    lo = VTD_FRCD_FI(addr);
 394    hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
 395    if (!is_write) {
 396        hi |= VTD_FRCD_T;
 397    }
 398    vtd_set_quad_raw(s, frcd_reg_addr, lo);
 399    vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);
 400
 401    trace_vtd_frr_new(index, hi, lo);
 402}
 403
 404/* Try to collapse multiple pending faults from the same requester */
 405static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
 406{
 407    uint32_t i;
 408    uint64_t frcd_reg;
 409    hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */
 410
 411    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
 412        frcd_reg = vtd_get_quad_raw(s, addr);
 413        if ((frcd_reg & VTD_FRCD_F) &&
 414            ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
 415            return true;
 416        }
 417        addr += 16; /* 128-bit for each */
 418    }
 419    return false;
 420}
 421
  422/* Log and report a DMAR (address translation) fault to software */
 423static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
 424                                  hwaddr addr, VTDFaultReason fault,
 425                                  bool is_write)
 426{
 427    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
 428
 429    assert(fault < VTD_FR_MAX);
 430
 431    if (fault == VTD_FR_RESERVED_ERR) {
 432        /* This is not a normal fault reason case. Drop it. */
 433        return;
 434    }
 435
 436    trace_vtd_dmar_fault(source_id, fault, addr, is_write);
 437
 438    if (fsts_reg & VTD_FSTS_PFO) {
 439        trace_vtd_err("New fault is not recorded due to "
 440                      "Primary Fault Overflow.");
 441        return;
 442    }
 443
 444    if (vtd_try_collapse_fault(s, source_id)) {
 445        trace_vtd_err("New fault is not recorded due to "
 446                      "compression of faults.");
 447        return;
 448    }
 449
 450    if (vtd_is_frcd_set(s, s->next_frcd_reg)) {
 451        trace_vtd_err("Next Fault Recording Reg is used, "
 452                      "new fault is not recorded, set PFO field.");
 453        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
 454        return;
 455    }
 456
 457    vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
 458
 459    if (fsts_reg & VTD_FSTS_PPF) {
 460        trace_vtd_err("There are pending faults already, "
 461                      "fault event is not generated.");
 462        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg);
 463        s->next_frcd_reg++;
 464        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
 465            s->next_frcd_reg = 0;
 466        }
 467    } else {
 468        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
 469                                VTD_FSTS_FRI(s->next_frcd_reg));
 470        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */
 471        s->next_frcd_reg++;
 472        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
 473            s->next_frcd_reg = 0;
 474        }
  475        /* This case actually causes the PPF to be set.
  476         * So generate a fault event (interrupt).
  477         */
  478        vtd_generate_fault_event(s, fsts_reg);
 479    }
 480}
 481
  482/* Handle error conditions (Invalidation Queue Errors) of the queued
  483 * invalidation interface.
  484 */
 485static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
 486{
 487    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
 488
 489    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
 490    vtd_generate_fault_event(s, fsts_reg);
 491}
 492
 493/* Set the IWC field and try to generate an invalidation completion interrupt */
 494static void vtd_generate_completion_event(IntelIOMMUState *s)
 495{
 496    if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
 497        trace_vtd_inv_desc_wait_irq("One pending, skip current");
 498        return;
 499    }
 500    vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
 501    vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
 502    if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
 503        trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, "
 504                                    "new event not generated");
 505        return;
 506    } else {
 507        /* Generate the interrupt event */
 508        trace_vtd_inv_desc_wait_irq("Generating complete event");
 509        vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
 510        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
 511    }
 512}
 513
 514static inline bool vtd_root_entry_present(VTDRootEntry *root)
 515{
 516    return root->val & VTD_ROOT_ENTRY_P;
 517}
 518
 519static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
 520                              VTDRootEntry *re)
 521{
 522    dma_addr_t addr;
 523
 524    addr = s->root + index * sizeof(*re);
 525    if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) {
 526        trace_vtd_re_invalid(re->rsvd, re->val);
 527        re->val = 0;
 528        return -VTD_FR_ROOT_TABLE_INV;
 529    }
 530    re->val = le64_to_cpu(re->val);
 531    return 0;
 532}
 533
 534static inline bool vtd_ce_present(VTDContextEntry *context)
 535{
 536    return context->lo & VTD_CONTEXT_ENTRY_P;
 537}
 538
 539static int vtd_get_context_entry_from_root(VTDRootEntry *root, uint8_t index,
 540                                           VTDContextEntry *ce)
 541{
 542    dma_addr_t addr;
 543
 544    /* we have checked that root entry is present */
 545    addr = (root->val & VTD_ROOT_ENTRY_CTP) + index * sizeof(*ce);
 546    if (dma_memory_read(&address_space_memory, addr, ce, sizeof(*ce))) {
 547        trace_vtd_re_invalid(root->rsvd, root->val);
 548        return -VTD_FR_CONTEXT_TABLE_INV;
 549    }
 550    ce->lo = le64_to_cpu(ce->lo);
 551    ce->hi = le64_to_cpu(ce->hi);
 552    return 0;
 553}
 554
 555static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
 556{
 557    return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
 558}
 559
 560static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
 561{
 562    return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
 563}
 564
 565/* Whether the pte indicates the address of the page frame */
 566static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
 567{
 568    return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
 569}
 570
  571/* Get the content of an slpte located at @base_addr[@index] */
 572static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
 573{
 574    uint64_t slpte;
 575
 576    assert(index < VTD_SL_PT_ENTRY_NR);
 577
 578    if (dma_memory_read(&address_space_memory,
 579                        base_addr + index * sizeof(slpte), &slpte,
 580                        sizeof(slpte))) {
 581        slpte = (uint64_t)-1;
 582        return slpte;
 583    }
 584    slpte = le64_to_cpu(slpte);
 585    return slpte;
 586}
 587
  588/* Given an iova and the level of the paging structure, return the offset
  589 * within the current level.
  590 */
 591static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
 592{
 593    return (iova >> vtd_slpt_level_shift(level)) &
 594            ((1ULL << VTD_SL_LEVEL_BITS) - 1);
 595}
 596
 597/* Check Capability Register to see if the @level of page-table is supported */
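     /* SAGAW bit 1 corresponds to a 3-level (39-bit) table and bit 2 to a
      * 4-level (48-bit) table, hence the "level - 2" in the shift below. */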
 598static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
 599{
 600    return VTD_CAP_SAGAW_MASK & s->cap &
 601           (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
 602}
 603
 604/* Get the page-table level that hardware should use for the second-level
 605 * page-table walk from the Address Width field of context-entry.
 606 */
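     /* The AW field encodes the number of levels beyond two: AW=1 means a
      * 3-level, 39-bit table and AW=2 a 4-level, 48-bit table, which is why
      * the level is 2 + AW and the AGAW is 30 + AW * 9 below. */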
 607static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
 608{
 609    return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
 610}
 611
 612static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
 613{
 614    return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
 615}
 616
 617static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
 618{
 619    return ce->lo & VTD_CONTEXT_ENTRY_TT;
 620}
 621
 622/* Return true if check passed, otherwise false */
 623static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
 624                                     VTDContextEntry *ce)
 625{
 626    switch (vtd_ce_get_type(ce)) {
 627    case VTD_CONTEXT_TT_MULTI_LEVEL:
 628        /* Always supported */
 629        break;
 630    case VTD_CONTEXT_TT_DEV_IOTLB:
 631        if (!x86_iommu->dt_supported) {
 632            return false;
 633        }
 634        break;
 635    case VTD_CONTEXT_TT_PASS_THROUGH:
 636        if (!x86_iommu->pt_supported) {
 637            return false;
 638        }
 639        break;
 640    default:
  641        /* Unknown type */
 642        return false;
 643    }
 644    return true;
 645}
 646
 647static inline uint64_t vtd_iova_limit(VTDContextEntry *ce, uint8_t aw)
 648{
 649    uint32_t ce_agaw = vtd_ce_get_agaw(ce);
 650    return 1ULL << MIN(ce_agaw, aw);
 651}
 652
 653/* Return true if IOVA passes range check, otherwise false. */
 654static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
 655                                        uint8_t aw)
 656{
 657    /*
 658     * Check if @iova is above 2^X-1, where X is the minimum of MGAW
 659     * in CAP_REG and AW in context-entry.
 660     */
 661    return !(iova & ~(vtd_iova_limit(ce, aw) - 1));
 662}
 663
 664/*
  665 * Reserved-bit masks for slptes, indexed by paging level:
  666 *     Index [1] to [4]: 4K pages
  667 *     Index [5] to [8]: large pages
 668 */
 669static uint64_t vtd_paging_entry_rsvd_field[9];
 670
 671static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
 672{
 673    if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) {
 674        /* Maybe large page */
 675        return slpte & vtd_paging_entry_rsvd_field[level + 4];
 676    } else {
 677        return slpte & vtd_paging_entry_rsvd_field[level];
 678    }
 679}
 680
 681/* Find the VTD address space associated with a given bus number */
 682static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
 683{
 684    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
 685    if (!vtd_bus) {
 686        /*
 687         * Iterate over the registered buses to find the one which
  688         * currently holds this bus number, and update the bus_num
 689         * lookup table:
 690         */
 691        GHashTableIter iter;
 692
 693        g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
 694        while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
 695            if (pci_bus_num(vtd_bus->bus) == bus_num) {
 696                s->vtd_as_by_bus_num[bus_num] = vtd_bus;
 697                return vtd_bus;
 698            }
 699        }
 700    }
 701    return vtd_bus;
 702}
 703
  704/* Given the @iova, get the relevant @slptep. @slpte_level will be the last
  705 * level of the translation, which can be used to decide large page size.
  706 */
 707static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
 708                             uint64_t *slptep, uint32_t *slpte_level,
 709                             bool *reads, bool *writes, uint8_t aw_bits)
 710{
 711    dma_addr_t addr = vtd_ce_get_slpt_base(ce);
 712    uint32_t level = vtd_ce_get_level(ce);
 713    uint32_t offset;
 714    uint64_t slpte;
 715    uint64_t access_right_check;
 716
 717    if (!vtd_iova_range_check(iova, ce, aw_bits)) {
 718        trace_vtd_err_dmar_iova_overflow(iova);
 719        return -VTD_FR_ADDR_BEYOND_MGAW;
 720    }
 721
 722    /* FIXME: what is the Atomics request here? */
 723    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
 724
 725    while (true) {
 726        offset = vtd_iova_level_offset(iova, level);
 727        slpte = vtd_get_slpte(addr, offset);
 728
 729        if (slpte == (uint64_t)-1) {
 730            trace_vtd_err_dmar_slpte_read_error(iova, level);
 731            if (level == vtd_ce_get_level(ce)) {
 732                /* Invalid programming of context-entry */
 733                return -VTD_FR_CONTEXT_ENTRY_INV;
 734            } else {
 735                return -VTD_FR_PAGING_ENTRY_INV;
 736            }
 737        }
 738        *reads = (*reads) && (slpte & VTD_SL_R);
 739        *writes = (*writes) && (slpte & VTD_SL_W);
 740        if (!(slpte & access_right_check)) {
 741            trace_vtd_err_dmar_slpte_perm_error(iova, level, slpte, is_write);
 742            return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
 743        }
 744        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
 745            trace_vtd_err_dmar_slpte_resv_error(iova, level, slpte);
 746            return -VTD_FR_PAGING_ENTRY_RSVD;
 747        }
 748
 749        if (vtd_is_last_slpte(slpte, level)) {
 750            *slptep = slpte;
 751            *slpte_level = level;
 752            return 0;
 753        }
 754        addr = vtd_get_slpte_addr(slpte, aw_bits);
 755        level--;
 756    }
 757}
 758
 759typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
 760
 761/**
 762 * Constant information used during page walking
 763 *
  764 * @hook_fn: hook function to be called for each detected page
 765 * @private: private data to be passed into hook func
 766 * @notify_unmap: whether we should notify invalid entries
 767 * @as: VT-d address space of the device
 768 * @aw: maximum address width
  769 * @domain_id: domain ID of the page walk
 770 */
 771typedef struct {
 772    VTDAddressSpace *as;
 773    vtd_page_walk_hook hook_fn;
 774    void *private;
 775    bool notify_unmap;
 776    uint8_t aw;
 777    uint16_t domain_id;
 778} vtd_page_walk_info;
 779
 780static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
 781{
 782    VTDAddressSpace *as = info->as;
 783    vtd_page_walk_hook hook_fn = info->hook_fn;
 784    void *private = info->private;
 785    DMAMap target = {
 786        .iova = entry->iova,
 787        .size = entry->addr_mask,
 788        .translated_addr = entry->translated_addr,
 789        .perm = entry->perm,
 790    };
 791    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
 792
 793    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
 794        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
 795        return 0;
 796    }
 797
 798    assert(hook_fn);
 799
 800    /* Update local IOVA mapped ranges */
 801    if (entry->perm) {
 802        if (mapped) {
 803            /* If it's exactly the same translation, skip */
 804            if (!memcmp(mapped, &target, sizeof(target))) {
 805                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
 806                                                 entry->translated_addr);
 807                return 0;
 808            } else {
  809                /*
  810                 * Translation changed.  Normally this should not
  811                 * happen, but it can happen with buggy guest
  812                 * OSes.  Note that there will be a small window
  813                 * during which we have no mapping at all, but
  814                 * that's the best effort we can do.  The ideal way
  815                 * to emulate this is to atomically modify the PTE
  816                 * to follow what has changed, but we can't.  One
  817                 * example is that the vfio driver only has
  818                 * VFIO_IOMMU_[UN]MAP_DMA and no interface to modify
  819                 * a mapping (meanwhile it seems meaningless to even
  820                 * provide one).  Anyway, let's mark this as a TODO
  821                 * in case one day we have a better solution.
  822                 */
 823                IOMMUAccessFlags cache_perm = entry->perm;
 824                int ret;
 825
 826                /* Emulate an UNMAP */
 827                entry->perm = IOMMU_NONE;
 828                trace_vtd_page_walk_one(info->domain_id,
 829                                        entry->iova,
 830                                        entry->translated_addr,
 831                                        entry->addr_mask,
 832                                        entry->perm);
 833                ret = hook_fn(entry, private);
 834                if (ret) {
 835                    return ret;
 836                }
 837                /* Drop any existing mapping */
 838                iova_tree_remove(as->iova_tree, &target);
 839                /* Recover the correct permission */
 840                entry->perm = cache_perm;
 841            }
 842        }
 843        iova_tree_insert(as->iova_tree, &target);
 844    } else {
 845        if (!mapped) {
 846            /* Skip since we didn't map this range at all */
 847            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
 848            return 0;
 849        }
 850        iova_tree_remove(as->iova_tree, &target);
 851    }
 852
 853    trace_vtd_page_walk_one(info->domain_id, entry->iova,
 854                            entry->translated_addr, entry->addr_mask,
 855                            entry->perm);
 856    return hook_fn(entry, private);
 857}
 858
 859/**
 860 * vtd_page_walk_level - walk over specific level for IOVA range
 861 *
 862 * @addr: base GPA addr to start the walk
 863 * @start: IOVA range start address
  864 * @end: IOVA range end address (start <= iova < end)
 865 * @read: whether parent level has read permission
 866 * @write: whether parent level has write permission
 867 * @info: constant information for the page walk
 868 */
 869static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
 870                               uint64_t end, uint32_t level, bool read,
 871                               bool write, vtd_page_walk_info *info)
 872{
 873    bool read_cur, write_cur, entry_valid;
 874    uint32_t offset;
 875    uint64_t slpte;
 876    uint64_t subpage_size, subpage_mask;
 877    IOMMUTLBEntry entry;
 878    uint64_t iova = start;
 879    uint64_t iova_next;
 880    int ret = 0;
 881
 882    trace_vtd_page_walk_level(addr, level, start, end);
 883
 884    subpage_size = 1ULL << vtd_slpt_level_shift(level);
 885    subpage_mask = vtd_slpt_level_page_mask(level);
 886
 887    while (iova < end) {
 888        iova_next = (iova & subpage_mask) + subpage_size;
 889
 890        offset = vtd_iova_level_offset(iova, level);
 891        slpte = vtd_get_slpte(addr, offset);
 892
 893        if (slpte == (uint64_t)-1) {
 894            trace_vtd_page_walk_skip_read(iova, iova_next);
 895            goto next;
 896        }
 897
 898        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
 899            trace_vtd_page_walk_skip_reserve(iova, iova_next);
 900            goto next;
 901        }
 902
 903        /* Permissions are stacked with parents' */
 904        read_cur = read && (slpte & VTD_SL_R);
 905        write_cur = write && (slpte & VTD_SL_W);
 906
 907        /*
 908         * As long as we have either read/write permission, this is a
 909         * valid entry. The rule works for both page entries and page
 910         * table entries.
 911         */
 912        entry_valid = read_cur | write_cur;
 913
 914        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
 915            /*
 916             * This is a valid PDE (or even bigger than PDE).  We need
 917             * to walk one further level.
 918             */
 919            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
 920                                      iova, MIN(iova_next, end), level - 1,
 921                                      read_cur, write_cur, info);
 922        } else {
 923            /*
 924             * This means we are either:
 925             *
 926             * (1) the real page entry (either 4K page, or huge page)
 927             * (2) the whole range is invalid
 928             *
 929             * In either case, we send an IOTLB notification down.
 930             */
 931            entry.target_as = &address_space_memory;
 932            entry.iova = iova & subpage_mask;
 933            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
 934            entry.addr_mask = ~subpage_mask;
 935            /* NOTE: this is only meaningful if entry_valid == true */
 936            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
 937            ret = vtd_page_walk_one(&entry, info);
 938        }
 939
 940        if (ret < 0) {
 941            return ret;
 942        }
 943
 944next:
 945        iova = iova_next;
 946    }
 947
 948    return 0;
 949}
 950
 951/**
 952 * vtd_page_walk - walk specific IOVA range, and call the hook
 953 *
 954 * @ce: context entry to walk upon
 955 * @start: IOVA address to start the walk
  956 * @end: IOVA range end address (start <= iova < end)
 957 * @info: page walking information struct
 958 */
 959static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
 960                         vtd_page_walk_info *info)
 961{
 962    dma_addr_t addr = vtd_ce_get_slpt_base(ce);
 963    uint32_t level = vtd_ce_get_level(ce);
 964
 965    if (!vtd_iova_range_check(start, ce, info->aw)) {
 966        return -VTD_FR_ADDR_BEYOND_MGAW;
 967    }
 968
 969    if (!vtd_iova_range_check(end, ce, info->aw)) {
 970        /* Fix end so that it reaches the maximum */
 971        end = vtd_iova_limit(ce, info->aw);
 972    }
 973
 974    return vtd_page_walk_level(addr, start, end, level, true, true, info);
 975}
 976
 977/* Map a device to its corresponding domain (context-entry) */
 978static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
 979                                    uint8_t devfn, VTDContextEntry *ce)
 980{
 981    VTDRootEntry re;
 982    int ret_fr;
 983    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 984
 985    ret_fr = vtd_get_root_entry(s, bus_num, &re);
 986    if (ret_fr) {
 987        return ret_fr;
 988    }
 989
 990    if (!vtd_root_entry_present(&re)) {
  991        /* Not an error - it's okay if we don't have a root entry. */
 992        trace_vtd_re_not_present(bus_num);
 993        return -VTD_FR_ROOT_ENTRY_P;
 994    }
 995
 996    if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(s->aw_bits))) {
 997        trace_vtd_re_invalid(re.rsvd, re.val);
 998        return -VTD_FR_ROOT_ENTRY_RSVD;
 999    }
1000
1001    ret_fr = vtd_get_context_entry_from_root(&re, devfn, ce);
1002    if (ret_fr) {
1003        return ret_fr;
1004    }
1005
1006    if (!vtd_ce_present(ce)) {
 1007        /* Not an error - it's okay if we don't have a context entry. */
1008        trace_vtd_ce_not_present(bus_num, devfn);
1009        return -VTD_FR_CONTEXT_ENTRY_P;
1010    }
1011
1012    if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
 1013        (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
1014        trace_vtd_ce_invalid(ce->hi, ce->lo);
1015        return -VTD_FR_CONTEXT_ENTRY_RSVD;
1016    }
1017
1018    /* Check if the programming of context-entry is valid */
1019    if (!vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
1020        trace_vtd_ce_invalid(ce->hi, ce->lo);
1021        return -VTD_FR_CONTEXT_ENTRY_INV;
1022    }
1023
1024    /* Do translation type check */
1025    if (!vtd_ce_type_check(x86_iommu, ce)) {
1026        trace_vtd_ce_invalid(ce->hi, ce->lo);
1027        return -VTD_FR_CONTEXT_ENTRY_INV;
1028    }
1029
1030    return 0;
1031}
1032
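     /* Page-walk hook that forwards each walked entry as an IOMMU notification
      * on the memory region passed in via @private. */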
1033static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
1034                                     void *private)
1035{
1036    memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry);
1037    return 0;
1038}
1039
1040/* If context entry is NULL, we'll try to fetch it on our own. */
1041static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
1042                                            VTDContextEntry *ce,
1043                                            hwaddr addr, hwaddr size)
1044{
1045    IntelIOMMUState *s = vtd_as->iommu_state;
1046    vtd_page_walk_info info = {
1047        .hook_fn = vtd_sync_shadow_page_hook,
1048        .private = (void *)&vtd_as->iommu,
1049        .notify_unmap = true,
1050        .aw = s->aw_bits,
1051        .as = vtd_as,
1052    };
1053    VTDContextEntry ce_cache;
1054    int ret;
1055
1056    if (ce) {
1057        /* If the caller provided context entry, use it */
1058        ce_cache = *ce;
1059    } else {
1060        /* If the caller didn't provide ce, try to fetch */
1061        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1062                                       vtd_as->devfn, &ce_cache);
1063        if (ret) {
 1064            /*
 1065             * This should not really happen, but if it does,
 1066             * we just skip the sync for this time.  After all, we
 1067             * don't even have the root table pointer!
 1068             */
1069            trace_vtd_err("Detected invalid context entry when "
1070                          "trying to sync shadow page table");
1071            return 0;
1072        }
1073    }
1074
1075    info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
1076
1077    return vtd_page_walk(&ce_cache, addr, addr + size, &info);
1078}
1079
1080static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
1081{
1082    return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
1083}
1084
 1085/*
 1086 * Fetch the translation type for a specific device. Returns <0 on
 1087 * error, otherwise returns the shifted type to check against
 1088 * VTD_CONTEXT_TT_*.
 1089 */
1090static int vtd_dev_get_trans_type(VTDAddressSpace *as)
1091{
1092    IntelIOMMUState *s;
1093    VTDContextEntry ce;
1094    int ret;
1095
1096    s = as->iommu_state;
1097
1098    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
1099                                   as->devfn, &ce);
1100    if (ret) {
1101        return ret;
1102    }
1103
1104    return vtd_ce_get_type(&ce);
1105}
1106
1107static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
1108{
1109    int ret;
1110
1111    assert(as);
1112
1113    ret = vtd_dev_get_trans_type(as);
1114    if (ret < 0) {
1115        /*
1116         * Possibly failed to parse the context entry for some reason
1117         * (e.g., during init, or any guest configuration errors on
1118         * context entries). We should assume PT not enabled for
1119         * safety.
1120         */
1121        return false;
1122    }
1123
1124    return ret == VTD_CONTEXT_TT_PASS_THROUGH;
1125}
1126
1127/* Return whether the device is using IOMMU translation. */
1128static bool vtd_switch_address_space(VTDAddressSpace *as)
1129{
1130    bool use_iommu;
1131    /* Whether we need to take the BQL on our own */
1132    bool take_bql = !qemu_mutex_iothread_locked();
1133
1134    assert(as);
1135
 1136    use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as);
1137
1138    trace_vtd_switch_address_space(pci_bus_num(as->bus),
1139                                   VTD_PCI_SLOT(as->devfn),
1140                                   VTD_PCI_FUNC(as->devfn),
1141                                   use_iommu);
1142
 1143    /*
 1144     * It's possible that we reach here without holding the BQL, e.g.,
 1145     * when called from vtd_pt_enable_fast_path(). However the memory
 1146     * APIs need it, so make sure we already hold it, or take it here.
 1147     */
1148    if (take_bql) {
1149        qemu_mutex_lock_iothread();
1150    }
1151
 1152    /* Turn one region off first, then turn the other on */
1153    if (use_iommu) {
1154        memory_region_set_enabled(&as->sys_alias, false);
1155        memory_region_set_enabled(MEMORY_REGION(&as->iommu), true);
1156    } else {
1157        memory_region_set_enabled(MEMORY_REGION(&as->iommu), false);
1158        memory_region_set_enabled(&as->sys_alias, true);
1159    }
1160
1161    if (take_bql) {
1162        qemu_mutex_unlock_iothread();
1163    }
1164
1165    return use_iommu;
1166}
1167
1168static void vtd_switch_address_space_all(IntelIOMMUState *s)
1169{
1170    GHashTableIter iter;
1171    VTDBus *vtd_bus;
1172    int i;
1173
1174    g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
1175    while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
1176        for (i = 0; i < PCI_DEVFN_MAX; i++) {
1177            if (!vtd_bus->dev_as[i]) {
1178                continue;
1179            }
1180            vtd_switch_address_space(vtd_bus->dev_as[i]);
1181        }
1182    }
1183}
1184
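     /* Source-id layout: bus number in bits 15:8, devfn in bits 7:0 (i.e. the
      * PCI requester ID). */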
1185static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
1186{
1187    return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
1188}
1189
1190static const bool vtd_qualified_faults[] = {
1191    [VTD_FR_RESERVED] = false,
1192    [VTD_FR_ROOT_ENTRY_P] = false,
1193    [VTD_FR_CONTEXT_ENTRY_P] = true,
1194    [VTD_FR_CONTEXT_ENTRY_INV] = true,
1195    [VTD_FR_ADDR_BEYOND_MGAW] = true,
1196    [VTD_FR_WRITE] = true,
1197    [VTD_FR_READ] = true,
1198    [VTD_FR_PAGING_ENTRY_INV] = true,
1199    [VTD_FR_ROOT_TABLE_INV] = false,
1200    [VTD_FR_CONTEXT_TABLE_INV] = false,
1201    [VTD_FR_ROOT_ENTRY_RSVD] = false,
1202    [VTD_FR_PAGING_ENTRY_RSVD] = true,
1203    [VTD_FR_CONTEXT_ENTRY_TT] = true,
1204    [VTD_FR_RESERVED_ERR] = false,
1205    [VTD_FR_MAX] = false,
1206};
1207
 1208/* A fault condition is "qualified" if it is reported to software only
 1209 * when the FPD field in the context-entry used to process the faulting
 1210 * request is 0.
 1211 */
1212static inline bool vtd_is_qualified_fault(VTDFaultReason fault)
1213{
1214    return vtd_qualified_faults[fault];
1215}
1216
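     /* Whether @addr falls in the x86 interrupt address window (assuming
      * VTD_INTERRUPT_ADDR_FIRST/LAST cover 0xfee00000 to 0xfeefffff as in
      * intel_iommu_internal.h). */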
1217static inline bool vtd_is_interrupt_addr(hwaddr addr)
1218{
1219    return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
1220}
1221
1222static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
1223{
1224    VTDBus *vtd_bus;
1225    VTDAddressSpace *vtd_as;
1226    bool success = false;
1227
1228    vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
1229    if (!vtd_bus) {
1230        goto out;
1231    }
1232
1233    vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
1234    if (!vtd_as) {
1235        goto out;
1236    }
1237
1238    if (vtd_switch_address_space(vtd_as) == false) {
1239        /* We switched off IOMMU region successfully. */
1240        success = true;
1241    }
1242
1243out:
1244    trace_vtd_pt_enable_fast_path(source_id, success);
1245}
1246
 1247/* Map the device to its context-entry, then do a paging-structures walk
 1248 * to perform an IOMMU translation.
 1249 *
 1250 * Called from an RCU critical section.
 1251 *
 1252 * @bus_num: The bus number
 1253 * @devfn: The devfn, which is the combination of device and function number
 1254 * @is_write: The access is a write operation
 1255 * @entry: IOMMUTLBEntry that contains the addr to be translated and result
 1256 *
 1257 * Returns true if translation is successful, otherwise false.
 1258 */
1259static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
1260                                   uint8_t devfn, hwaddr addr, bool is_write,
1261                                   IOMMUTLBEntry *entry)
1262{
1263    IntelIOMMUState *s = vtd_as->iommu_state;
1264    VTDContextEntry ce;
1265    uint8_t bus_num = pci_bus_num(bus);
1266    VTDContextCacheEntry *cc_entry;
1267    uint64_t slpte, page_mask;
1268    uint32_t level;
1269    uint16_t source_id = vtd_make_source_id(bus_num, devfn);
1270    int ret_fr;
1271    bool is_fpd_set = false;
1272    bool reads = true;
1273    bool writes = true;
1274    uint8_t access_flags;
1275    VTDIOTLBEntry *iotlb_entry;
1276
1277    /*
 1278     * We have a standalone memory region for interrupt addresses; we
 1279     * should never receive translation requests in this region.
1280     */
1281    assert(!vtd_is_interrupt_addr(addr));
1282
1283    vtd_iommu_lock(s);
1284
1285    cc_entry = &vtd_as->context_cache_entry;
1286
 1287    /* Try to fetch the slpte from the IOTLB */
1288    iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
1289    if (iotlb_entry) {
1290        trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
1291                                 iotlb_entry->domain_id);
1292        slpte = iotlb_entry->slpte;
1293        access_flags = iotlb_entry->access_flags;
1294        page_mask = iotlb_entry->mask;
1295        goto out;
1296    }
1297
1298    /* Try to fetch context-entry from cache first */
1299    if (cc_entry->context_cache_gen == s->context_cache_gen) {
1300        trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi,
1301                               cc_entry->context_entry.lo,
1302                               cc_entry->context_cache_gen);
1303        ce = cc_entry->context_entry;
1304        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1305    } else {
1306        ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
1307        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1308        if (ret_fr) {
1309            ret_fr = -ret_fr;
1310            if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
1311                trace_vtd_fault_disabled();
1312            } else {
1313                vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
1314            }
1315            goto error;
1316        }
1317        /* Update context-cache */
1318        trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
1319                                  cc_entry->context_cache_gen,
1320                                  s->context_cache_gen);
1321        cc_entry->context_entry = ce;
1322        cc_entry->context_cache_gen = s->context_cache_gen;
1323    }
1324
1325    /*
1326     * We don't need to translate for pass-through context entries.
 1327     * Also, let's skip IOTLB caching for PT devices.
1328     */
1329    if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
1330        entry->iova = addr & VTD_PAGE_MASK_4K;
1331        entry->translated_addr = entry->iova;
1332        entry->addr_mask = ~VTD_PAGE_MASK_4K;
1333        entry->perm = IOMMU_RW;
1334        trace_vtd_translate_pt(source_id, entry->iova);
1335
 1336        /*
 1337         * When this happens, it means that caching-mode is not
 1338         * enabled and this is the first passthrough translation for
 1339         * the device. Let's enable the fast path for passthrough.
 1340         *
 1341         * When passthrough is disabled again for the device, we can
 1342         * catch it via the context entry invalidation, and then the
 1343         * IOMMU region can be swapped back in.
 1344         */
1345        vtd_pt_enable_fast_path(s, source_id);
1346        vtd_iommu_unlock(s);
1347        return true;
1348    }
1349
1350    ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
1351                               &reads, &writes, s->aw_bits);
1352    if (ret_fr) {
1353        ret_fr = -ret_fr;
1354        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
1355            trace_vtd_fault_disabled();
1356        } else {
1357            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);
1358        }
1359        goto error;
1360    }
1361
1362    page_mask = vtd_slpt_level_page_mask(level);
1363    access_flags = IOMMU_ACCESS_FLAG(reads, writes);
1364    vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte,
1365                     access_flags, level);
1366out:
1367    vtd_iommu_unlock(s);
1368    entry->iova = addr & page_mask;
1369    entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
1370    entry->addr_mask = ~page_mask;
1371    entry->perm = access_flags;
1372    return true;
1373
1374error:
1375    vtd_iommu_unlock(s);
1376    entry->iova = 0;
1377    entry->translated_addr = 0;
1378    entry->addr_mask = 0;
1379    entry->perm = IOMMU_NONE;
1380    return false;
1381}
1382
1383static void vtd_root_table_setup(IntelIOMMUState *s)
1384{
1385    s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
1386    s->root_extended = s->root & VTD_RTADDR_RTT;
1387    s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
1388
1389    trace_vtd_reg_dmar_root(s->root, s->root_extended);
1390}
1391
1392static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
1393                               uint32_t index, uint32_t mask)
1394{
1395    x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
1396}
1397
1398static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
1399{
1400    uint64_t value = 0;
1401    value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
1402    s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
1403    s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
1404    s->intr_eime = value & VTD_IRTA_EIME;
1405
1406    /* Notify global invalidation */
1407    vtd_iec_notify_all(s, true, 0, 0);
1408
1409    trace_vtd_reg_ir_root(s->intr_root, s->intr_size);
1410}
1411
1412static void vtd_iommu_replay_all(IntelIOMMUState *s)
1413{
1414    VTDAddressSpace *vtd_as;
1415
1416    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1417        vtd_sync_shadow_page_table(vtd_as);
1418    }
1419}
1420
1421static void vtd_context_global_invalidate(IntelIOMMUState *s)
1422{
1423    trace_vtd_inv_desc_cc_global();
1424    /* Protects context cache */
1425    vtd_iommu_lock(s);
1426    s->context_cache_gen++;
1427    if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
1428        vtd_reset_context_cache_locked(s);
1429    }
1430    vtd_iommu_unlock(s);
1431    vtd_address_space_refresh_all(s);
 1432    /*
 1433     * From VT-d spec 6.5.2.1, a global context entry invalidation
 1434     * should be followed by an IOTLB global invalidation, so we should
 1435     * be safe even without this. However, let's replay the region as
 1436     * well to be safer, and revisit this when we need finer tuning of
 1437     * the VT-d emulation code.
 1438     */
1439    vtd_iommu_replay_all(s);
1440}
1441
1442/* Do a context-cache device-selective invalidation.
1443 * @func_mask: FM field after shifting
1444 */
1445static void vtd_context_device_invalidate(IntelIOMMUState *s,
1446                                          uint16_t source_id,
1447                                          uint16_t func_mask)
1448{
1449    uint16_t mask;
1450    VTDBus *vtd_bus;
1451    VTDAddressSpace *vtd_as;
1452    uint8_t bus_n, devfn;
1453    uint16_t devfn_it;
1454
1455    trace_vtd_inv_desc_cc_devices(source_id, func_mask);
1456
1457    switch (func_mask & 3) {
1458    case 0:
1459        mask = 0;   /* No bits in the SID field masked */
1460        break;
1461    case 1:
1462        mask = 4;   /* Mask bit 2 in the SID field */
1463        break;
1464    case 2:
1465        mask = 6;   /* Mask bit 2:1 in the SID field */
1466        break;
1467    case 3:
1468        mask = 7;   /* Mask bit 2:0 in the SID field */
1469        break;
1470    }
1471    mask = ~mask;
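         /* mask now has 1s for the devfn bits that must match; the zeroed
          * bits (within the 3-bit function number) are wildcarded according
          * to the function mask. */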
1472
1473    bus_n = VTD_SID_TO_BUS(source_id);
1474    vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
1475    if (vtd_bus) {
1476        devfn = VTD_SID_TO_DEVFN(source_id);
1477        for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
1478            vtd_as = vtd_bus->dev_as[devfn_it];
1479            if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
1480                trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
1481                                             VTD_PCI_FUNC(devfn_it));
1482                vtd_iommu_lock(s);
1483                vtd_as->context_cache_entry.context_cache_gen = 0;
1484                vtd_iommu_unlock(s);
 1485                /*
 1486                 * Switch the address space when needed, in case the
 1487                 * device's passthrough bit has been toggled.
 1488                 */
 1489                vtd_switch_address_space(vtd_as);
 1490                /*
 1491                 * The device is moving out of (or into) a domain,
 1492                 * so resync the shadow page table.
 1493                 * This does no harm even if we have no such
 1494                 * notifier registered - the IOMMU notification
 1495                 * framework will skip MAP notifications in that
 1496                 * case.
 1497                 */
1498                vtd_sync_shadow_page_table(vtd_as);
1499            }
1500        }
1501    }
1502}
1503
1504/* Context-cache invalidation
1505 * Returns the Context Actual Invalidation Granularity.
1506 * @val: the content of the CCMD_REG
1507 */
1508static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val)
1509{
1510    uint64_t caig;
1511    uint64_t type = val & VTD_CCMD_CIRG_MASK;
1512
1513    switch (type) {
1514    case VTD_CCMD_DOMAIN_INVL:
1515        /* Fall through */
1516    case VTD_CCMD_GLOBAL_INVL:
1517        caig = VTD_CCMD_GLOBAL_INVL_A;
1518        vtd_context_global_invalidate(s);
1519        break;
1520
1521    case VTD_CCMD_DEVICE_INVL:
1522        caig = VTD_CCMD_DEVICE_INVL_A;
1523        vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val));
1524        break;
1525
1526    default:
1527        trace_vtd_err("Context cache invalidate type error.");
1528        caig = 0;
1529    }
1530    return caig;
1531}
1532
1533static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
1534{
1535    trace_vtd_inv_desc_iotlb_global();
1536    vtd_reset_iotlb(s);
1537    vtd_iommu_replay_all(s);
1538}
1539
1540static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
1541{
1542    VTDContextEntry ce;
1543    VTDAddressSpace *vtd_as;
1544
1545    trace_vtd_inv_desc_iotlb_domain(domain_id);
1546
1547    vtd_iommu_lock(s);
1548    g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
1549                                &domain_id);
1550    vtd_iommu_unlock(s);
1551
1552    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1553        if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1554                                      vtd_as->devfn, &ce) &&
1555            domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
1556            vtd_sync_shadow_page_table(vtd_as);
1557        }
1558    }
1559}
1560
1561static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
1562                                           uint16_t domain_id, hwaddr addr,
1563                                           uint8_t am)
1564{
1565    VTDAddressSpace *vtd_as;
1566    VTDContextEntry ce;
1567    int ret;
1568    hwaddr size = (1 << am) * VTD_PAGE_SIZE;
1569
1570    QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
1571        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1572                                       vtd_as->devfn, &ce);
1573        if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
1574            if (vtd_as_has_map_notifier(vtd_as)) {
1575                /*
1576                 * As long as we have MAP notifications registered in
1577                 * any of our IOMMU notifiers, we need to sync the
1578                 * shadow page table.
1579                 */
1580                vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
1581            } else {
1582                /*
1583                 * For UNMAP-only notifiers, we don't need to walk the
1584                 * page tables.  We just deliver the PSI down to
1585                 * invalidate caches.
1586                 */
1587                IOMMUTLBEntry entry = {
1588                    .target_as = &address_space_memory,
1589                    .iova = addr,
1590                    .translated_addr = 0,
1591                    .addr_mask = size - 1,
1592                    .perm = IOMMU_NONE,
1593                };
1594                memory_region_notify_iommu(&vtd_as->iommu, 0, entry);
1595            }
1596        }
1597    }
1598}
1599
1600static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
1601                                      hwaddr addr, uint8_t am)
1602{
1603    VTDIOTLBPageInvInfo info;
1604
1605    trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am);
1606
1607    assert(am <= VTD_MAMV);
1608    info.domain_id = domain_id;
1609    info.addr = addr;
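        /* Entries that differ only in the low am bits fall inside the range */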
1610    info.mask = ~((1 << am) - 1);
1611    vtd_iommu_lock(s);
1612    g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
1613    vtd_iommu_unlock(s);
1614    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
1615}
1616
1617/* Flush IOTLB
1618 * Returns the IOTLB Actual Invalidation Granularity.
1619 * @val: the content of the IOTLB_REG
1620 */
1621static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val)
1622{
1623    uint64_t iaig;
1624    uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK;
1625    uint16_t domain_id;
1626    hwaddr addr;
1627    uint8_t am;
1628
1629    switch (type) {
1630    case VTD_TLB_GLOBAL_FLUSH:
1631        iaig = VTD_TLB_GLOBAL_FLUSH_A;
1632        vtd_iotlb_global_invalidate(s);
1633        break;
1634
1635    case VTD_TLB_DSI_FLUSH:
1636        domain_id = VTD_TLB_DID(val);
1637        iaig = VTD_TLB_DSI_FLUSH_A;
1638        vtd_iotlb_domain_invalidate(s, domain_id);
1639        break;
1640
1641    case VTD_TLB_PSI_FLUSH:
1642        domain_id = VTD_TLB_DID(val);
1643        addr = vtd_get_quad_raw(s, DMAR_IVA_REG);
1644        am = VTD_IVA_AM(addr);
1645        addr = VTD_IVA_ADDR(addr);
1646        if (am > VTD_MAMV) {
1647            trace_vtd_err("IOTLB PSI flush: address mask overflow.");
1648            iaig = 0;
1649            break;
1650        }
1651        iaig = VTD_TLB_PSI_FLUSH_A;
1652        vtd_iotlb_page_invalidate(s, domain_id, addr, am);
1653        break;
1654
1655    default:
1656        trace_vtd_err("IOTLB flush: invalid granularity.");
1657        iaig = 0;
1658    }
1659    return iaig;
1660}
1661
1662static void vtd_fetch_inv_desc(IntelIOMMUState *s);
1663
1664static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s)
1665{
1666    return s->qi_enabled && (s->iq_tail == s->iq_head) &&
1667           (s->iq_last_desc_type == VTD_INV_DESC_WAIT);
1668}
1669
1670static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
1671{
1672    uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG);
1673
1674    trace_vtd_inv_qi_enable(en);
1675
1676    if (en) {
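            /* IQA_REG holds both the queue base address and the QS size field */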
1677        s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
1678        /* 2^(x+8) entries */
1679        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
1680        s->qi_enabled = true;
1681        trace_vtd_inv_qi_setup(s->iq, s->iq_size);
1682        /* Ok - report back to driver */
1683        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES);
1684
1685        if (s->iq_tail != 0) {
1686            /*
1687             * This is a spec violation, but Windows guests are known to set up
1688             * Queued Invalidation this way, so we allow the write and process
1689             * Invalidation Descriptors right away.
1690             */
1691            trace_vtd_warn_invalid_qi_tail(s->iq_tail);
1692            if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
1693                vtd_fetch_inv_desc(s);
1694            }
1695        }
1696    } else {
1697        if (vtd_queued_inv_disable_check(s)) {
1698            /* disable Queued Invalidation */
1699            vtd_set_quad_raw(s, DMAR_IQH_REG, 0);
1700            s->iq_head = 0;
1701            s->qi_enabled = false;
1702            /* Ok - report back to driver */
1703            vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0);
1704        } else {
1705            trace_vtd_err_qi_disable(s->iq_head, s->iq_tail, s->iq_last_desc_type);
1706        }
1707    }
1708}
1709
1710/* Set Root Table Pointer */
1711static void vtd_handle_gcmd_srtp(IntelIOMMUState *s)
1712{
1713    vtd_root_table_setup(s);
1714    /* Ok - report back to driver */
1715    vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
1716    vtd_reset_caches(s);
1717    vtd_address_space_refresh_all(s);
1718}
1719
1720/* Set Interrupt Remap Table Pointer */
1721static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
1722{
1723    vtd_interrupt_remap_table_setup(s);
1724    /* Ok - report back to driver */
1725    vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
1726}
1727
1728/* Handle Translation Enable/Disable */
1729static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
1730{
1731    if (s->dmar_enabled == en) {
1732        return;
1733    }
1734
1735    trace_vtd_dmar_enable(en);
1736
1737    if (en) {
1738        s->dmar_enabled = true;
1739        /* Ok - report back to driver */
1740        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
1741    } else {
1742        s->dmar_enabled = false;
1743
1744        /* Clear the index of Fault Recording Register */
1745        s->next_frcd_reg = 0;
1746        /* Ok - report back to driver */
1747        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
1748    }
1749
1750    vtd_reset_caches(s);
1751    vtd_address_space_refresh_all(s);
1752}
1753
1754/* Handle Interrupt Remap Enable/Disable */
1755static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en)
1756{
1757    trace_vtd_ir_enable(en);
1758
1759    if (en) {
1760        s->intr_enabled = true;
1761        /* Ok - report back to driver */
1762        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES);
1763    } else {
1764        s->intr_enabled = false;
1765        /* Ok - report back to driver */
1766        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0);
1767    }
1768}
1769
1770/* Handle write to Global Command Register */
1771static void vtd_handle_gcmd_write(IntelIOMMUState *s)
1772{
1773    uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG);
1774    uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG);
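        /* Bits that differ between GSTS and the new GCMD value need handling */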
1775    uint32_t changed = status ^ val;
1776
1777    trace_vtd_reg_write_gcmd(status, val);
1778    if (changed & VTD_GCMD_TE) {
1779        /* Translation enable/disable */
1780        vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
1781    }
1782    if (val & VTD_GCMD_SRTP) {
1783        /* Set/update the root-table pointer */
1784        vtd_handle_gcmd_srtp(s);
1785    }
1786    if (changed & VTD_GCMD_QIE) {
1787        /* Queued Invalidation Enable */
1788        vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE);
1789    }
1790    if (val & VTD_GCMD_SIRTP) {
1791        /* Set/update the interrupt remapping root-table pointer */
1792        vtd_handle_gcmd_sirtp(s);
1793    }
1794    if (changed & VTD_GCMD_IRE) {
1795        /* Interrupt remap enable/disable */
1796        vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE);
1797    }
1798}
1799
1800/* Handle write to Context Command Register */
1801static void vtd_handle_ccmd_write(IntelIOMMUState *s)
1802{
1803    uint64_t ret;
1804    uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG);
1805
1806    /* Context-cache invalidation request */
1807    if (val & VTD_CCMD_ICC) {
1808        if (s->qi_enabled) {
1809            trace_vtd_err("Queued Invalidation enabled, "
1810                          "should not use register-based invalidation");
1811            return;
1812        }
1813        ret = vtd_context_cache_invalidate(s, val);
1814        /* Invalidation completed: clear ICC and report the actual granularity */
1815        vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL);
1816        ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK,
1817                                      ret);
1818    }
1819}
1820
1821/* Handle write to IOTLB Invalidation Register */
1822static void vtd_handle_iotlb_write(IntelIOMMUState *s)
1823{
1824    uint64_t ret;
1825    uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG);
1826
1827    /* IOTLB invalidation request */
1828    if (val & VTD_TLB_IVT) {
1829        if (s->qi_enabled) {
1830            trace_vtd_err("Queued Invalidation enabled, "
1831                          "should not use register-based invalidation.");
1832            return;
1833        }
1834        ret = vtd_iotlb_flush(s, val);
1835        /* Invalidation completed: clear IVT and report the actual granularity */
1836        vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL);
1837        ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG,
1838                                      VTD_TLB_FLUSH_GRANU_MASK_A, ret);
1839    }
1840}
1841
1842/* Fetch an Invalidation Descriptor from the Invalidation Queue */
1843static bool vtd_get_inv_desc(dma_addr_t base_addr, uint32_t offset,
1844                             VTDInvDesc *inv_desc)
1845{
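        /* Each invalidation descriptor is 128 bits; offset counts descriptors */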
1846    dma_addr_t addr = base_addr + offset * sizeof(*inv_desc);
1847    if (dma_memory_read(&address_space_memory, addr, inv_desc,
1848        sizeof(*inv_desc))) {
1849        trace_vtd_err("Read INV DESC failed.");
1850        inv_desc->lo = 0;
1851        inv_desc->hi = 0;
1852        return false;
1853    }
1854    inv_desc->lo = le64_to_cpu(inv_desc->lo);
1855    inv_desc->hi = le64_to_cpu(inv_desc->hi);
1856    return true;
1857}
1858
1859static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
1860{
1861    if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
1862        (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
1863        trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo);
1864        return false;
1865    }
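        /*
         * A wait descriptor either writes a status word back to memory
         * (Status Write) or raises an invalidation completion interrupt.
         */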
1866    if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
1867        /* Status Write */
1868        uint32_t status_data = (uint32_t)(inv_desc->lo >>
1869                               VTD_INV_DESC_WAIT_DATA_SHIFT);
1870
1871        assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));
1872
1873        /* FIXME: need to be masked with HAW? */
1874        dma_addr_t status_addr = inv_desc->hi;
1875        trace_vtd_inv_desc_wait_sw(status_addr, status_data);
1876        status_data = cpu_to_le32(status_data);
1877        if (dma_memory_write(&address_space_memory, status_addr, &status_data,
1878                             sizeof(status_data))) {
1879            trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo);
1880            return false;
1881        }
1882    } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
1883        /* Interrupt flag */
1884        vtd_generate_completion_event(s);
1885    } else {
1886        trace_vtd_inv_desc_wait_invalid(inv_desc->hi, inv_desc->lo);
1887        return false;
1888    }
1889    return true;
1890}
1891
1892static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
1893                                           VTDInvDesc *inv_desc)
1894{
1895    uint16_t sid, fmask;
1896
1897    if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) {
1898        trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo);
1899        return false;
1900    }
1901    switch (inv_desc->lo & VTD_INV_DESC_CC_G) {
1902    case VTD_INV_DESC_CC_DOMAIN:
1903        trace_vtd_inv_desc_cc_domain(
1904            (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo));
1905        /* Fall through */
1906    case VTD_INV_DESC_CC_GLOBAL:
1907        vtd_context_global_invalidate(s);
1908        break;
1909
1910    case VTD_INV_DESC_CC_DEVICE:
1911        sid = VTD_INV_DESC_CC_SID(inv_desc->lo);
1912        fmask = VTD_INV_DESC_CC_FM(inv_desc->lo);
1913        vtd_context_device_invalidate(s, sid, fmask);
1914        break;
1915
1916    default:
1917        trace_vtd_inv_desc_cc_invalid(inv_desc->hi, inv_desc->lo);
1918        return false;
1919    }
1920    return true;
1921}
1922
1923static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
1924{
1925    uint16_t domain_id;
1926    uint8_t am;
1927    hwaddr addr;
1928
1929    if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) ||
1930        (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) {
1931        trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo);
1932        return false;
1933    }
1934
1935    switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) {
1936    case VTD_INV_DESC_IOTLB_GLOBAL:
1937        vtd_iotlb_global_invalidate(s);
1938        break;
1939
1940    case VTD_INV_DESC_IOTLB_DOMAIN:
1941        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
1942        vtd_iotlb_domain_invalidate(s, domain_id);
1943        break;
1944
1945    case VTD_INV_DESC_IOTLB_PAGE:
1946        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
1947        addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi);
1948        am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi);
1949        if (am > VTD_MAMV) {
1950            trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo);
1951            return false;
1952        }
1953        vtd_iotlb_page_invalidate(s, domain_id, addr, am);
1954        break;
1955
1956    default:
1957        trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo);
1958        return false;
1959    }
1960    return true;
1961}
1962
1963static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
1964                                     VTDInvDesc *inv_desc)
1965{
1966    trace_vtd_inv_desc_iec(inv_desc->iec.granularity,
1967                           inv_desc->iec.index,
1968                           inv_desc->iec.index_mask);
1969
1970    vtd_iec_notify_all(s, !inv_desc->iec.granularity,
1971                       inv_desc->iec.index,
1972                       inv_desc->iec.index_mask);
1973    return true;
1974}
1975
1976static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
1977                                          VTDInvDesc *inv_desc)
1978{
1979    VTDAddressSpace *vtd_dev_as;
1980    IOMMUTLBEntry entry;
1981    struct VTDBus *vtd_bus;
1982    hwaddr addr;
1983    uint64_t sz;
1984    uint16_t sid;
1985    uint8_t devfn;
1986    bool size;
1987    uint8_t bus_num;
1988
1989    addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi);
1990    sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo);
1991    devfn = sid & 0xff;
1992    bus_num = sid >> 8;
1993    size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi);
1994
1995    if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
1996        (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) {
1997        trace_vtd_inv_desc_iotlb_invalid(inv_desc->hi, inv_desc->lo);
1998        return false;
1999    }
2000
2001    vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
2002    if (!vtd_bus) {
2003        goto done;
2004    }
2005
2006    vtd_dev_as = vtd_bus->dev_as[devfn];
2007    if (!vtd_dev_as) {
2008        goto done;
2009    }
2010
2011    /* According to ATS spec table 2.4:
2012     * S = 0, bits 15:12 = xxxx     range size: 4K
2013     * S = 1, bits 15:12 = xxx0     range size: 8K
2014     * S = 1, bits 15:12 = xx01     range size: 16K
2015     * S = 1, bits 15:12 = x011     range size: 32K
2016     * S = 1, bits 15:12 = 0111     range size: 64K
2017     * ...
2018     */
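        /* With S set, the trailing 1s of the page-shifted address encode the size */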
2019    if (size) {
2020        sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
2021        addr &= ~(sz - 1);
2022    } else {
2023        sz = VTD_PAGE_SIZE;
2024    }
2025
2026    entry.target_as = &vtd_dev_as->as;
2027    entry.addr_mask = sz - 1;
2028    entry.iova = addr;
2029    entry.perm = IOMMU_NONE;
2030    entry.translated_addr = 0;
2031    memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry);
2032
2033done:
2034    return true;
2035}
2036
2037static bool vtd_process_inv_desc(IntelIOMMUState *s)
2038{
2039    VTDInvDesc inv_desc;
2040    uint8_t desc_type;
2041
2042    trace_vtd_inv_qi_head(s->iq_head);
2043    if (!vtd_get_inv_desc(s->iq, s->iq_head, &inv_desc)) {
2044        s->iq_last_desc_type = VTD_INV_DESC_NONE;
2045        return false;
2046    }
2047    desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
2048    /* FIXME: should iq_last_desc_type be updated before or after processing? */
2049    s->iq_last_desc_type = desc_type;
2050
2051    switch (desc_type) {
2052    case VTD_INV_DESC_CC:
2053        trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo);
2054        if (!vtd_process_context_cache_desc(s, &inv_desc)) {
2055            return false;
2056        }
2057        break;
2058
2059    case VTD_INV_DESC_IOTLB:
2060        trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo);
2061        if (!vtd_process_iotlb_desc(s, &inv_desc)) {
2062            return false;
2063        }
2064        break;
2065
2066    case VTD_INV_DESC_WAIT:
2067        trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
2068        if (!vtd_process_wait_desc(s, &inv_desc)) {
2069            return false;
2070        }
2071        break;
2072
2073    case VTD_INV_DESC_IEC:
2074        trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo);
2075        if (!vtd_process_inv_iec_desc(s, &inv_desc)) {
2076            return false;
2077        }
2078        break;
2079
2080    case VTD_INV_DESC_DEVICE:
2081        trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
2082        if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
2083            return false;
2084        }
2085        break;
2086
2087    default:
2088        trace_vtd_inv_desc_invalid(inv_desc.hi, inv_desc.lo);
2089        return false;
2090    }
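        /* Advance the head pointer, wrapping around at the end of the queue */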
2091    s->iq_head++;
2092    if (s->iq_head == s->iq_size) {
2093        s->iq_head = 0;
2094    }
2095    return true;
2096}
2097
2098/* Try to fetch and process more Invalidation Descriptors */
2099static void vtd_fetch_inv_desc(IntelIOMMUState *s)
2100{
2101    trace_vtd_inv_qi_fetch();
2102
2103    if (s->iq_tail >= s->iq_size) {
2104        /* Detect an invalid Tail pointer */
2105        trace_vtd_err_qi_tail(s->iq_tail, s->iq_size);
2106        vtd_handle_inv_queue_error(s);
2107        return;
2108    }
2109    while (s->iq_head != s->iq_tail) {
2110        if (!vtd_process_inv_desc(s)) {
2111            /* Invalidation Queue Errors */
2112            vtd_handle_inv_queue_error(s);
2113            break;
2114        }
2115        /* Must update the IQH_REG in time */
2116        vtd_set_quad_raw(s, DMAR_IQH_REG,
2117                         (((uint64_t)(s->iq_head)) << VTD_IQH_QH_SHIFT) &
2118                         VTD_IQH_QH_MASK);
2119    }
2120}
2121
2122/* Handle write to Invalidation Queue Tail Register */
2123static void vtd_handle_iqt_write(IntelIOMMUState *s)
2124{
2125    uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
2126
2127    s->iq_tail = VTD_IQT_QT(val);
2128    trace_vtd_inv_qi_tail(s->iq_tail);
2129
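        /* Process the queue only if QI is enabled and no IQ error is pending */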
2130    if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2131        /* Process Invalidation Queue here */
2132        vtd_fetch_inv_desc(s);
2133    }
2134}
2135
2136static void vtd_handle_fsts_write(IntelIOMMUState *s)
2137{
2138    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
2139    uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2140    uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE;
2141
2142    if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) {
2143        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2144        trace_vtd_fsts_clear_ip();
2145    }
2146    /* FIXME: when IQE is Clear, should we try to fetch some Invalidation
2147     * Descriptors if there are any when Queued Invalidation is enabled?
2148     */
2149}
2150
2151static void vtd_handle_fectl_write(IntelIOMMUState *s)
2152{
2153    uint32_t fectl_reg;
2154    /* FIXME: when software clears the IM field, check the IP field. But do we
2155     * need to compare the old value and the new value to conclude that
2156     * software clears the IM field? Or just check if the IM field is zero?
2157     */
2158    fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2159
2160    trace_vtd_reg_write_fectl(fectl_reg);
2161
2162    if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) {
2163        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
2164        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2165    }
2166}
2167
2168static void vtd_handle_ics_write(IntelIOMMUState *s)
2169{
2170    uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG);
2171    uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2172
2173    if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) {
2174        trace_vtd_reg_ics_clear_ip();
2175        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2176    }
2177}
2178
2179static void vtd_handle_iectl_write(IntelIOMMUState *s)
2180{
2181    uint32_t iectl_reg;
2182    /* FIXME: when software clears the IM field, check the IP field. But do we
2183     * need to compare the old value and the new value to conclude that
2184     * software clears the IM field? Or just check if the IM field is zero?
2185     */
2186    iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2187
2188    trace_vtd_reg_write_iectl(iectl_reg);
2189
2190    if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) {
2191        vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
2192        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2193    }
2194}
2195
2196static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
2197{
2198    IntelIOMMUState *s = opaque;
2199    uint64_t val;
2200
2201    trace_vtd_reg_read(addr, size);
2202
2203    if (addr + size > DMAR_REG_SIZE) {
2204        trace_vtd_err("Read MMIO over range.");
2205        return (uint64_t)-1;
2206    }
2207
2208    switch (addr) {
2209    /* Root Table Address Register, 64-bit */
2210    case DMAR_RTADDR_REG:
2211        if (size == 4) {
2212            val = s->root & ((1ULL << 32) - 1);
2213        } else {
2214            val = s->root;
2215        }
2216        break;
2217
2218    case DMAR_RTADDR_REG_HI:
2219        assert(size == 4);
2220        val = s->root >> 32;
2221        break;
2222
2223    /* Invalidation Queue Address Register, 64-bit */
2224    case DMAR_IQA_REG:
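            /* Return the queue base together with the software-written QS field */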
2225        val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS);
2226        if (size == 4) {
2227            val = val & ((1ULL << 32) - 1);
2228        }
2229        break;
2230
2231    case DMAR_IQA_REG_HI:
2232        assert(size == 4);
2233        val = s->iq >> 32;
2234        break;
2235
2236    default:
2237        if (size == 4) {
2238            val = vtd_get_long(s, addr);
2239        } else {
2240            val = vtd_get_quad(s, addr);
2241        }
2242    }
2243
2244    return val;
2245}
2246
2247static void vtd_mem_write(void *opaque, hwaddr addr,
2248                          uint64_t val, unsigned size)
2249{
2250    IntelIOMMUState *s = opaque;
2251
2252    trace_vtd_reg_write(addr, size, val);
2253
2254    if (addr + size > DMAR_REG_SIZE) {
2255        trace_vtd_err("Write MMIO over range.");
2256        return;
2257    }
2258
2259    switch (addr) {
2260    /* Global Command Register, 32-bit */
2261    case DMAR_GCMD_REG:
2262        vtd_set_long(s, addr, val);
2263        vtd_handle_gcmd_write(s);
2264        break;
2265
2266    /* Context Command Register, 64-bit */
2267    case DMAR_CCMD_REG:
2268        if (size == 4) {
2269            vtd_set_long(s, addr, val);
2270        } else {
2271            vtd_set_quad(s, addr, val);
2272            vtd_handle_ccmd_write(s);
2273        }
2274        break;
2275
2276    case DMAR_CCMD_REG_HI:
2277        assert(size == 4);
2278        vtd_set_long(s, addr, val);
2279        vtd_handle_ccmd_write(s);
2280        break;
2281
2282    /* IOTLB Invalidation Register, 64-bit */
2283    case DMAR_IOTLB_REG:
2284        if (size == 4) {
2285            vtd_set_long(s, addr, val);
2286        } else {
2287            vtd_set_quad(s, addr, val);
2288            vtd_handle_iotlb_write(s);
2289        }
2290        break;
2291
2292    case DMAR_IOTLB_REG_HI:
2293        assert(size == 4);
2294        vtd_set_long(s, addr, val);
2295        vtd_handle_iotlb_write(s);
2296        break;
2297
2298    /* Invalidate Address Register, 64-bit */
2299    case DMAR_IVA_REG:
2300        if (size == 4) {
2301            vtd_set_long(s, addr, val);
2302        } else {
2303            vtd_set_quad(s, addr, val);
2304        }
2305        break;
2306
2307    case DMAR_IVA_REG_HI:
2308        assert(size == 4);
2309        vtd_set_long(s, addr, val);
2310        break;
2311
2312    /* Fault Status Register, 32-bit */
2313    case DMAR_FSTS_REG:
2314        assert(size == 4);
2315        vtd_set_long(s, addr, val);
2316        vtd_handle_fsts_write(s);
2317        break;
2318
2319    /* Fault Event Control Register, 32-bit */
2320    case DMAR_FECTL_REG:
2321        assert(size == 4);
2322        vtd_set_long(s, addr, val);
2323        vtd_handle_fectl_write(s);
2324        break;
2325
2326    /* Fault Event Data Register, 32-bit */
2327    case DMAR_FEDATA_REG:
2328        assert(size == 4);
2329        vtd_set_long(s, addr, val);
2330        break;
2331
2332    /* Fault Event Address Register, 32-bit */
2333    case DMAR_FEADDR_REG:
2334        if (size == 4) {
2335            vtd_set_long(s, addr, val);
2336        } else {
2337            /*
2338             * While the register is 32-bit only, some guests (Xen...) write to
2339             * it with a 64-bit access.
2340             */
2341            vtd_set_quad(s, addr, val);
2342        }
2343        break;
2344
2345    /* Fault Event Upper Address Register, 32-bit */
2346    case DMAR_FEUADDR_REG:
2347        assert(size == 4);
2348        vtd_set_long(s, addr, val);
2349        break;
2350
2351    /* Protected Memory Enable Register, 32-bit */
2352    case DMAR_PMEN_REG:
2353        assert(size == 4);
2354        vtd_set_long(s, addr, val);
2355        break;
2356
2357    /* Root Table Address Register, 64-bit */
2358    case DMAR_RTADDR_REG:
2359        if (size == 4) {
2360            vtd_set_long(s, addr, val);
2361        } else {
2362            vtd_set_quad(s, addr, val);
2363        }
2364        break;
2365
2366    case DMAR_RTADDR_REG_HI:
2367        assert(size == 4);
2368        vtd_set_long(s, addr, val);
2369        break;
2370
2371    /* Invalidation Queue Tail Register, 64-bit */
2372    case DMAR_IQT_REG:
2373        if (size == 4) {
2374            vtd_set_long(s, addr, val);
2375        } else {
2376            vtd_set_quad(s, addr, val);
2377        }
2378        vtd_handle_iqt_write(s);
2379        break;
2380
2381    case DMAR_IQT_REG_HI:
2382        assert(size == 4);
2383        vtd_set_long(s, addr, val);
2384        /* Bits 63:19 of IQT_REG are RsvdZ, do nothing here */
2385        break;
2386
2387    /* Invalidation Queue Address Register, 64-bit */
2388    case DMAR_IQA_REG:
2389        if (size == 4) {
2390            vtd_set_long(s, addr, val);
2391        } else {
2392            vtd_set_quad(s, addr, val);
2393        }
2394        break;
2395
2396    case DMAR_IQA_REG_HI:
2397        assert(size == 4);
2398        vtd_set_long(s, addr, val);
2399        break;
2400
2401    /* Invalidation Completion Status Register, 32-bit */
2402    case DMAR_ICS_REG:
2403        assert(size == 4);
2404        vtd_set_long(s, addr, val);
2405        vtd_handle_ics_write(s);
2406        break;
2407
2408    /* Invalidation Event Control Register, 32-bit */
2409    case DMAR_IECTL_REG:
2410        assert(size == 4);
2411        vtd_set_long(s, addr, val);
2412        vtd_handle_iectl_write(s);
2413        break;
2414
2415    /* Invalidation Event Data Register, 32-bit */
2416    case DMAR_IEDATA_REG:
2417        assert(size == 4);
2418        vtd_set_long(s, addr, val);
2419        break;
2420
2421    /* Invalidation Event Address Register, 32-bit */
2422    case DMAR_IEADDR_REG:
2423        assert(size == 4);
2424        vtd_set_long(s, addr, val);
2425        break;
2426
2427    /* Invalidation Event Upper Address Register, 32-bit */
2428    case DMAR_IEUADDR_REG:
2429        assert(size == 4);
2430        vtd_set_long(s, addr, val);
2431        break;
2432
2433    /* Fault Recording Registers, 128-bit */
2434    case DMAR_FRCD_REG_0_0:
2435        if (size == 4) {
2436            vtd_set_long(s, addr, val);
2437        } else {
2438            vtd_set_quad(s, addr, val);
2439        }
2440        break;
2441
2442    case DMAR_FRCD_REG_0_1:
2443        assert(size == 4);
2444        vtd_set_long(s, addr, val);
2445        break;
2446
2447    case DMAR_FRCD_REG_0_2:
2448        if (size == 4) {
2449            vtd_set_long(s, addr, val);
2450        } else {
2451            vtd_set_quad(s, addr, val);
2452            /* May clear bit 127 (Fault), update PPF */
2453            vtd_update_fsts_ppf(s);
2454        }
2455        break;
2456
2457    case DMAR_FRCD_REG_0_3:
2458        assert(size == 4);
2459        vtd_set_long(s, addr, val);
2460        /* May clear bit 127 (Fault), update PPF */
2461        vtd_update_fsts_ppf(s);
2462        break;
2463
2464    case DMAR_IRTA_REG:
2465        if (size == 4) {
2466            vtd_set_long(s, addr, val);
2467        } else {
2468            vtd_set_quad(s, addr, val);
2469        }
2470        break;
2471
2472    case DMAR_IRTA_REG_HI:
2473        assert(size == 4);
2474        vtd_set_long(s, addr, val);
2475        break;
2476
2477    default:
2478        if (size == 4) {
2479            vtd_set_long(s, addr, val);
2480        } else {
2481            vtd_set_quad(s, addr, val);
2482        }
2483    }
2484}
2485
2486static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
2487                                         IOMMUAccessFlags flag, int iommu_idx)
2488{
2489    VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
2490    IntelIOMMUState *s = vtd_as->iommu_state;
2491    IOMMUTLBEntry iotlb = {
2492        /* We'll fill in the rest later. */
2493        .target_as = &address_space_memory,
2494    };
2495    bool success;
2496
2497    if (likely(s->dmar_enabled)) {
2498        success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
2499                                         addr, flag & IOMMU_WO, &iotlb);
2500    } else {
2501        /* DMAR disabled: pass addresses through unchanged, using 4K pages */
2502        iotlb.iova = addr & VTD_PAGE_MASK_4K;
2503        iotlb.translated_addr = addr & VTD_PAGE_MASK_4K;
2504        iotlb.addr_mask = ~VTD_PAGE_MASK_4K;
2505        iotlb.perm = IOMMU_RW;
2506        success = true;
2507    }
2508
2509    if (likely(success)) {
2510        trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus),
2511                                 VTD_PCI_SLOT(vtd_as->devfn),
2512                                 VTD_PCI_FUNC(vtd_as->devfn),
2513                                 iotlb.iova, iotlb.translated_addr,
2514                                 iotlb.addr_mask);
2515    } else {
2516        trace_vtd_err_dmar_translate(pci_bus_num(vtd_as->bus),
2517                                     VTD_PCI_SLOT(vtd_as->devfn),
2518                                     VTD_PCI_FUNC(vtd_as->devfn),
2519                                     iotlb.iova);
2520    }
2521
2522    return iotlb;
2523}
2524
2525static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
2526                                          IOMMUNotifierFlag old,
2527                                          IOMMUNotifierFlag new)
2528{
2529    VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
2530    IntelIOMMUState *s = vtd_as->iommu_state;
2531
2532    if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) {
2533        error_report("We need to set caching-mode=1 for intel-iommu to enable "
2534                     "device assignment with IOMMU protection.");
2535        exit(1);
2536    }
2537
2538    /* Update per-address-space notifier flags */
2539    vtd_as->notifier_flags = new;
2540
2541    if (old == IOMMU_NOTIFIER_NONE) {
2542        QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
2543    } else if (new == IOMMU_NOTIFIER_NONE) {
2544        QLIST_REMOVE(vtd_as, next);
2545    }
2546}
2547
2548static int vtd_post_load(void *opaque, int version_id)
2549{
2550    IntelIOMMUState *iommu = opaque;
2551
2552    /*
2553     * Memory regions are dynamically turned on/off depending on
2554     * context entry configurations from the guest. After migration,
2555     * we need to make sure the memory regions are still correct.
2556     */
2557    vtd_switch_address_space_all(iommu);
2558
2559    return 0;
2560}
2561
2562static const VMStateDescription vtd_vmstate = {
2563    .name = "iommu-intel",
2564    .version_id = 1,
2565    .minimum_version_id = 1,
2566    .priority = MIG_PRI_IOMMU,
2567    .post_load = vtd_post_load,
2568    .fields = (VMStateField[]) {
2569        VMSTATE_UINT64(root, IntelIOMMUState),
2570        VMSTATE_UINT64(intr_root, IntelIOMMUState),
2571        VMSTATE_UINT64(iq, IntelIOMMUState),
2572        VMSTATE_UINT32(intr_size, IntelIOMMUState),
2573        VMSTATE_UINT16(iq_head, IntelIOMMUState),
2574        VMSTATE_UINT16(iq_tail, IntelIOMMUState),
2575        VMSTATE_UINT16(iq_size, IntelIOMMUState),
2576        VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState),
2577        VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE),
2578        VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState),
2579        VMSTATE_BOOL(root_extended, IntelIOMMUState),
2580        VMSTATE_BOOL(dmar_enabled, IntelIOMMUState),
2581        VMSTATE_BOOL(qi_enabled, IntelIOMMUState),
2582        VMSTATE_BOOL(intr_enabled, IntelIOMMUState),
2583        VMSTATE_BOOL(intr_eime, IntelIOMMUState),
2584        VMSTATE_END_OF_LIST()
2585    }
2586};
2587
2588static const MemoryRegionOps vtd_mem_ops = {
2589    .read = vtd_mem_read,
2590    .write = vtd_mem_write,
2591    .endianness = DEVICE_LITTLE_ENDIAN,
2592    .impl = {
2593        .min_access_size = 4,
2594        .max_access_size = 8,
2595    },
2596    .valid = {
2597        .min_access_size = 4,
2598        .max_access_size = 8,
2599    },
2600};
2601
2602static Property vtd_properties[] = {
2603    DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
2604    DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
2605                            ON_OFF_AUTO_AUTO),
2606    DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
2607    DEFINE_PROP_UINT8("x-aw-bits", IntelIOMMUState, aw_bits,
2608                      VTD_HOST_ADDRESS_WIDTH),
2609    DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
2610    DEFINE_PROP_END_OF_LIST(),
2611};
2612
2613/* Read IRTE entry with specific index */
2614static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
2615                        VTD_IR_TableEntry *entry, uint16_t sid)
2616{
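        /* Per-SQ masks: which source-id bits take part in the SVT_ALL check */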
2617    static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \
2618        {0xffff, 0xfffb, 0xfff9, 0xfff8};
2619    dma_addr_t addr = 0x00;
2620    uint16_t mask, source_id;
2621    uint8_t bus, bus_max, bus_min;
2622
2623    addr = iommu->intr_root + index * sizeof(*entry);
2624    if (dma_memory_read(&address_space_memory, addr, entry,
2625                        sizeof(*entry))) {
2626        trace_vtd_err("Memory read failed for IRTE.");
2627        return -VTD_FR_IR_ROOT_INVAL;
2628    }
2629
2630    trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]),
2631                          le64_to_cpu(entry->data[0]));
2632
2633    if (!entry->irte.present) {
2634        trace_vtd_err_irte(index, le64_to_cpu(entry->data[1]),
2635                           le64_to_cpu(entry->data[0]));
2636        return -VTD_FR_IR_ENTRY_P;
2637    }
2638
2639    if (entry->irte.__reserved_0 || entry->irte.__reserved_1 ||
2640        entry->irte.__reserved_2) {
2641        trace_vtd_err_irte(index, le64_to_cpu(entry->data[1]),
2642                           le64_to_cpu(entry->data[0]));
2643        return -VTD_FR_IR_IRTE_RSVD;
2644    }
2645
2646    if (sid != X86_IOMMU_SID_INVALID) {
2647        /* Validate IRTE SID */
2648        source_id = le32_to_cpu(entry->irte.source_id);
2649        switch (entry->irte.sid_vtype) {
2650        case VTD_SVT_NONE:
2651            break;
2652
2653        case VTD_SVT_ALL:
2654            mask = vtd_svt_mask[entry->irte.sid_q];
2655            if ((source_id & mask) != (sid & mask)) {
2656                trace_vtd_err_irte_sid(index, sid, source_id);
2657                return -VTD_FR_IR_SID_ERR;
2658            }
2659            break;
2660
2661        case VTD_SVT_BUS:
2662            bus_max = source_id >> 8;
2663            bus_min = source_id & 0xff;
2664            bus = sid >> 8;
2665            if (bus > bus_max || bus < bus_min) {
2666                trace_vtd_err_irte_sid_bus(index, bus, bus_min, bus_max);
2667                return -VTD_FR_IR_SID_ERR;
2668            }
2669            break;
2670
2671        default:
2672            trace_vtd_err_irte_svt(index, entry->irte.sid_vtype);
2673            /* Take this as verification failure. */
2674            return -VTD_FR_IR_SID_ERR;
2675            break;
2676        }
2677    }
2678
2679    return 0;
2680}
2681
2682/* Fetch IRQ information of specific IR index */
2683static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
2684                             VTDIrq *irq, uint16_t sid)
2685{
2686    VTD_IR_TableEntry irte = {};
2687    int ret = 0;
2688
2689    ret = vtd_irte_get(iommu, index, &irte, sid);
2690    if (ret) {
2691        return ret;
2692    }
2693
2694    irq->trigger_mode = irte.irte.trigger_mode;
2695    irq->vector = irte.irte.vector;
2696    irq->delivery_mode = irte.irte.delivery_mode;
2697    irq->dest = le32_to_cpu(irte.irte.dest_id);
2698    if (!iommu->intr_eime) {
2699#define  VTD_IR_APIC_DEST_MASK         (0xff00ULL)
2700#define  VTD_IR_APIC_DEST_SHIFT        (8)
2701        irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >>
2702            VTD_IR_APIC_DEST_SHIFT;
2703    }
2704    irq->dest_mode = irte.irte.dest_mode;
2705    irq->redir_hint = irte.irte.redir_hint;
2706
2707    trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector,
2708                       irq->delivery_mode, irq->dest, irq->dest_mode);
2709
2710    return 0;
2711}
2712
2713/* Generate one MSI message from VTDIrq info */
2714static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out)
2715{
2716    VTD_MSIMessage msg = {};
2717
2718    /* Generate address bits */
2719    msg.dest_mode = irq->dest_mode;
2720    msg.redir_hint = irq->redir_hint;
2721    msg.dest = irq->dest;
2722    msg.__addr_hi = irq->dest & 0xffffff00;
2723    msg.__addr_head = cpu_to_le32(0xfee);
2724    /* Keep this from original MSI address bits */
2725    msg.__not_used = irq->msi_addr_last_bits;
2726
2727    /* Generate data bits */
2728    msg.vector = irq->vector;
2729    msg.delivery_mode = irq->delivery_mode;
2730    msg.level = 1;
2731    msg.trigger_mode = irq->trigger_mode;
2732
2733    msg_out->address = msg.msi_addr;
2734    msg_out->data = msg.msi_data;
2735}
2736
2737/* Interrupt remapping for MSI/MSI-X entry */
2738static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
2739                                   MSIMessage *origin,
2740                                   MSIMessage *translated,
2741                                   uint16_t sid)
2742{
2743    int ret = 0;
2744    VTD_IR_MSIAddress addr;
2745    uint16_t index;
2746    VTDIrq irq = {};
2747
2748    assert(origin && translated);
2749
2750    trace_vtd_ir_remap_msi_req(origin->address, origin->data);
2751
2752    if (!iommu || !iommu->intr_enabled) {
2753        memcpy(translated, origin, sizeof(*origin));
2754        goto out;
2755    }
2756
2757    if (origin->address & VTD_MSI_ADDR_HI_MASK) {
2758        trace_vtd_err("MSI address high 32 bits non-zero when "
2759                      "Interrupt Remapping enabled.");
2760        return -VTD_FR_IR_REQ_RSVD;
2761    }
2762
2763    addr.data = origin->address & VTD_MSI_ADDR_LO_MASK;
2764    if (addr.addr.__head != 0xfee) {
2765        trace_vtd_err("MSI addr low 32 bit invalid.");
2766        return -VTD_FR_IR_REQ_RSVD;
2767    }
2768
2769    /* This is compatible mode. */
2770    if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
2771        memcpy(translated, origin, sizeof(*origin));
2772        goto out;
2773    }
2774
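        /* The IRTE index is split across the MSI address (index_h:index_l) */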
2775    index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l);
2776
2777#define  VTD_IR_MSI_DATA_SUBHANDLE       (0x0000ffff)
2778#define  VTD_IR_MSI_DATA_RESERVED        (0xffff0000)
2779
2780    if (addr.addr.sub_valid) {
2781        /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */
2782        index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE;
2783    }
2784
2785    ret = vtd_remap_irq_get(iommu, index, &irq, sid);
2786    if (ret) {
2787        return ret;
2788    }
2789
2790    if (addr.addr.sub_valid) {
2791        trace_vtd_ir_remap_type("MSI");
2792        if (origin->data & VTD_IR_MSI_DATA_RESERVED) {
2793            trace_vtd_err_ir_msi_invalid(sid, origin->address, origin->data);
2794            return -VTD_FR_IR_REQ_RSVD;
2795        }
2796    } else {
2797        uint8_t vector = origin->data & 0xff;
2798        uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
2799
2800        trace_vtd_ir_remap_type("IOAPIC");
2801        /* IOAPIC entry vector should be aligned with IRTE vector
2802         * (see vt-d spec 5.1.5.1). */
2803        if (vector != irq.vector) {
2804            trace_vtd_warn_ir_vector(sid, index, vector, irq.vector);
2805        }
2806
2807        /* The Trigger Mode field must match the Trigger Mode in the IRTE.
2808         * (see vt-d spec 5.1.5.1). */
2809        if (trigger_mode != irq.trigger_mode) {
2810            trace_vtd_warn_ir_trigger(sid, index, trigger_mode,
2811                                      irq.trigger_mode);
2812        }
2813    }
2814
2815    /*
2816     * Better to keep the last two bits, since the guest OS may modify
2817     * them.  Keeping them does no harm in any case.
2818     */
2819    irq.msi_addr_last_bits = addr.addr.__not_care;
2820
2821    /* Translate VTDIrq to MSI message */
2822    vtd_generate_msi_message(&irq, translated);
2823
2824out:
2825    trace_vtd_ir_remap_msi(origin->address, origin->data,
2826                           translated->address, translated->data);
2827    return 0;
2828}
2829
2830static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
2831                         MSIMessage *dst, uint16_t sid)
2832{
2833    return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu),
2834                                   src, dst, sid);
2835}
2836
2837static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
2838                                   uint64_t *data, unsigned size,
2839                                   MemTxAttrs attrs)
2840{
2841    return MEMTX_OK;
2842}
2843
2844static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr,
2845                                    uint64_t value, unsigned size,
2846                                    MemTxAttrs attrs)
2847{
2848    int ret = 0;
2849    MSIMessage from = {}, to = {};
2850    uint16_t sid = X86_IOMMU_SID_INVALID;
2851
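        /* Rebuild the original MSI message from the MMIO write offset and data */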
2852    from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
2853    from.data = (uint32_t) value;
2854
2855    if (!attrs.unspecified) {
2856        /* We have explicit Source ID */
2857        sid = attrs.requester_id;
2858    }
2859
2860    ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid);
2861    if (ret) {
2862        /* TODO: report error */
2863        /* Drop this interrupt */
2864        return MEMTX_ERROR;
2865    }
2866
2867    apic_get_class()->send_msi(&to);
2868
2869    return MEMTX_OK;
2870}
2871
2872static const MemoryRegionOps vtd_mem_ir_ops = {
2873    .read_with_attrs = vtd_mem_ir_read,
2874    .write_with_attrs = vtd_mem_ir_write,
2875    .endianness = DEVICE_LITTLE_ENDIAN,
2876    .impl = {
2877        .min_access_size = 4,
2878        .max_access_size = 4,
2879    },
2880    .valid = {
2881        .min_access_size = 4,
2882        .max_access_size = 4,
2883    },
2884};
2885
2886VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
2887{
2888    uintptr_t key = (uintptr_t)bus;
2889    VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
2890    VTDAddressSpace *vtd_dev_as;
2891    char name[128];
2892
2893    if (!vtd_bus) {
2894        uintptr_t *new_key = g_malloc(sizeof(*new_key));
2895        *new_key = (uintptr_t)bus;
2896        /* No corresponding free() */
2897        vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
2898                            PCI_DEVFN_MAX);
2899        vtd_bus->bus = bus;
2900        g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
2901    }
2902
2903    vtd_dev_as = vtd_bus->dev_as[devfn];
2904
2905    if (!vtd_dev_as) {
2906        snprintf(name, sizeof(name), "intel_iommu_devfn_%d", devfn);
2907        vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace));
2908
2909        vtd_dev_as->bus = bus;
2910        vtd_dev_as->devfn = (uint8_t)devfn;
2911        vtd_dev_as->iommu_state = s;
2912        vtd_dev_as->context_cache_entry.context_cache_gen = 0;
2913        vtd_dev_as->iova_tree = iova_tree_new();
2914
2915        /*
2916         * The memory region relationships look like this (address
2917         * ranges show only the lower 32 bits for brevity):
2918         *
2919         * |-----------------+-------------------+----------|
2920         * | Name            | Address range     | Priority |
2921         * |-----------------+-------------------+----------+
2922         * | vtd_root        | 00000000-ffffffff |        0 |
2923         * |  intel_iommu    | 00000000-ffffffff |        1 |
2924         * |  vtd_sys_alias  | 00000000-ffffffff |        1 |
2925         * |  intel_iommu_ir | fee00000-feefffff |       64 |
2926         * |-----------------+-------------------+----------|
2927         *
2928         * We enable/disable DMAR by toggling the vtd_sys_alias and
2929         * intel_iommu regions.  The IR region is always
2930         * enabled.
2931         */
2932        memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu),
2933                                 TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s),
2934                                 "intel_iommu_dmar",
2935                                 UINT64_MAX);
2936        memory_region_init_alias(&vtd_dev_as->sys_alias, OBJECT(s),
2937                                 "vtd_sys_alias", get_system_memory(),
2938                                 0, memory_region_size(get_system_memory()));
2939        memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s),
2940                              &vtd_mem_ir_ops, s, "intel_iommu_ir",
2941                              VTD_INTERRUPT_ADDR_SIZE);
2942        memory_region_init(&vtd_dev_as->root, OBJECT(s),
2943                           "vtd_root", UINT64_MAX);
2944        memory_region_add_subregion_overlap(&vtd_dev_as->root,
2945                                            VTD_INTERRUPT_ADDR_FIRST,
2946                                            &vtd_dev_as->iommu_ir, 64);
2947        address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name);
2948        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
2949                                            &vtd_dev_as->sys_alias, 1);
2950        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
2951                                            MEMORY_REGION(&vtd_dev_as->iommu),
2952                                            1);
2953        vtd_switch_address_space(vtd_dev_as);
2954    }
2955    return vtd_dev_as;
2956}
2957
2958/* Unmap the whole range in the notifier's scope. */
2959static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
2960{
2961    IOMMUTLBEntry entry;
2962    hwaddr size;
2963    hwaddr start = n->start;
2964    hwaddr end = n->end;
2965    IntelIOMMUState *s = as->iommu_state;
2966    DMAMap map;
2967
2968    /*
2969     * Note: all the code in this function assumes that IOVAs use no
2970     * more than VTD_MGAW bits (as restricted by the VT-d spec);
2971     * otherwise we would need to consider 64-bit overflow.
2972     */
2973
2974    if (end > VTD_ADDRESS_SIZE(s->aw_bits)) {
2975        /*
2976         * No need to unmap regions bigger than the address space
2977         * size that VT-d supports.
2978         */
2979        end = VTD_ADDRESS_SIZE(s->aw_bits);
2980    }
2981
2982    assert(start <= end);
2983    size = end - start;
2984
2985    if (ctpop64(size) != 1) {
2986        /*
2987         * This size cannot form a valid address mask.  Enlarge it to
2988         * the smallest power of two that covers the range.
2989         */
2990        int n = 64 - clz64(size);
2991        if (n > s->aw_bits) {
2992            /* should not happen, but in case it happens, limit it */
2993            n = s->aw_bits;
2994        }
2995        size = 1ULL << n;
2996    }
2997
2998    entry.target_as = &address_space_memory;
2999    /* Adjust iova for the size */
3000    entry.iova = n->start & ~(size - 1);
3001    /* This field is meaningless for unmap */
3002    entry.translated_addr = 0;
3003    entry.perm = IOMMU_NONE;
3004    entry.addr_mask = size - 1;
3005
3006    trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
3007                             VTD_PCI_SLOT(as->devfn),
3008                             VTD_PCI_FUNC(as->devfn),
3009                             entry.iova, size);
3010
3011    map.iova = entry.iova;
3012    map.size = entry.addr_mask;
3013    iova_tree_remove(as->iova_tree, &map);
3014
3015    memory_region_notify_one(n, &entry);
3016}
3017
3018static void vtd_address_space_unmap_all(IntelIOMMUState *s)
3019{
3020    VTDAddressSpace *vtd_as;
3021    IOMMUNotifier *n;
3022
3023    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
3024        IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
3025            vtd_address_space_unmap(vtd_as, n);
3026        }
3027    }
3028}
3029
3030static void vtd_address_space_refresh_all(IntelIOMMUState *s)
3031{
3032    vtd_address_space_unmap_all(s);
3033    vtd_switch_address_space_all(s);
3034}
3035
3036static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
3037{
3038    memory_region_notify_one((IOMMUNotifier *)private, entry);
3039    return 0;
3040}
3041
3042static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
3043{
3044    VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
3045    IntelIOMMUState *s = vtd_as->iommu_state;
3046    uint8_t bus_n = pci_bus_num(vtd_as->bus);
3047    VTDContextEntry ce;
3048
3049    /*
3050     * The replay can be triggered by either an invalidation or a newly
3051     * created entry.  Either way, release the existing mappings first
3052     * (for UNMAP-only notifiers this amounts to flushing the caches).
3053     */
3054    vtd_address_space_unmap(vtd_as, n);
3055
3056    if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
3057        trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn),
3058                                  PCI_FUNC(vtd_as->devfn),
3059                                  VTD_CONTEXT_ENTRY_DID(ce.hi),
3060                                  ce.hi, ce.lo);
3061        if (vtd_as_has_map_notifier(vtd_as)) {
3062            /* This is required only for MAP typed notifiers */
3063            vtd_page_walk_info info = {
3064                .hook_fn = vtd_replay_hook,
3065                .private = (void *)n,
3066                .notify_unmap = false,
3067                .aw = s->aw_bits,
3068                .as = vtd_as,
3069                .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi),
3070            };
3071
3072            vtd_page_walk(&ce, 0, ~0ULL, &info);
3073        }
3074    } else {
3075        trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
3076                                    PCI_FUNC(vtd_as->devfn));
3077    }
3078
3079    return;
3080}
3081
3082/* Do the initialization. This is also called on reset, so take care
3083 * when adding new initialization code.
3084 */
3085static void vtd_init(IntelIOMMUState *s)
3086{
3087    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3088
3089    memset(s->csr, 0, DMAR_REG_SIZE);
3090    memset(s->wmask, 0, DMAR_REG_SIZE);
3091    memset(s->w1cmask, 0, DMAR_REG_SIZE);
3092    memset(s->womask, 0, DMAR_REG_SIZE);
3093
3094    s->root = 0;
3095    s->root_extended = false;
3096    s->dmar_enabled = false;
3097    s->iq_head = 0;
3098    s->iq_tail = 0;
3099    s->iq = 0;
3100    s->iq_size = 0;
3101    s->qi_enabled = false;
3102    s->iq_last_desc_type = VTD_INV_DESC_NONE;
3103    s->next_frcd_reg = 0;
3104    s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
3105             VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
3106             VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
3107    if (s->aw_bits == VTD_HOST_AW_48BIT) {
3108        s->cap |= VTD_CAP_SAGAW_48bit;
3109    }
3110    s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
3111
3112    /*
3113     * Rsvd field masks for spte
3114     */
3115    vtd_paging_entry_rsvd_field[0] = ~0ULL;
3116    vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
3117    vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
3118    vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
3119    vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
3120    vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits);
3121    vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits);
3122    vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits);
3123    vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits);
3124
3125    if (x86_iommu->intr_supported) {
3126        s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
3127        if (s->intr_eim == ON_OFF_AUTO_ON) {
3128            s->ecap |= VTD_ECAP_EIM;
3129        }
3130        assert(s->intr_eim != ON_OFF_AUTO_AUTO);
3131    }
3132
3133    if (x86_iommu->dt_supported) {
3134        s->ecap |= VTD_ECAP_DT;
3135    }
3136
3137    if (x86_iommu->pt_supported) {
3138        s->ecap |= VTD_ECAP_PT;
3139    }
3140
3141    if (s->caching_mode) {
3142        s->cap |= VTD_CAP_CM;
3143    }
3144
3145    vtd_reset_caches(s);
3146
3147    /* Define registers with default values and bit semantics */
3148    vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
3149    vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
3150    vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
3151    vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
3152    vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
3153    vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
3154    vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffff000ULL, 0);
3155    vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
3156    vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
3157
3158    /* Advanced Fault Logging not supported */
3159    vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
3160    vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3161    vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
3162    vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);
3163
3164    /* Treated as RsvdZ when EIM in ECAP_REG is not supported
3165     * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
3166     */
3167    vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);
3168
3169    /* Treated as RO for implementations that report the PLMR and PHMR
3170     * fields as Clear in the CAP_REG.
3171     * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
3172     */
3173    vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);
3174
3175    vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
3176    vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
3177    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff007ULL, 0);
3178    vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
3179    vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3180    vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
3181    vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
3182    /* Treated as RsvdZ when EIM in ECAP_REG is not supported */
3183    vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);
3184
3185    /* IOTLB registers */
3186    vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0xb003ffff00000000ULL, 0);
3187    vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
3188    vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);
3189
3190    /* Fault Recording Registers, 128-bit */
3191    vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
3192    vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);
3193
3194    /*
3195     * Interrupt remapping registers.
3196     */
3197    vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
3198}
3199
3200/* Do not reset the address_spaces on device reset: devices keep using the
3201 * address space they were given initially (they won't ask the bus again).
3202 */
3203static void vtd_reset(DeviceState *dev)
3204{
3205    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3206
3207    vtd_init(s);
3208    vtd_address_space_refresh_all(s);
3209}
3210
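/*
 * Callback handed to pci_setup_iommu() in vtd_realize() below; it returns
 * the AddressSpace a device at (bus, devfn) has to use for DMA.  A rough
 * sketch of the effect (simplified, the devfn is only an example):
 *
 *     AddressSpace *as = vtd_host_dma_iommu(bus, s, PCI_DEVFN(3, 0));
 *     pci_dma_read(pdev, gpa, buf, len);   // DMA now goes through that
 *                                          // device's VTDAddressSpace
 */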
3211static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
3212{
3213    IntelIOMMUState *s = opaque;
3214    VTDAddressSpace *vtd_as;
3215
3216    assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
3217
3218    vtd_as = vtd_find_add_as(s, bus, devfn);
3219    return &vtd_as->as;
3220}
3221
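/*
 * Sanity-check the user-visible configuration before realize.  Illustrative
 * command lines against the checks below (examples only, not exhaustive):
 *
 *   accepted:  -machine q35,accel=kvm,kernel-irqchip=split \
 *              -device intel-iommu,intremap=on,eim=on
 *   rejected:  -machine q35,accel=kvm,kernel-irqchip=on \
 *              -device intel-iommu,intremap=on       (IR needs split or off)
 *   rejected:  -device intel-iommu,eim=on            (eim=on needs intremap=on)
 */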
3222static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
3223{
3224    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3225
3226    /* Currently Intel IOMMU IR only supports "kernel-irqchip={off|split}" */
3227    if (x86_iommu->intr_supported && kvm_irqchip_in_kernel() &&
3228        !kvm_irqchip_is_split()) {
3229        error_setg(errp, "Intel Interrupt Remapping cannot work with "
3230                         "kernel-irqchip=on; please use 'split' or 'off'.");
3231        return false;
3232    }
3233    if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu->intr_supported) {
3234        error_setg(errp, "eim=on cannot be selected without intremap=on");
3235        return false;
3236    }
3237
3238    if (s->intr_eim == ON_OFF_AUTO_AUTO) {
3239        s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim)
3240                      && x86_iommu->intr_supported ?
3241                                              ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
3242    }
3243    if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
3244        if (!kvm_irqchip_in_kernel()) {
3245            error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
3246            return false;
3247        }
3248        if (!kvm_enable_x2apic()) {
3249            error_setg(errp, "eim=on requires support on the KVM side "
3250                             "(X2APIC_API, first shipped in v4.7)");
3251            return false;
3252        }
3253    }
3254
3255    /* Currently the only supported address widths are 39 and 48 bits */
3256    if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
3257        (s->aw_bits != VTD_HOST_AW_48BIT)) {
3258        error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
3259                   VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
3260        return false;
3261    }
3262
3263    return true;
3264}
3265
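/*
 * Realize: validate the configuration, initialize the register block and
 * caches, map the CSR MMIO window at Q35_HOST_BRIDGE_IOMMU_ADDR and register
 * vtd_host_dma_iommu() so every device on the root bus gets its own
 * VTDAddressSpace.  The IOAPIC is given a pseudo devfn so that its interrupt
 * messages can go through the remapping hardware as well.
 */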
3266static void vtd_realize(DeviceState *dev, Error **errp)
3267{
3268    MachineState *ms = MACHINE(qdev_get_machine());
3269    PCMachineState *pcms = PC_MACHINE(ms);
3270    PCIBus *bus = pcms->bus;
3271    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3272    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
3273
3274    x86_iommu->type = TYPE_INTEL;
3275
3276    if (!vtd_decide_config(s, errp)) {
3277        return;
3278    }
3279
3280    QLIST_INIT(&s->vtd_as_with_notifiers);
3281    qemu_mutex_init(&s->iommu_lock);
3282    memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
3283    memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
3284                          "intel_iommu", DMAR_REG_SIZE);
3285    sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
3286    /* No corresponding destroy */
3287    s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3288                                     g_free, g_free);
3289    s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3290                                              g_free, g_free);
3291    vtd_init(s);
3292    sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
3293    pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
3294    /* Pseudo address space under root PCI bus. */
3295    pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
3296}
3297
3298static void vtd_class_init(ObjectClass *klass, void *data)
3299{
3300    DeviceClass *dc = DEVICE_CLASS(klass);
3301    X86IOMMUClass *x86_class = X86_IOMMU_CLASS(klass);
3302
3303    dc->reset = vtd_reset;
3304    dc->vmsd = &vtd_vmstate;
3305    dc->props = vtd_properties;
3306    dc->hotpluggable = false;
3307    x86_class->realize = vtd_realize;
3308    x86_class->int_remap = vtd_int_remap;
3309    /* Supported by the pc-q35-* machine types */
3310    dc->user_creatable = true;
3311}
3312
3313static const TypeInfo vtd_info = {
3314    .name          = TYPE_INTEL_IOMMU_DEVICE,
3315    .parent        = TYPE_X86_IOMMU_DEVICE,
3316    .instance_size = sizeof(IntelIOMMUState),
3317    .class_init    = vtd_class_init,
3318};
3319
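/*
 * Hooks used by the memory core for TYPE_INTEL_IOMMU_MEMORY_REGION regions:
 * translate() services DMA accesses from devices behind the remapping
 * hardware, notify_flag_changed() runs when an IOMMUNotifier is registered
 * or unregistered, and replay() re-walks the existing mappings for a newly
 * attached notifier (e.g. through memory_region_iommu_replay()).
 */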
3320static void vtd_iommu_memory_region_class_init(ObjectClass *klass,
3321                                                     void *data)
3322{
3323    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
3324
3325    imrc->translate = vtd_iommu_translate;
3326    imrc->notify_flag_changed = vtd_iommu_notify_flag_changed;
3327    imrc->replay = vtd_iommu_replay;
3328}
3329
3330static const TypeInfo vtd_iommu_memory_region_info = {
3331    .parent = TYPE_IOMMU_MEMORY_REGION,
3332    .name = TYPE_INTEL_IOMMU_MEMORY_REGION,
3333    .class_init = vtd_iommu_memory_region_class_init,
3334};
3335
3336static void vtd_register_types(void)
3337{
3338    type_register_static(&vtd_info);
3339    type_register_static(&vtd_iommu_memory_region_info);
3340}
3341
3342type_init(vtd_register_types)
3343