qemu/hw/i386/intel_iommu.c
   1/*
   2 * QEMU emulation of an Intel IOMMU (VT-d)
   3 *   (DMA Remapping device)
   4 *
   5 * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com>
   6 * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com>
   7 *
   8 * This program is free software; you can redistribute it and/or modify
   9 * it under the terms of the GNU General Public License as published by
  10 * the Free Software Foundation; either version 2 of the License, or
  11 * (at your option) any later version.
  12
  13 * This program is distributed in the hope that it will be useful,
  14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 * GNU General Public License for more details.
  17
  18 * You should have received a copy of the GNU General Public License along
  19 * with this program; if not, see <http://www.gnu.org/licenses/>.
  20 */
  21
  22#include "qemu/osdep.h"
  23#include "qemu/error-report.h"
  24#include "qemu/main-loop.h"
  25#include "qapi/error.h"
  26#include "hw/sysbus.h"
  27#include "intel_iommu_internal.h"
  28#include "hw/pci/pci.h"
  29#include "hw/pci/pci_bus.h"
  30#include "hw/qdev-properties.h"
  31#include "hw/i386/pc.h"
  32#include "hw/i386/apic-msidef.h"
  33#include "hw/i386/x86-iommu.h"
  34#include "hw/pci-host/q35.h"
  35#include "sysemu/kvm.h"
  36#include "sysemu/dma.h"
  37#include "sysemu/sysemu.h"
  38#include "hw/i386/apic_internal.h"
  39#include "kvm/kvm_i386.h"
  40#include "migration/vmstate.h"
  41#include "trace.h"
  42
  43/* context entry operations */
  44#define VTD_CE_GET_RID2PASID(ce) \
  45    ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
  46#define VTD_CE_GET_PASID_DIR_TABLE(ce) \
  47    ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK)
  48
  49/* pe operations */
  50#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
  51#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
  52#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\
  53    if (ret_fr) {                                                             \
  54        ret_fr = -ret_fr;                                                     \
  55        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {                   \
  56            trace_vtd_fault_disabled();                                       \
  57        } else {                                                              \
  58            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);      \
  59        }                                                                     \
  60        goto error;                                                           \
  61    }                                                                         \
  62}
  63
  64static void vtd_address_space_refresh_all(IntelIOMMUState *s);
  65static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
  66
  67static void vtd_panic_require_caching_mode(void)
  68{
  69    error_report("We need to set caching-mode=on for intel-iommu to enable "
  70                 "device assignment with IOMMU protection.");
  71    exit(1);
  72}
  73
  74static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
  75                            uint64_t wmask, uint64_t w1cmask)
  76{
  77    stq_le_p(&s->csr[addr], val);
  78    stq_le_p(&s->wmask[addr], wmask);
  79    stq_le_p(&s->w1cmask[addr], w1cmask);
  80}
  81
  82static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask)
  83{
  84    stq_le_p(&s->womask[addr], mask);
  85}
  86
  87static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val,
  88                            uint32_t wmask, uint32_t w1cmask)
  89{
  90    stl_le_p(&s->csr[addr], val);
  91    stl_le_p(&s->wmask[addr], wmask);
  92    stl_le_p(&s->w1cmask[addr], w1cmask);
  93}
  94
  95static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask)
  96{
  97    stl_le_p(&s->womask[addr], mask);
  98}
  99
 100/* "External" get/set operations */
 101static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val)
 102{
 103    uint64_t oldval = ldq_le_p(&s->csr[addr]);
 104    uint64_t wmask = ldq_le_p(&s->wmask[addr]);
 105    uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
 106    stq_le_p(&s->csr[addr],
 107             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
 108}
 109
 110static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val)
 111{
 112    uint32_t oldval = ldl_le_p(&s->csr[addr]);
 113    uint32_t wmask = ldl_le_p(&s->wmask[addr]);
 114    uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
 115    stl_le_p(&s->csr[addr],
 116             ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
 117}
 118
 119static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr)
 120{
 121    uint64_t val = ldq_le_p(&s->csr[addr]);
 122    uint64_t womask = ldq_le_p(&s->womask[addr]);
 123    return val & ~womask;
 124}
 125
 126static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr)
 127{
 128    uint32_t val = ldl_le_p(&s->csr[addr]);
 129    uint32_t womask = ldl_le_p(&s->womask[addr]);
 130    return val & ~womask;
 131}
 132
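/*
 * Illustrative note (not part of the original file): the wmask / w1cmask
 * pair implements the usual RW / RW1C register semantics.  Bits inside
 * wmask are software-writable; bits inside w1cmask are "write 1 to clear".
 * A minimal worked example, assuming a register with wmask = 0x00f0,
 * w1cmask = 0x000f, current value 0x00aa, and a guest write of 0x005b:
 *
 *     (0x00aa & ~0x00f0) | (0x005b & 0x00f0)  = 0x005a
 *     0x005a & ~(0x000f & 0x005b)             = 0x0050
 *
 * The writable nibble takes the new value, the W1C bits written as 1
 * clear the previously-set bits 1 and 3, and everything else is kept.
 */
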
 133/* "Internal" get/set operations */
 134static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr)
 135{
 136    return ldq_le_p(&s->csr[addr]);
 137}
 138
 139static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr)
 140{
 141    return ldl_le_p(&s->csr[addr]);
 142}
 143
 144static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val)
 145{
 146    stq_le_p(&s->csr[addr], val);
 147}
 148
 149static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
 150                                        uint32_t clear, uint32_t mask)
 151{
 152    uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask;
 153    stl_le_p(&s->csr[addr], new_val);
 154    return new_val;
 155}
 156
 157static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
 158                                        uint64_t clear, uint64_t mask)
 159{
 160    uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask;
 161    stq_le_p(&s->csr[addr], new_val);
 162    return new_val;
 163}
 164
 165static inline void vtd_iommu_lock(IntelIOMMUState *s)
 166{
 167    qemu_mutex_lock(&s->iommu_lock);
 168}
 169
 170static inline void vtd_iommu_unlock(IntelIOMMUState *s)
 171{
 172    qemu_mutex_unlock(&s->iommu_lock);
 173}
 174
 175static void vtd_update_scalable_state(IntelIOMMUState *s)
 176{
 177    uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
 178
 179    if (s->scalable_mode) {
 180        s->root_scalable = val & VTD_RTADDR_SMT;
 181    }
 182}
 183
 184static void vtd_update_iq_dw(IntelIOMMUState *s)
 185{
 186    uint64_t val = vtd_get_quad_raw(s, DMAR_IQA_REG);
 187
 188    if (s->ecap & VTD_ECAP_SMTS &&
 189        val & VTD_IQA_DW_MASK) {
 190        s->iq_dw = true;
 191    } else {
 192        s->iq_dw = false;
 193    }
 194}
 195
 196/* Whether the address space needs to notify new mappings */
 197static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
 198{
 199    return as->notifier_flags & IOMMU_NOTIFIER_MAP;
 200}
 201
 202/* GHashTable functions */
 203static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 204{
 205    return *((const uint64_t *)v1) == *((const uint64_t *)v2);
 206}
 207
 208static guint vtd_uint64_hash(gconstpointer v)
 209{
 210    return (guint)*(const uint64_t *)v;
 211}
 212
 213static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
 214                                          gpointer user_data)
 215{
 216    VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
 217    uint16_t domain_id = *(uint16_t *)user_data;
 218    return entry->domain_id == domain_id;
 219}
 220
 221/* The shift of an addr for a certain level of paging structure */
 222static inline uint32_t vtd_slpt_level_shift(uint32_t level)
 223{
 224    assert(level != 0);
 225    return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
 226}
 227
 228static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
 229{
 230    return ~((1ULL << vtd_slpt_level_shift(level)) - 1);
 231}
 232
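/*
 * Illustrative note (not part of the original file), assuming the usual
 * VTD_SL_LEVEL_BITS == 9 and VTD_PAGE_SHIFT_4K == 12:
 *
 *     vtd_slpt_level_shift(1) == 12   -> 4KiB span per PTE
 *     vtd_slpt_level_shift(2) == 21   -> 2MiB span per PDE
 *     vtd_slpt_level_shift(3) == 30   -> 1GiB span per PDPE
 *
 * and vtd_slpt_level_page_mask(2) == ~0x1fffffULL, i.e. the mask that
 * rounds an address down to its 2MiB-aligned base.
 */
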
 233static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
 234                                        gpointer user_data)
 235{
 236    VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
 237    VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
 238    uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
 239    uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
 240    return (entry->domain_id == info->domain_id) &&
 241            (((entry->gfn & info->mask) == gfn) ||
 242             (entry->gfn == gfn_tlb));
 243}
 244
 245/* Reset all the gen of VTDAddressSpace to zero and set the gen of
 246 * IntelIOMMUState to 1.  Must be called with IOMMU lock held.
 247 */
 248static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
 249{
 250    VTDAddressSpace *vtd_as;
 251    VTDBus *vtd_bus;
 252    GHashTableIter bus_it;
 253    uint32_t devfn_it;
 254
 255    trace_vtd_context_cache_reset();
 256
 257    g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr);
 258
 259    while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) {
 260        for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
 261            vtd_as = vtd_bus->dev_as[devfn_it];
 262            if (!vtd_as) {
 263                continue;
 264            }
 265            vtd_as->context_cache_entry.context_cache_gen = 0;
 266        }
 267    }
 268    s->context_cache_gen = 1;
 269}
 270
 271/* Must be called with IOMMU lock held. */
 272static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
 273{
 274    assert(s->iotlb);
 275    g_hash_table_remove_all(s->iotlb);
 276}
 277
 278static void vtd_reset_iotlb(IntelIOMMUState *s)
 279{
 280    vtd_iommu_lock(s);
 281    vtd_reset_iotlb_locked(s);
 282    vtd_iommu_unlock(s);
 283}
 284
 285static void vtd_reset_caches(IntelIOMMUState *s)
 286{
 287    vtd_iommu_lock(s);
 288    vtd_reset_iotlb_locked(s);
 289    vtd_reset_context_cache_locked(s);
 290    vtd_iommu_unlock(s);
 291}
 292
 293static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
 294                                  uint32_t level)
 295{
 296    return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
 297           ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
 298}
 299
 300static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
 301{
 302    return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
 303}
 304
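/*
 * Illustrative note (not part of the original file): an IOTLB key is a
 * single 64-bit value packing the guest frame number, the source-id
 * (bus/devfn) and the paging-structure level:
 *
 *     key = gfn | (source_id << VTD_IOTLB_SID_SHIFT)
 *               | (level << VTD_IOTLB_LVL_SHIFT)
 *
 * so the same guest frame cached for two different devices, or at two
 * different levels (e.g. a 4KiB and a 2MiB mapping), never collides in
 * the hash table.
 */
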
 305/* Must be called with IOMMU lock held */
 306static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
 307                                       hwaddr addr)
 308{
 309    VTDIOTLBEntry *entry;
 310    uint64_t key;
 311    int level;
 312
 313    for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
 314        key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
 315                                source_id, level);
 316        entry = g_hash_table_lookup(s->iotlb, &key);
 317        if (entry) {
 318            goto out;
 319        }
 320    }
 321
 322out:
 323    return entry;
 324}
 325
 326/* Must be called with IOMMU lock held */
 327static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
 328                             uint16_t domain_id, hwaddr addr, uint64_t slpte,
 329                             uint8_t access_flags, uint32_t level)
 330{
 331    VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
 332    uint64_t *key = g_malloc(sizeof(*key));
 333    uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
 334
 335    trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
 336    if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
 337        trace_vtd_iotlb_reset("iotlb exceeds size limit");
 338        vtd_reset_iotlb_locked(s);
 339    }
 340
 341    entry->gfn = gfn;
 342    entry->domain_id = domain_id;
 343    entry->slpte = slpte;
 344    entry->access_flags = access_flags;
 345    entry->mask = vtd_slpt_level_page_mask(level);
 346    *key = vtd_get_iotlb_key(gfn, source_id, level);
 347    g_hash_table_replace(s->iotlb, key, entry);
 348}
 349
 350/* Given the register addresses of both the MSI message data and address,
 351 * generate an interrupt via MSI.
 352 */
 353static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
 354                                   hwaddr mesg_data_reg)
 355{
 356    MSIMessage msi;
 357
 358    assert(mesg_data_reg < DMAR_REG_SIZE);
 359    assert(mesg_addr_reg < DMAR_REG_SIZE);
 360
 361    msi.address = vtd_get_long_raw(s, mesg_addr_reg);
 362    msi.data = vtd_get_long_raw(s, mesg_data_reg);
 363
 364    trace_vtd_irq_generate(msi.address, msi.data);
 365
 366    apic_get_class()->send_msi(&msi);
 367}
 368
 369/* Generate a fault event to software via MSI if conditions are met.
 370 * Notice that the value of FSTS_REG being passed to it should be the one
 371 * before any update.
 372 */
 373static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
 374{
 375    if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
 376        pre_fsts & VTD_FSTS_IQE) {
 377        error_report_once("There are previous interrupt conditions "
 378                          "to be serviced by software, fault event "
 379                          "is not generated");
 380        return;
 381    }
 382    vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
 383    if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
 384        error_report_once("Interrupt Mask set, irq is not generated");
 385    } else {
 386        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
 387        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
 388    }
 389}
 390
 391/* Check if the Fault (F) field of the Fault Recording Register referenced by
 392 * @index is Set.
 393 */
 394static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
 395{
 396    /* Each reg is 128-bit */
 397    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
 398    addr += 8; /* Access the high 64-bit half */
 399
 400    assert(index < DMAR_FRCD_REG_NR);
 401
 402    return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
 403}
 404
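/*
 * Illustrative note (not part of the original file): fault recording
 * registers are 128 bits each and laid out back to back starting at
 * DMAR_FRCD_REG_OFFSET, so register @index begins at
 * DMAR_FRCD_REG_OFFSET + 16 * index, and the "+ 8" above selects the
 * high 64-bit half that holds the F, SID and FR fields.
 */
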
 405/* Update the PPF field of Fault Status Register.
 406 * Should be called whenever the F field of any fault recording
 407 * register is changed.
 408 */
 409static void vtd_update_fsts_ppf(IntelIOMMUState *s)
 410{
 411    uint32_t i;
 412    uint32_t ppf_mask = 0;
 413
 414    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
 415        if (vtd_is_frcd_set(s, i)) {
 416            ppf_mask = VTD_FSTS_PPF;
 417            break;
 418        }
 419    }
 420    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
 421    trace_vtd_fsts_ppf(!!ppf_mask);
 422}
 423
 424static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index)
 425{
 426    /* Each reg is 128-bit */
 427    hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
 428    addr += 8; /* Access the high 64-bit half */
 429
 430    assert(index < DMAR_FRCD_REG_NR);
 431
 432    vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
 433    vtd_update_fsts_ppf(s);
 434}
 435
 436/* The F field must not be updated here; that is done later */
 437static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
 438                            uint16_t source_id, hwaddr addr,
 439                            VTDFaultReason fault, bool is_write)
 440{
 441    uint64_t hi = 0, lo;
 442    hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
 443
 444    assert(index < DMAR_FRCD_REG_NR);
 445
 446    lo = VTD_FRCD_FI(addr);
 447    hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
 448    if (!is_write) {
 449        hi |= VTD_FRCD_T;
 450    }
 451    vtd_set_quad_raw(s, frcd_reg_addr, lo);
 452    vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);
 453
 454    trace_vtd_frr_new(index, hi, lo);
 455}
 456
 457/* Try to collapse multiple pending faults from the same requester */
 458static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
 459{
 460    uint32_t i;
 461    uint64_t frcd_reg;
 462    hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */
 463
 464    for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
 465        frcd_reg = vtd_get_quad_raw(s, addr);
 466        if ((frcd_reg & VTD_FRCD_F) &&
 467            ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
 468            return true;
 469        }
 470        addr += 16; /* 128-bit for each */
 471    }
 472    return false;
 473}
 474
 475/* Log and report a DMAR (address translation) fault to software */
 476static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
 477                                  hwaddr addr, VTDFaultReason fault,
 478                                  bool is_write)
 479{
 480    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
 481
 482    assert(fault < VTD_FR_MAX);
 483
 484    trace_vtd_dmar_fault(source_id, fault, addr, is_write);
 485
 486    if (fsts_reg & VTD_FSTS_PFO) {
 487        error_report_once("New fault is not recorded due to "
 488                          "Primary Fault Overflow");
 489        return;
 490    }
 491
 492    if (vtd_try_collapse_fault(s, source_id)) {
 493        error_report_once("New fault is not recorded due to "
 494                          "compression of faults");
 495        return;
 496    }
 497
 498    if (vtd_is_frcd_set(s, s->next_frcd_reg)) {
 499        error_report_once("Next Fault Recording Reg is used, "
 500                          "new fault is not recorded, set PFO field");
 501        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
 502        return;
 503    }
 504
 505    vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
 506
 507    if (fsts_reg & VTD_FSTS_PPF) {
 508        error_report_once("There are pending faults already, "
 509                          "fault event is not generated");
 510        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg);
 511        s->next_frcd_reg++;
 512        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
 513            s->next_frcd_reg = 0;
 514        }
 515    } else {
 516        vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
 517                                VTD_FSTS_FRI(s->next_frcd_reg));
 518        vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */
 519        s->next_frcd_reg++;
 520        if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
 521            s->next_frcd_reg = 0;
 522        }
 523        /* This case actually causes the PPF to be Set.
 524         * So generate a fault event (interrupt).
 525         */
 526         vtd_generate_fault_event(s, fsts_reg);
 527    }
 528}
 529
 530/* Handle Invalidation Queue Errors, i.e. error conditions of the queued
 531 * invalidation interface.
 532 */
 533static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
 534{
 535    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
 536
 537    vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
 538    vtd_generate_fault_event(s, fsts_reg);
 539}
 540
 541/* Set the IWC field and try to generate an invalidation completion interrupt */
 542static void vtd_generate_completion_event(IntelIOMMUState *s)
 543{
 544    if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
 545        trace_vtd_inv_desc_wait_irq("One pending, skip current");
 546        return;
 547    }
 548    vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
 549    vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
 550    if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
 551        trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, "
 552                                    "new event not generated");
 553        return;
 554    } else {
 555        /* Generate the interrupt event */
 556        trace_vtd_inv_desc_wait_irq("Generating complete event");
 557        vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
 558        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
 559    }
 560}
 561
 562static inline bool vtd_root_entry_present(IntelIOMMUState *s,
 563                                          VTDRootEntry *re,
 564                                          uint8_t devfn)
 565{
 566    if (s->root_scalable && devfn > UINT8_MAX / 2) {
 567        return re->hi & VTD_ROOT_ENTRY_P;
 568    }
 569
 570    return re->lo & VTD_ROOT_ENTRY_P;
 571}
 572
 573static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
 574                              VTDRootEntry *re)
 575{
 576    dma_addr_t addr;
 577
 578    addr = s->root + index * sizeof(*re);
 579    if (dma_memory_read(&address_space_memory, addr,
 580                        re, sizeof(*re), MEMTXATTRS_UNSPECIFIED)) {
 581        re->lo = 0;
 582        return -VTD_FR_ROOT_TABLE_INV;
 583    }
 584    re->lo = le64_to_cpu(re->lo);
 585    re->hi = le64_to_cpu(re->hi);
 586    return 0;
 587}
 588
 589static inline bool vtd_ce_present(VTDContextEntry *context)
 590{
 591    return context->lo & VTD_CONTEXT_ENTRY_P;
 592}
 593
 594static int vtd_get_context_entry_from_root(IntelIOMMUState *s,
 595                                           VTDRootEntry *re,
 596                                           uint8_t index,
 597                                           VTDContextEntry *ce)
 598{
 599    dma_addr_t addr, ce_size;
 600
 601    /* We have already checked that the root entry is present. */
 602    ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE :
 603              VTD_CTX_ENTRY_LEGACY_SIZE;
 604
 605    if (s->root_scalable && index > UINT8_MAX / 2) {
 606        index = index & (~VTD_DEVFN_CHECK_MASK);
 607        addr = re->hi & VTD_ROOT_ENTRY_CTP;
 608    } else {
 609        addr = re->lo & VTD_ROOT_ENTRY_CTP;
 610    }
 611
 612    addr = addr + index * ce_size;
 613    if (dma_memory_read(&address_space_memory, addr,
 614                        ce, ce_size, MEMTXATTRS_UNSPECIFIED)) {
 615        return -VTD_FR_CONTEXT_TABLE_INV;
 616    }
 617
 618    ce->lo = le64_to_cpu(ce->lo);
 619    ce->hi = le64_to_cpu(ce->hi);
 620    if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) {
 621        ce->val[2] = le64_to_cpu(ce->val[2]);
 622        ce->val[3] = le64_to_cpu(ce->val[3]);
 623    }
 624    return 0;
 625}
 626
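/*
 * Illustrative example (not part of the original file): for a DMA request
 * from PCI device 02:05.3 in Legacy Mode, bus number 0x02 selects root
 * entry 2 of the root table at s->root, and devfn = (5 << 3) | 3 = 0x2b
 * selects context entry 0x2b of the context table the root entry points
 * to.  In Scalable Mode (per the code above) the low and high halves of
 * the root entry point to separate context tables for devfn 0-127 and
 * 128-255, and each context entry is 256 bits instead of 128.
 */
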
 627static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
 628{
 629    return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
 630}
 631
 632static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
 633{
 634    return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
 635}
 636
 637/* Whether the pte indicates the address of the page frame */
 638static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
 639{
 640    return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
 641}
 642
 643/* Get the content of an slpte located at @base_addr[@index] */
 644static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
 645{
 646    uint64_t slpte;
 647
 648    assert(index < VTD_SL_PT_ENTRY_NR);
 649
 650    if (dma_memory_read(&address_space_memory,
 651                        base_addr + index * sizeof(slpte),
 652                        &slpte, sizeof(slpte), MEMTXATTRS_UNSPECIFIED)) {
 653        slpte = (uint64_t)-1;
 654        return slpte;
 655    }
 656    slpte = le64_to_cpu(slpte);
 657    return slpte;
 658}
 659
 660/* Given an iova and the level of the paging structure, return the offset
 661 * within the current level.
 662 */
 663static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
 664{
 665    return (iova >> vtd_slpt_level_shift(level)) &
 666            ((1ULL << VTD_SL_LEVEL_BITS) - 1);
 667}
 668
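/*
 * Illustrative example (not part of the original file): with a 4-level
 * table and iova = 0x7f1234567000, each level consumes 9 bits of the
 * IOVA, exactly like x86 long-mode page tables:
 *
 *     vtd_iova_level_offset(iova, 4) == (iova >> 39) & 0x1ff == 0x0fe
 *     vtd_iova_level_offset(iova, 3) == (iova >> 30) & 0x1ff == 0x048
 *     vtd_iova_level_offset(iova, 2) == (iova >> 21) & 0x1ff == 0x1a2
 *     vtd_iova_level_offset(iova, 1) == (iova >> 12) & 0x1ff == 0x167
 */
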
 669/* Check Capability Register to see if the @level of page-table is supported */
 670static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
 671{
 672    return VTD_CAP_SAGAW_MASK & s->cap &
 673           (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
 674}
 675
 676/* Return true if check passed, otherwise false */
 677static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
 678                                     VTDPASIDEntry *pe)
 679{
 680    switch (VTD_PE_GET_TYPE(pe)) {
 681    case VTD_SM_PASID_ENTRY_FLT:
 682    case VTD_SM_PASID_ENTRY_SLT:
 683    case VTD_SM_PASID_ENTRY_NESTED:
 684        break;
 685    case VTD_SM_PASID_ENTRY_PT:
 686        if (!x86_iommu->pt_supported) {
 687            return false;
 688        }
 689        break;
 690    default:
 691        /* Unknown type */
 692        return false;
 693    }
 694    return true;
 695}
 696
 697static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 698{
 699    return pdire->val & 1;
 700}
 701
 702/**
 703 * The caller of this function should check the present bit if it wants
 704 * to use the pdir entry for anything beyond the fpd bit check.
 705 */
 706static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base,
 707                                         uint32_t pasid,
 708                                         VTDPASIDDirEntry *pdire)
 709{
 710    uint32_t index;
 711    dma_addr_t addr, entry_size;
 712
 713    index = VTD_PASID_DIR_INDEX(pasid);
 714    entry_size = VTD_PASID_DIR_ENTRY_SIZE;
 715    addr = pasid_dir_base + index * entry_size;
 716    if (dma_memory_read(&address_space_memory, addr,
 717                        pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) {
 718        return -VTD_FR_PASID_TABLE_INV;
 719    }
 720
 721    return 0;
 722}
 723
 724static inline bool vtd_pe_present(VTDPASIDEntry *pe)
 725{
 726    return pe->val[0] & VTD_PASID_ENTRY_P;
 727}
 728
 729static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
 730                                          uint32_t pasid,
 731                                          dma_addr_t addr,
 732                                          VTDPASIDEntry *pe)
 733{
 734    uint32_t index;
 735    dma_addr_t entry_size;
 736    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 737
 738    index = VTD_PASID_TABLE_INDEX(pasid);
 739    entry_size = VTD_PASID_ENTRY_SIZE;
 740    addr = addr + index * entry_size;
 741    if (dma_memory_read(&address_space_memory, addr,
 742                        pe, entry_size, MEMTXATTRS_UNSPECIFIED)) {
 743        return -VTD_FR_PASID_TABLE_INV;
 744    }
 745
 746    /* Do translation type check */
 747    if (!vtd_pe_type_check(x86_iommu, pe)) {
 748        return -VTD_FR_PASID_TABLE_INV;
 749    }
 750
 751    if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
 752        return -VTD_FR_PASID_TABLE_INV;
 753    }
 754
 755    return 0;
 756}
 757
 758/**
 759 * The caller of this function should check the present bit if it wants
 760 * to use the pasid entry for anything beyond the fpd bit check.
 761 */
 762static int vtd_get_pe_from_pdire(IntelIOMMUState *s,
 763                                 uint32_t pasid,
 764                                 VTDPASIDDirEntry *pdire,
 765                                 VTDPASIDEntry *pe)
 766{
 767    dma_addr_t addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK;
 768
 769    return vtd_get_pe_in_pasid_leaf_table(s, pasid, addr, pe);
 770}
 771
 772/**
 773 * This function gets a pasid entry from a specified pasid
 774 * table (including both the directory and the leaf table) for a
 775 * specified pasid.  Sanity checks are done so that only a present
 776 * pasid entry is returned to the caller.
 777 */
 778static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
 779                                       dma_addr_t pasid_dir_base,
 780                                       uint32_t pasid,
 781                                       VTDPASIDEntry *pe)
 782{
 783    int ret;
 784    VTDPASIDDirEntry pdire;
 785
 786    ret = vtd_get_pdire_from_pdir_table(pasid_dir_base,
 787                                        pasid, &pdire);
 788    if (ret) {
 789        return ret;
 790    }
 791
 792    if (!vtd_pdire_present(&pdire)) {
 793        return -VTD_FR_PASID_TABLE_INV;
 794    }
 795
 796    ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe);
 797    if (ret) {
 798        return ret;
 799    }
 800
 801    if (!vtd_pe_present(pe)) {
 802        return -VTD_FR_PASID_TABLE_INV;
 803    }
 804
 805    return 0;
 806}
 807
 808static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
 809                                      VTDContextEntry *ce,
 810                                      VTDPASIDEntry *pe)
 811{
 812    uint32_t pasid;
 813    dma_addr_t pasid_dir_base;
 814    int ret = 0;
 815
 816    pasid = VTD_CE_GET_RID2PASID(ce);
 817    pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
 818    ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe);
 819
 820    return ret;
 821}
 822
 823static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
 824                                VTDContextEntry *ce,
 825                                bool *pe_fpd_set)
 826{
 827    int ret;
 828    uint32_t pasid;
 829    dma_addr_t pasid_dir_base;
 830    VTDPASIDDirEntry pdire;
 831    VTDPASIDEntry pe;
 832
 833    pasid = VTD_CE_GET_RID2PASID(ce);
 834    pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
 835
 836    /*
 837     * No present bit check since fpd is meaningful even
 838     * if the present bit is clear.
 839     */
 840    ret = vtd_get_pdire_from_pdir_table(pasid_dir_base, pasid, &pdire);
 841    if (ret) {
 842        return ret;
 843    }
 844
 845    if (pdire.val & VTD_PASID_DIR_FPD) {
 846        *pe_fpd_set = true;
 847        return 0;
 848    }
 849
 850    if (!vtd_pdire_present(&pdire)) {
 851        return -VTD_FR_PASID_TABLE_INV;
 852    }
 853
 854    /*
 855     * No present bit check since fpd is meaningful even
 856     * if the present bit is clear.
 857     */
 858    ret = vtd_get_pe_from_pdire(s, pasid, &pdire, &pe);
 859    if (ret) {
 860        return ret;
 861    }
 862
 863    if (pe.val[0] & VTD_PASID_ENTRY_FPD) {
 864        *pe_fpd_set = true;
 865    }
 866
 867    return 0;
 868}
 869
 870/* Get the page-table level that hardware should use for the second-level
 871 * page-table walk from the Address Width field of context-entry.
 872 */
 873static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
 874{
 875    return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
 876}
 877
 878static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
 879                                   VTDContextEntry *ce)
 880{
 881    VTDPASIDEntry pe;
 882
 883    if (s->root_scalable) {
 884        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
 885        return VTD_PE_GET_LEVEL(&pe);
 886    }
 887
 888    return vtd_ce_get_level(ce);
 889}
 890
 891static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
 892{
 893    return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
 894}
 895
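/*
 * Illustrative note (not part of the original file): the AW field of a
 * context entry encodes the adjusted guest address width as 30 + AW * 9,
 * which also fixes the number of page-table levels (vtd_ce_get_level()
 * returns 2 + AW):
 *
 *     AW = 1  ->  39-bit AGAW, 3-level table
 *     AW = 2  ->  48-bit AGAW, 4-level table
 *
 * The scalable-mode variant below uses the same encoding, just taken
 * from pe.val[0] >> 2 masked with VTD_SM_PASID_ENTRY_AW.
 */
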
 896static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s,
 897                                  VTDContextEntry *ce)
 898{
 899    VTDPASIDEntry pe;
 900
 901    if (s->root_scalable) {
 902        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
 903        return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9;
 904    }
 905
 906    return vtd_ce_get_agaw(ce);
 907}
 908
 909static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
 910{
 911    return ce->lo & VTD_CONTEXT_ENTRY_TT;
 912}
 913
 914/* Only for Legacy Mode. Return true if check passed, otherwise false */
 915static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
 916                                     VTDContextEntry *ce)
 917{
 918    switch (vtd_ce_get_type(ce)) {
 919    case VTD_CONTEXT_TT_MULTI_LEVEL:
 920        /* Always supported */
 921        break;
 922    case VTD_CONTEXT_TT_DEV_IOTLB:
 923        if (!x86_iommu->dt_supported) {
 924            error_report_once("%s: DT specified but not supported", __func__);
 925            return false;
 926        }
 927        break;
 928    case VTD_CONTEXT_TT_PASS_THROUGH:
 929        if (!x86_iommu->pt_supported) {
 930            error_report_once("%s: PT specified but not supported", __func__);
 931            return false;
 932        }
 933        break;
 934    default:
 935        /* Unknown type */
 936        error_report_once("%s: unknown ce type: %"PRIu32, __func__,
 937                          vtd_ce_get_type(ce));
 938        return false;
 939    }
 940    return true;
 941}
 942
 943static inline uint64_t vtd_iova_limit(IntelIOMMUState *s,
 944                                      VTDContextEntry *ce, uint8_t aw)
 945{
 946    uint32_t ce_agaw = vtd_get_iova_agaw(s, ce);
 947    return 1ULL << MIN(ce_agaw, aw);
 948}
 949
 950/* Return true if IOVA passes range check, otherwise false. */
 951static inline bool vtd_iova_range_check(IntelIOMMUState *s,
 952                                        uint64_t iova, VTDContextEntry *ce,
 953                                        uint8_t aw)
 954{
 955    /*
 956     * Check if @iova is above 2^X-1, where X is the minimum of MGAW
 957     * in CAP_REG and AW in context-entry.
 958     */
 959    return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1));
 960}
 961
 962static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
 963                                          VTDContextEntry *ce)
 964{
 965    VTDPASIDEntry pe;
 966
 967    if (s->root_scalable) {
 968        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
 969        return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
 970    }
 971
 972    return vtd_ce_get_slpt_base(ce);
 973}
 974
 975/*
 976 * Rsvd field masks for spte:
 977 *     vtd_spte_rsvd        4k pages
 978 *     vtd_spte_rsvd_large  large pages
 979 */
 980static uint64_t vtd_spte_rsvd[5];
 981static uint64_t vtd_spte_rsvd_large[5];
 982
 983static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
 984{
 985    uint64_t rsvd_mask = vtd_spte_rsvd[level];
 986
 987    if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) &&
 988        (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) {
 989        /* large page */
 990        rsvd_mask = vtd_spte_rsvd_large[level];
 991    }
 992
 993    return slpte & rsvd_mask;
 994}
 995
 996/* Find the VTDBus structure associated with a given bus number */
 997static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
 998{
 999    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
1000    GHashTableIter iter;
1001
1002    if (vtd_bus) {
1003        return vtd_bus;
1004    }
1005
1006    /*
1007     * Iterate over the registered buses to find the one which
1008     * currently holds this bus number and update the bus_num
1009     * lookup table.
1010     */
1011    g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
1012    while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
1013        if (pci_bus_num(vtd_bus->bus) == bus_num) {
1014            s->vtd_as_by_bus_num[bus_num] = vtd_bus;
1015            return vtd_bus;
1016        }
1017    }
1018
1019    return NULL;
1020}
1021
1022/* Given the @iova, get the relevant @slptep. @slpte_level will be the last
1023 * level of the translation and can be used to decide the size of a large page.
1024 */
1025static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
1026                             uint64_t iova, bool is_write,
1027                             uint64_t *slptep, uint32_t *slpte_level,
1028                             bool *reads, bool *writes, uint8_t aw_bits)
1029{
1030    dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
1031    uint32_t level = vtd_get_iova_level(s, ce);
1032    uint32_t offset;
1033    uint64_t slpte;
1034    uint64_t access_right_check;
1035    uint64_t xlat, size;
1036
1037    if (!vtd_iova_range_check(s, iova, ce, aw_bits)) {
1038        error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")",
1039                          __func__, iova);
1040        return -VTD_FR_ADDR_BEYOND_MGAW;
1041    }
1042
1043    /* FIXME: what is the Atomics request here? */
1044    access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
1045
1046    while (true) {
1047        offset = vtd_iova_level_offset(iova, level);
1048        slpte = vtd_get_slpte(addr, offset);
1049
1050        if (slpte == (uint64_t)-1) {
1051            error_report_once("%s: detected read error on DMAR slpte "
1052                              "(iova=0x%" PRIx64 ")", __func__, iova);
1053            if (level == vtd_get_iova_level(s, ce)) {
1054                /* Invalid programming of context-entry */
1055                return -VTD_FR_CONTEXT_ENTRY_INV;
1056            } else {
1057                return -VTD_FR_PAGING_ENTRY_INV;
1058            }
1059        }
1060        *reads = (*reads) && (slpte & VTD_SL_R);
1061        *writes = (*writes) && (slpte & VTD_SL_W);
1062        if (!(slpte & access_right_check)) {
1063            error_report_once("%s: detected slpte permission error "
1064                              "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
1065                              "slpte=0x%" PRIx64 ", write=%d)", __func__,
1066                              iova, level, slpte, is_write);
1067            return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
1068        }
1069        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
1070            error_report_once("%s: detected non-zero reserved bits in slpte "
1071                              "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
1072                              "slpte=0x%" PRIx64 ")", __func__, iova,
1073                              level, slpte);
1074            return -VTD_FR_PAGING_ENTRY_RSVD;
1075        }
1076
1077        if (vtd_is_last_slpte(slpte, level)) {
1078            *slptep = slpte;
1079            *slpte_level = level;
1080            break;
1081        }
1082        addr = vtd_get_slpte_addr(slpte, aw_bits);
1083        level--;
1084    }
1085
1086    xlat = vtd_get_slpte_addr(*slptep, aw_bits);
1087    size = ~vtd_slpt_level_page_mask(level) + 1;
1088
1089    /*
1090     * From VT-d spec 3.14: Untranslated requests and translation
1091     * requests that result in an address in the interrupt range will be
1092     * blocked with condition code LGN.4 or SGN.8.
1093     */
1094    if ((xlat > VTD_INTERRUPT_ADDR_LAST ||
1095         xlat + size - 1 < VTD_INTERRUPT_ADDR_FIRST)) {
1096        return 0;
1097    } else {
1098        error_report_once("%s: xlat address is in interrupt range "
1099                          "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
1100                          "slpte=0x%" PRIx64 ", write=%d, "
1101                          "xlat=0x%" PRIx64 ", size=0x%" PRIx64 ")",
1102                          __func__, iova, level, slpte, is_write,
1103                          xlat, size);
1104        return s->scalable_mode ? -VTD_FR_SM_INTERRUPT_ADDR :
1105                                  -VTD_FR_INTERRUPT_ADDR;
1106    }
1107}
1108
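/*
 * Illustrative example (not part of the original file): with a 4-level
 * table and a 2MiB mapping covering iova 0x40000000, the walk above reads
 * the level-4 and level-3 entries, then finds VTD_SL_PT_PAGE_SIZE_MASK
 * set in the level-2 entry and stops there, returning *slpte_level == 2
 * so that the caller can derive the 2MiB page mask via
 * vtd_slpt_level_page_mask(2).
 */
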
1109typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private);
1110
1111/**
1112 * Constant information used during page walking
1113 *
1114 * @hook_fn: hook function to be called for each page detected during the walk
1115 * @private: private data to be passed into hook func
1116 * @notify_unmap: whether we should notify invalid entries
1117 * @as: VT-d address space of the device
1118 * @aw: maximum address width
1119 * @domain: domain ID of the page walk
1120 */
1121typedef struct {
1122    VTDAddressSpace *as;
1123    vtd_page_walk_hook hook_fn;
1124    void *private;
1125    bool notify_unmap;
1126    uint8_t aw;
1127    uint16_t domain_id;
1128} vtd_page_walk_info;
1129
1130static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info)
1131{
1132    VTDAddressSpace *as = info->as;
1133    vtd_page_walk_hook hook_fn = info->hook_fn;
1134    void *private = info->private;
1135    IOMMUTLBEntry *entry = &event->entry;
1136    DMAMap target = {
1137        .iova = entry->iova,
1138        .size = entry->addr_mask,
1139        .translated_addr = entry->translated_addr,
1140        .perm = entry->perm,
1141    };
1142    const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
1143
1144    if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) {
1145        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
1146        return 0;
1147    }
1148
1149    assert(hook_fn);
1150
1151    /* Update local IOVA mapped ranges */
1152    if (event->type == IOMMU_NOTIFIER_MAP) {
1153        if (mapped) {
1154            /* If it's exactly the same translation, skip */
1155            if (!memcmp(mapped, &target, sizeof(target))) {
1156                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
1157                                                 entry->translated_addr);
1158                return 0;
1159            } else {
1160                /*
1161                 * Translation changed.  Normally this should not
1162                 * happen, but it can happen with buggy guest
1163                 * OSes.  Note that there will be a small window during
1164                 * which we have no mapping at all.  But that's the best
1165                 * effort we can do.  The ideal way to emulate this is
1166                 * atomically modify the PTE to follow what has
1167                 * changed, but we can't.  One example is that vfio
1168                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
1169                 * interface to modify a mapping (meanwhile it seems
1170                 * meaningless to even provide one).  Anyway, let's
1171                 * mark this as a TODO in case one day we'll have
1172                 * a better solution.
1173                 */
1174                IOMMUAccessFlags cache_perm = entry->perm;
1175                int ret;
1176
1177                /* Emulate an UNMAP */
1178                event->type = IOMMU_NOTIFIER_UNMAP;
1179                entry->perm = IOMMU_NONE;
1180                trace_vtd_page_walk_one(info->domain_id,
1181                                        entry->iova,
1182                                        entry->translated_addr,
1183                                        entry->addr_mask,
1184                                        entry->perm);
1185                ret = hook_fn(event, private);
1186                if (ret) {
1187                    return ret;
1188                }
1189                /* Drop any existing mapping */
1190                iova_tree_remove(as->iova_tree, &target);
1191                /* Recover the correct type */
1192                event->type = IOMMU_NOTIFIER_MAP;
1193                entry->perm = cache_perm;
1194            }
1195        }
1196        iova_tree_insert(as->iova_tree, &target);
1197    } else {
1198        if (!mapped) {
1199            /* Skip since we didn't map this range at all */
1200            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
1201            return 0;
1202        }
1203        iova_tree_remove(as->iova_tree, &target);
1204    }
1205
1206    trace_vtd_page_walk_one(info->domain_id, entry->iova,
1207                            entry->translated_addr, entry->addr_mask,
1208                            entry->perm);
1209    return hook_fn(event, private);
1210}
1211
1212/**
1213 * vtd_page_walk_level - walk over specific level for IOVA range
1214 *
1215 * @addr: base GPA addr to start the walk
1216 * @start: IOVA range start address
1217 * @end: IOVA range end address (start <= addr < end)
1218 * @read: whether parent level has read permission
1219 * @write: whether parent level has write permission
1220 * @info: constant information for the page walk
1221 */
1222static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
1223                               uint64_t end, uint32_t level, bool read,
1224                               bool write, vtd_page_walk_info *info)
1225{
1226    bool read_cur, write_cur, entry_valid;
1227    uint32_t offset;
1228    uint64_t slpte;
1229    uint64_t subpage_size, subpage_mask;
1230    IOMMUTLBEvent event;
1231    uint64_t iova = start;
1232    uint64_t iova_next;
1233    int ret = 0;
1234
1235    trace_vtd_page_walk_level(addr, level, start, end);
1236
1237    subpage_size = 1ULL << vtd_slpt_level_shift(level);
1238    subpage_mask = vtd_slpt_level_page_mask(level);
1239
1240    while (iova < end) {
1241        iova_next = (iova & subpage_mask) + subpage_size;
1242
1243        offset = vtd_iova_level_offset(iova, level);
1244        slpte = vtd_get_slpte(addr, offset);
1245
1246        if (slpte == (uint64_t)-1) {
1247            trace_vtd_page_walk_skip_read(iova, iova_next);
1248            goto next;
1249        }
1250
1251        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
1252            trace_vtd_page_walk_skip_reserve(iova, iova_next);
1253            goto next;
1254        }
1255
1256        /* Permissions are stacked with parents' */
1257        read_cur = read && (slpte & VTD_SL_R);
1258        write_cur = write && (slpte & VTD_SL_W);
1259
1260        /*
1261         * As long as we have either read/write permission, this is a
1262         * valid entry. The rule works for both page entries and page
1263         * table entries.
1264         */
1265        entry_valid = read_cur | write_cur;
1266
1267        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
1268            /*
1269             * This is a valid PDE (or even bigger than PDE).  We need
1270             * to walk one further level.
1271             */
1272            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
1273                                      iova, MIN(iova_next, end), level - 1,
1274                                      read_cur, write_cur, info);
1275        } else {
1276            /*
1277             * This means we are either:
1278             *
1279             * (1) the real page entry (either 4K page, or huge page)
1280             * (2) the whole range is invalid
1281             *
1282             * In either case, we send an IOTLB notification down.
1283             */
1284            event.entry.target_as = &address_space_memory;
1285            event.entry.iova = iova & subpage_mask;
1286            event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
1287            event.entry.addr_mask = ~subpage_mask;
1288            /* NOTE: this is only meaningful if entry_valid == true */
1289            event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
1290            event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP :
1291                                            IOMMU_NOTIFIER_UNMAP;
1292            ret = vtd_page_walk_one(&event, info);
1293        }
1294
1295        if (ret < 0) {
1296            return ret;
1297        }
1298
1299next:
1300        iova = iova_next;
1301    }
1302
1303    return 0;
1304}
1305
1306/**
1307 * vtd_page_walk - walk specific IOVA range, and call the hook
1308 *
1309 * @s: intel iommu state
1310 * @ce: context entry to walk upon
1311 * @start: IOVA address to start the walk
1312 * @end: IOVA range end address (start <= addr < end)
1313 * @info: page walking information struct
1314 */
1315static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce,
1316                         uint64_t start, uint64_t end,
1317                         vtd_page_walk_info *info)
1318{
1319    dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
1320    uint32_t level = vtd_get_iova_level(s, ce);
1321
1322    if (!vtd_iova_range_check(s, start, ce, info->aw)) {
1323        return -VTD_FR_ADDR_BEYOND_MGAW;
1324    }
1325
1326    if (!vtd_iova_range_check(s, end, ce, info->aw)) {
1327        /* Fix end so that it reaches the maximum */
1328        end = vtd_iova_limit(s, ce, info->aw);
1329    }
1330
1331    return vtd_page_walk_level(addr, start, end, level, true, true, info);
1332}
1333
1334static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s,
1335                                          VTDRootEntry *re)
1336{
1337    /* Legacy Mode reserved bits check */
1338    if (!s->root_scalable &&
1339        (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
1340        goto rsvd_err;
1341
1342    /* Scalable Mode reserved bits check */
1343    if (s->root_scalable &&
1344        ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) ||
1345         (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
1346        goto rsvd_err;
1347
1348    return 0;
1349
1350rsvd_err:
1351    error_report_once("%s: invalid root entry: hi=0x%"PRIx64
1352                      ", lo=0x%"PRIx64,
1353                      __func__, re->hi, re->lo);
1354    return -VTD_FR_ROOT_ENTRY_RSVD;
1355}
1356
1357static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
1358                                                    VTDContextEntry *ce)
1359{
1360    if (!s->root_scalable &&
1361        (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI ||
1362         ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
1363        error_report_once("%s: invalid context entry: hi=%"PRIx64
1364                          ", lo=%"PRIx64" (reserved nonzero)",
1365                          __func__, ce->hi, ce->lo);
1366        return -VTD_FR_CONTEXT_ENTRY_RSVD;
1367    }
1368
1369    if (s->root_scalable &&
1370        (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) ||
1371         ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 ||
1372         ce->val[2] ||
1373         ce->val[3])) {
1374        error_report_once("%s: invalid context entry: val[3]=%"PRIx64
1375                          ", val[2]=%"PRIx64
1376                          ", val[1]=%"PRIx64
1377                          ", val[0]=%"PRIx64" (reserved nonzero)",
1378                          __func__, ce->val[3], ce->val[2],
1379                          ce->val[1], ce->val[0]);
1380        return -VTD_FR_CONTEXT_ENTRY_RSVD;
1381    }
1382
1383    return 0;
1384}
1385
1386static int vtd_ce_rid2pasid_check(IntelIOMMUState *s,
1387                                  VTDContextEntry *ce)
1388{
1389    VTDPASIDEntry pe;
1390
1391    /*
1392     * Make sure in Scalable Mode, a present context entry
1393     * has valid rid2pasid setting, which includes valid
1394     * rid2pasid field and corresponding pasid entry setting
1395     */
1396    return vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1397}
1398
1399/* Map a device to its corresponding domain (context-entry) */
1400static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
1401                                    uint8_t devfn, VTDContextEntry *ce)
1402{
1403    VTDRootEntry re;
1404    int ret_fr;
1405    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
1406
1407    ret_fr = vtd_get_root_entry(s, bus_num, &re);
1408    if (ret_fr) {
1409        return ret_fr;
1410    }
1411
1412    if (!vtd_root_entry_present(s, &re, devfn)) {
1413        /* Not an error - it's okay if we don't have a root entry. */
1414        trace_vtd_re_not_present(bus_num);
1415        return -VTD_FR_ROOT_ENTRY_P;
1416    }
1417
1418    ret_fr = vtd_root_entry_rsvd_bits_check(s, &re);
1419    if (ret_fr) {
1420        return ret_fr;
1421    }
1422
1423    ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce);
1424    if (ret_fr) {
1425        return ret_fr;
1426    }
1427
1428    if (!vtd_ce_present(ce)) {
1429        /* Not an error - it's okay if we don't have a context entry. */
1430        trace_vtd_ce_not_present(bus_num, devfn);
1431        return -VTD_FR_CONTEXT_ENTRY_P;
1432    }
1433
1434    ret_fr = vtd_context_entry_rsvd_bits_check(s, ce);
1435    if (ret_fr) {
1436        return ret_fr;
1437    }
1438
1439    /* Check if the programming of context-entry is valid */
1440    if (!s->root_scalable &&
1441        !vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
1442        error_report_once("%s: invalid context entry: hi=%"PRIx64
1443                          ", lo=%"PRIx64" (level %d not supported)",
1444                          __func__, ce->hi, ce->lo,
1445                          vtd_ce_get_level(ce));
1446        return -VTD_FR_CONTEXT_ENTRY_INV;
1447    }
1448
1449    if (!s->root_scalable) {
1450        /* Do translation type check */
1451        if (!vtd_ce_type_check(x86_iommu, ce)) {
1452            /* Errors dumped in vtd_ce_type_check() */
1453            return -VTD_FR_CONTEXT_ENTRY_INV;
1454        }
1455    } else {
1456        /*
1457         * Check if the programming of context-entry.rid2pasid
1458         * and the corresponding pasid setting is valid; doing this
1459         * here avoids having to check the pasid entry fetch result
1460         * in future helper function calls.
1461         */
1462        ret_fr = vtd_ce_rid2pasid_check(s, ce);
1463        if (ret_fr) {
1464            return ret_fr;
1465        }
1466    }
1467
1468    return 0;
1469}
1470
1471static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event,
1472                                     void *private)
1473{
1474    memory_region_notify_iommu(private, 0, *event);
1475    return 0;
1476}
1477
1478static uint16_t vtd_get_domain_id(IntelIOMMUState *s,
1479                                  VTDContextEntry *ce)
1480{
1481    VTDPASIDEntry pe;
1482
1483    if (s->root_scalable) {
1484        vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1485        return VTD_SM_PASID_ENTRY_DID(pe.val[1]);
1486    }
1487
1488    return VTD_CONTEXT_ENTRY_DID(ce->hi);
1489}
1490
1491static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
1492                                            VTDContextEntry *ce,
1493                                            hwaddr addr, hwaddr size)
1494{
1495    IntelIOMMUState *s = vtd_as->iommu_state;
1496    vtd_page_walk_info info = {
1497        .hook_fn = vtd_sync_shadow_page_hook,
1498        .private = (void *)&vtd_as->iommu,
1499        .notify_unmap = true,
1500        .aw = s->aw_bits,
1501        .as = vtd_as,
1502        .domain_id = vtd_get_domain_id(s, ce),
1503    };
1504
1505    return vtd_page_walk(s, ce, addr, addr + size, &info);
1506}
1507
1508static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
1509{
1510    int ret;
1511    VTDContextEntry ce;
1512    IOMMUNotifier *n;
1513
1514    if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) {
1515        return 0;
1516    }
1517
1518    ret = vtd_dev_to_context_entry(vtd_as->iommu_state,
1519                                   pci_bus_num(vtd_as->bus),
1520                                   vtd_as->devfn, &ce);
1521    if (ret) {
1522        if (ret == -VTD_FR_CONTEXT_ENTRY_P) {
1523            /*
1524             * It's a valid scenario to have a context entry that is
1525             * not present.  For example, when a device is removed
1526             * from an existing domain, the context entry will be
1527             * zeroed by the guest before the device is put into another
1528             * domain.  When this happens, instead of synchronizing
1529             * the shadow pages we should invalidate all existing
1530             * mappings and notify the backends.
1531             */
1532            IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
1533                vtd_address_space_unmap(vtd_as, n);
1534            }
1535            ret = 0;
1536        }
1537        return ret;
1538    }
1539
1540    return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX);
1541}
1542
1543/*
1544 * Check if a specific device is configured to bypass address
1545 * translation for DMA requests.  In Scalable Mode, whether the
1546 * 1st-level or the 2nd-level translation is bypassed depends
1547 * on the PGTT setting.
1548 */
1549static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce)
1550{
1551    VTDPASIDEntry pe;
1552    int ret;
1553
1554    if (s->root_scalable) {
1555        ret = vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1556        if (ret) {
1557            error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32,
1558                              __func__, ret);
1559            return false;
1560        }
1561        return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
1562    }
1563
1564    return (vtd_ce_get_type(ce) == VTD_CONTEXT_TT_PASS_THROUGH);
1565
1566}
1567
1568static bool vtd_as_pt_enabled(VTDAddressSpace *as)
1569{
1570    IntelIOMMUState *s;
1571    VTDContextEntry ce;
1572    int ret;
1573
1574    assert(as);
1575
1576    s = as->iommu_state;
1577    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
1578                                   as->devfn, &ce);
1579    if (ret) {
1580        /*
1581         * Possibly failed to parse the context entry for some reason
1582         * (e.g., during init, or any guest configuration errors on
1583         * context entries). We should assume PT not enabled for
1584         * safety.
1585         */
1586        return false;
1587    }
1588
1589    return vtd_dev_pt_enabled(s, &ce);
1590}
1591
1592/* Return whether the device is using IOMMU translation. */
1593static bool vtd_switch_address_space(VTDAddressSpace *as)
1594{
1595    bool use_iommu;
1596    /* Whether we need to take the BQL on our own */
1597    bool take_bql = !qemu_mutex_iothread_locked();
1598
1599    assert(as);
1600
1601    use_iommu = as->iommu_state->dmar_enabled && !vtd_as_pt_enabled(as);
1602
1603    trace_vtd_switch_address_space(pci_bus_num(as->bus),
1604                                   VTD_PCI_SLOT(as->devfn),
1605                                   VTD_PCI_FUNC(as->devfn),
1606                                   use_iommu);
1607
1608    /*
1609     * It's possible that we reach here without BQL, e.g., when called
1610     * from vtd_pt_enable_fast_path(). However, the memory APIs need
1611     * it, so make sure we already hold it; otherwise, take it here.
1612     */
1613    if (take_bql) {
1614        qemu_mutex_lock_iothread();
1615    }
1616
1617    /* Turn the old region off first, then turn the other one on */
1618    if (use_iommu) {
1619        memory_region_set_enabled(&as->nodmar, false);
1620        memory_region_set_enabled(MEMORY_REGION(&as->iommu), true);
1621    } else {
1622        memory_region_set_enabled(MEMORY_REGION(&as->iommu), false);
1623        memory_region_set_enabled(&as->nodmar, true);
1624    }
1625
1626    if (take_bql) {
1627        qemu_mutex_unlock_iothread();
1628    }
1629
1630    return use_iommu;
1631}
1632
1633static void vtd_switch_address_space_all(IntelIOMMUState *s)
1634{
1635    GHashTableIter iter;
1636    VTDBus *vtd_bus;
1637    int i;
1638
1639    g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
1640    while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
1641        for (i = 0; i < PCI_DEVFN_MAX; i++) {
1642            if (!vtd_bus->dev_as[i]) {
1643                continue;
1644            }
1645            vtd_switch_address_space(vtd_bus->dev_as[i]);
1646        }
1647    }
1648}
1649
1650static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
1651{
1652    return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
1653}
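/*
 * Worked example (illustrative, assuming the usual PCI devfn encoding
 * devfn = slot << 3 | function): bus 0x02, slot 3, function 0 gives
 * devfn = 0x18, so vtd_make_source_id(0x02, 0x18) = (0x02 << 8) | 0x18
 * = 0x0218 - bits 15:8 hold the bus number, bits 7:0 hold the devfn.
 */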
1654
1655static const bool vtd_qualified_faults[] = {
1656    [VTD_FR_RESERVED] = false,
1657    [VTD_FR_ROOT_ENTRY_P] = false,
1658    [VTD_FR_CONTEXT_ENTRY_P] = true,
1659    [VTD_FR_CONTEXT_ENTRY_INV] = true,
1660    [VTD_FR_ADDR_BEYOND_MGAW] = true,
1661    [VTD_FR_WRITE] = true,
1662    [VTD_FR_READ] = true,
1663    [VTD_FR_PAGING_ENTRY_INV] = true,
1664    [VTD_FR_ROOT_TABLE_INV] = false,
1665    [VTD_FR_CONTEXT_TABLE_INV] = false,
1666    [VTD_FR_INTERRUPT_ADDR] = true,
1667    [VTD_FR_ROOT_ENTRY_RSVD] = false,
1668    [VTD_FR_PAGING_ENTRY_RSVD] = true,
1669    [VTD_FR_CONTEXT_ENTRY_TT] = true,
1670    [VTD_FR_PASID_TABLE_INV] = false,
1671    [VTD_FR_SM_INTERRUPT_ADDR] = true,
1672    [VTD_FR_MAX] = false,
1673};
1674
1675/* Check whether a fault condition is "qualified", i.e., reported to software
1676 * only if the FPD field in the context-entry used to process the faulting
1677 * request is 0.
1678 */
1679static inline bool vtd_is_qualified_fault(VTDFaultReason fault)
1680{
1681    return vtd_qualified_faults[fault];
1682}
1683
1684static inline bool vtd_is_interrupt_addr(hwaddr addr)
1685{
1686    return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
1687}
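/*
 * Note: VTD_INTERRUPT_ADDR_FIRST/LAST are expected to cover the x86 MSI
 * window (0xFEE00000 - 0xFEEFFFFF, per intel_iommu_internal.h), so the
 * check above is effectively "is this an 0xFEExxxxx interrupt write".
 */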
1688
1689static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
1690{
1691    VTDBus *vtd_bus;
1692    VTDAddressSpace *vtd_as;
1693    bool success = false;
1694
1695    vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
1696    if (!vtd_bus) {
1697        goto out;
1698    }
1699
1700    vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
1701    if (!vtd_as) {
1702        goto out;
1703    }
1704
1705    if (vtd_switch_address_space(vtd_as) == false) {
1706        /* We switched off IOMMU region successfully. */
1707        success = true;
1708    }
1709
1710out:
1711    trace_vtd_pt_enable_fast_path(source_id, success);
1712}
1713
1714/* Map dev to context-entry then do a paging-structures walk to do an IOMMU
1715 * translation.
1716 *
1717 * Called from RCU critical section.
1718 *
1719 * @bus_num: The bus number
1720 * @devfn: The devfn, which is the combined device and function number
1721 * @is_write: The access is a write operation
1722 * @entry: IOMMUTLBEntry that contains the addr to be translated and the result
1723 *
1724 * Returns true if translation is successful, otherwise false.
1725 */
1726static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
1727                                   uint8_t devfn, hwaddr addr, bool is_write,
1728                                   IOMMUTLBEntry *entry)
1729{
1730    IntelIOMMUState *s = vtd_as->iommu_state;
1731    VTDContextEntry ce;
1732    uint8_t bus_num = pci_bus_num(bus);
1733    VTDContextCacheEntry *cc_entry;
1734    uint64_t slpte, page_mask;
1735    uint32_t level;
1736    uint16_t source_id = vtd_make_source_id(bus_num, devfn);
1737    int ret_fr;
1738    bool is_fpd_set = false;
1739    bool reads = true;
1740    bool writes = true;
1741    uint8_t access_flags;
1742    VTDIOTLBEntry *iotlb_entry;
1743
1744    /*
1745     * We have a standalone memory region for interrupt addresses, so we
1746     * should never receive translation requests in this region.
1747     */
1748    assert(!vtd_is_interrupt_addr(addr));
1749
1750    vtd_iommu_lock(s);
1751
1752    cc_entry = &vtd_as->context_cache_entry;
1753
1754    /* Try to fetch the slpte from the IOTLB */
1755    iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
1756    if (iotlb_entry) {
1757        trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
1758                                 iotlb_entry->domain_id);
1759        slpte = iotlb_entry->slpte;
1760        access_flags = iotlb_entry->access_flags;
1761        page_mask = iotlb_entry->mask;
1762        goto out;
1763    }
1764
1765    /* Try to fetch context-entry from cache first */
1766    if (cc_entry->context_cache_gen == s->context_cache_gen) {
1767        trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi,
1768                               cc_entry->context_entry.lo,
1769                               cc_entry->context_cache_gen);
1770        ce = cc_entry->context_entry;
1771        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1772        if (!is_fpd_set && s->root_scalable) {
1773            ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
1774            VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1775        }
1776    } else {
1777        ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
1778        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1779        if (!ret_fr && !is_fpd_set && s->root_scalable) {
1780            ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
1781        }
1782        VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1783        /* Update context-cache */
1784        trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
1785                                  cc_entry->context_cache_gen,
1786                                  s->context_cache_gen);
1787        cc_entry->context_entry = ce;
1788        cc_entry->context_cache_gen = s->context_cache_gen;
1789    }
1790
1791    /*
1792     * We don't need to translate for pass-through context entries.
1793     * Also, let's skip IOTLB caching for PT devices.
1794     */
1795    if (vtd_dev_pt_enabled(s, &ce)) {
1796        entry->iova = addr & VTD_PAGE_MASK_4K;
1797        entry->translated_addr = entry->iova;
1798        entry->addr_mask = ~VTD_PAGE_MASK_4K;
1799        entry->perm = IOMMU_RW;
1800        trace_vtd_translate_pt(source_id, entry->iova);
1801
1802        /*
1803         * When this happens, it means that caching-mode is not
1804         * enabled and this is the first passthrough translation for
1805         * the device. Let's enable the fast path for passthrough.
1806         *
1807         * When passthrough is disabled again for the device, we can
1808         * capture it via the context entry invalidation, then the
1809         * IOMMU region can be swapped back.
1810         */
1811        vtd_pt_enable_fast_path(s, source_id);
1812        vtd_iommu_unlock(s);
1813        return true;
1814    }
1815
1816    ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
1817                               &reads, &writes, s->aw_bits);
1818    VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1819
1820    page_mask = vtd_slpt_level_page_mask(level);
1821    access_flags = IOMMU_ACCESS_FLAG(reads, writes);
1822    vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte,
1823                     access_flags, level);
1824out:
1825    vtd_iommu_unlock(s);
1826    entry->iova = addr & page_mask;
1827    entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
1828    entry->addr_mask = ~page_mask;
1829    entry->perm = access_flags;
1830    return true;
1831
1832error:
1833    vtd_iommu_unlock(s);
1834    entry->iova = 0;
1835    entry->translated_addr = 0;
1836    entry->addr_mask = 0;
1837    entry->perm = IOMMU_NONE;
1838    return false;
1839}
1840
1841static void vtd_root_table_setup(IntelIOMMUState *s)
1842{
1843    s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
1844    s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
1845
1846    vtd_update_scalable_state(s);
1847
1848    trace_vtd_reg_dmar_root(s->root, s->root_scalable);
1849}
1850
1851static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
1852                               uint32_t index, uint32_t mask)
1853{
1854    x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
1855}
1856
1857static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
1858{
1859    uint64_t value = 0;
1860    value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
1861    s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
1862    s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
1863    s->intr_eime = value & VTD_IRTA_EIME;
1864
1865    /* Notify global invalidation */
1866    vtd_iec_notify_all(s, true, 0, 0);
1867
1868    trace_vtd_reg_ir_root(s->intr_root, s->intr_size);
1869}
1870
1871static void vtd_iommu_replay_all(IntelIOMMUState *s)
1872{
1873    VTDAddressSpace *vtd_as;
1874
1875    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1876        vtd_sync_shadow_page_table(vtd_as);
1877    }
1878}
1879
1880static void vtd_context_global_invalidate(IntelIOMMUState *s)
1881{
1882    trace_vtd_inv_desc_cc_global();
1883    /* Protects context cache */
1884    vtd_iommu_lock(s);
1885    s->context_cache_gen++;
1886    if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
1887        vtd_reset_context_cache_locked(s);
1888    }
1889    vtd_iommu_unlock(s);
1890    vtd_address_space_refresh_all(s);
1891    /*
1892     * From VT-d spec 6.5.2.1, a global context entry invalidation
1893     * should be followed by an IOTLB global invalidation, so we should
1894     * be safe even without this. However, let's replay the region as
1895     * well to be safer, and revisit this when we need finer tuning of
1896     * the VT-d emulation code.
1897     */
1898    vtd_iommu_replay_all(s);
1899}
1900
1901/* Do a context-cache device-selective invalidation.
1902 * @func_mask: FM field after shifting
1903 */
1904static void vtd_context_device_invalidate(IntelIOMMUState *s,
1905                                          uint16_t source_id,
1906                                          uint16_t func_mask)
1907{
1908    uint16_t mask;
1909    VTDBus *vtd_bus;
1910    VTDAddressSpace *vtd_as;
1911    uint8_t bus_n, devfn;
1912    uint16_t devfn_it;
1913
1914    trace_vtd_inv_desc_cc_devices(source_id, func_mask);
1915
1916    switch (func_mask & 3) {
1917    case 0:
1918        mask = 0;   /* No bits in the SID field masked */
1919        break;
1920    case 1:
1921        mask = 4;   /* Mask bit 2 in the SID field */
1922        break;
1923    case 2:
1924        mask = 6;   /* Mask bits 2:1 in the SID field */
1925        break;
1926    case 3:
1927        mask = 7;   /* Mask bits 2:0 in the SID field */
1928        break;
1929    default:
1930        g_assert_not_reached();
1931    }
1932    mask = ~mask;
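    /*
     * Worked example (illustrative): with func_mask & 3 == 2 we get
     * mask = 6, so after the inversion bits 2:1 of the devfn are
     * ignored in the comparison below.  For a source_id with
     * devfn = 0x08 (slot 1, function 0), devfn values 0x08, 0x0a,
     * 0x0c and 0x0e (functions 0, 2, 4 and 6 of slot 1) all match
     * and have their context cache entries invalidated.
     */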
1933
1934    bus_n = VTD_SID_TO_BUS(source_id);
1935    vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
1936    if (vtd_bus) {
1937        devfn = VTD_SID_TO_DEVFN(source_id);
1938        for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
1939            vtd_as = vtd_bus->dev_as[devfn_it];
1940            if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
1941                trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
1942                                             VTD_PCI_FUNC(devfn_it));
1943                vtd_iommu_lock(s);
1944                vtd_as->context_cache_entry.context_cache_gen = 0;
1945                vtd_iommu_unlock(s);
1946                /*
1947                 * Switch the address space when needed, in case the
1948                 * device passthrough bit has been toggled.
1949                 */
1950                vtd_switch_address_space(vtd_as);
1951                /*
1952                 * A device is moving out of (or into) a
1953                 * domain, so resync the shadow page table.
1954                 * This is harmless even if we have no such
1955                 * notifier registered - the IOMMU notification
1956                 * framework will skip MAP notifications in
1957                 * that case.
1958                 */
1959                vtd_sync_shadow_page_table(vtd_as);
1960            }
1961        }
1962    }
1963}
1964
1965/* Context-cache invalidation
1966 * Returns the Context Actual Invalidation Granularity.
1967 * @val: the content of the CCMD_REG
1968 */
1969static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val)
1970{
1971    uint64_t caig;
1972    uint64_t type = val & VTD_CCMD_CIRG_MASK;
1973
1974    switch (type) {
1975    case VTD_CCMD_DOMAIN_INVL:
1976        /* Fall through */
1977    case VTD_CCMD_GLOBAL_INVL:
1978        caig = VTD_CCMD_GLOBAL_INVL_A;
1979        vtd_context_global_invalidate(s);
1980        break;
1981
1982    case VTD_CCMD_DEVICE_INVL:
1983        caig = VTD_CCMD_DEVICE_INVL_A;
1984        vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val));
1985        break;
1986
1987    default:
1988        error_report_once("%s: invalid context: 0x%" PRIx64,
1989                          __func__, val);
1990        caig = 0;
1991    }
1992    return caig;
1993}
1994
1995static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
1996{
1997    trace_vtd_inv_desc_iotlb_global();
1998    vtd_reset_iotlb(s);
1999    vtd_iommu_replay_all(s);
2000}
2001
2002static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
2003{
2004    VTDContextEntry ce;
2005    VTDAddressSpace *vtd_as;
2006
2007    trace_vtd_inv_desc_iotlb_domain(domain_id);
2008
2009    vtd_iommu_lock(s);
2010    g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
2011                                &domain_id);
2012    vtd_iommu_unlock(s);
2013
2014    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
2015        if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
2016                                      vtd_as->devfn, &ce) &&
2017            domain_id == vtd_get_domain_id(s, &ce)) {
2018            vtd_sync_shadow_page_table(vtd_as);
2019        }
2020    }
2021}
2022
2023static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
2024                                           uint16_t domain_id, hwaddr addr,
2025                                           uint8_t am)
2026{
2027    VTDAddressSpace *vtd_as;
2028    VTDContextEntry ce;
2029    int ret;
2030    hwaddr size = (1 << am) * VTD_PAGE_SIZE;
2031
2032    QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
2033        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
2034                                       vtd_as->devfn, &ce);
2035        if (!ret && domain_id == vtd_get_domain_id(s, &ce)) {
2036            if (vtd_as_has_map_notifier(vtd_as)) {
2037                /*
2038                 * As long as we have MAP notifications registered in
2039                 * any of our IOMMU notifiers, we need to sync the
2040                 * shadow page table.
2041                 */
2042                vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
2043            } else {
2044                /*
2045                 * For UNMAP-only notifiers, we don't need to walk the
2046                 * page tables.  We just deliver the PSI down to
2047                 * invalidate caches.
2048                 */
2049                IOMMUTLBEvent event = {
2050                    .type = IOMMU_NOTIFIER_UNMAP,
2051                    .entry = {
2052                        .target_as = &address_space_memory,
2053                        .iova = addr,
2054                        .translated_addr = 0,
2055                        .addr_mask = size - 1,
2056                        .perm = IOMMU_NONE,
2057                    },
2058                };
2059                memory_region_notify_iommu(&vtd_as->iommu, 0, event);
2060            }
2061        }
2062    }
2063}
2064
2065static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
2066                                      hwaddr addr, uint8_t am)
2067{
2068    VTDIOTLBPageInvInfo info;
2069
2070    trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am);
2071
2072    assert(am <= VTD_MAMV);
2073    info.domain_id = domain_id;
2074    info.addr = addr;
2075    info.mask = ~((1 << am) - 1);
2076    vtd_iommu_lock(s);
2077    g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
2078    vtd_iommu_unlock(s);
2079    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
2080}
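/*
 * Example of the address-mask (AM) encoding handled above: am = 2
 * invalidates 1 << 2 = 4 contiguous 4 KiB pages (16 KiB in total), so
 * info.mask = ~((1 << 2) - 1) = ~0x3 and the notification path computes
 * size = (1 << am) * VTD_PAGE_SIZE = 16 KiB.
 */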
2081
2082/* Flush IOTLB
2083 * Returns the IOTLB Actual Invalidation Granularity.
2084 * @val: the content of the IOTLB_REG
2085 */
2086static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val)
2087{
2088    uint64_t iaig;
2089    uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK;
2090    uint16_t domain_id;
2091    hwaddr addr;
2092    uint8_t am;
2093
2094    switch (type) {
2095    case VTD_TLB_GLOBAL_FLUSH:
2096        iaig = VTD_TLB_GLOBAL_FLUSH_A;
2097        vtd_iotlb_global_invalidate(s);
2098        break;
2099
2100    case VTD_TLB_DSI_FLUSH:
2101        domain_id = VTD_TLB_DID(val);
2102        iaig = VTD_TLB_DSI_FLUSH_A;
2103        vtd_iotlb_domain_invalidate(s, domain_id);
2104        break;
2105
2106    case VTD_TLB_PSI_FLUSH:
2107        domain_id = VTD_TLB_DID(val);
2108        addr = vtd_get_quad_raw(s, DMAR_IVA_REG);
2109        am = VTD_IVA_AM(addr);
2110        addr = VTD_IVA_ADDR(addr);
2111        if (am > VTD_MAMV) {
2112            error_report_once("%s: address mask overflow: 0x%" PRIx64,
2113                              __func__, vtd_get_quad_raw(s, DMAR_IVA_REG));
2114            iaig = 0;
2115            break;
2116        }
2117        iaig = VTD_TLB_PSI_FLUSH_A;
2118        vtd_iotlb_page_invalidate(s, domain_id, addr, am);
2119        break;
2120
2121    default:
2122        error_report_once("%s: invalid granularity: 0x%" PRIx64,
2123                          __func__, val);
2124        iaig = 0;
2125    }
2126    return iaig;
2127}
2128
2129static void vtd_fetch_inv_desc(IntelIOMMUState *s);
2130
2131static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s)
2132{
2133    return s->qi_enabled && (s->iq_tail == s->iq_head) &&
2134           (s->iq_last_desc_type == VTD_INV_DESC_WAIT);
2135}
2136
2137static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
2138{
2139    uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG);
2140
2141    trace_vtd_inv_qi_enable(en);
2142
2143    if (en) {
2144        s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
2145        /* 2^(x+8) entries */
2146        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
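        /*
         * Illustrative sizing: with QS = 0 the queue holds 1 << 8 = 256
         * 128-bit descriptors; in 256-bit descriptor mode (iq_dw) the
         * entry count is halved to 128, so the queue occupies
         * 2^(QS + 12) bytes of memory either way.
         */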
2147        s->qi_enabled = true;
2148        trace_vtd_inv_qi_setup(s->iq, s->iq_size);
2149        /* Ok - report back to driver */
2150        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES);
2151
2152        if (s->iq_tail != 0) {
2153            /*
2154             * This is a spec violation but Windows guests are known to set up
2155             * Queued Invalidation this way so we allow the write and process
2156             * Invalidation Descriptors right away.
2157             */
2158            trace_vtd_warn_invalid_qi_tail(s->iq_tail);
2159            if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2160                vtd_fetch_inv_desc(s);
2161            }
2162        }
2163    } else {
2164        if (vtd_queued_inv_disable_check(s)) {
2165            /* disable Queued Invalidation */
2166            vtd_set_quad_raw(s, DMAR_IQH_REG, 0);
2167            s->iq_head = 0;
2168            s->qi_enabled = false;
2169            /* Ok - report back to driver */
2170            vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0);
2171        } else {
2172            error_report_once("%s: detected improper state when disable QI "
2173                              "(head=0x%x, tail=0x%x, last_type=%d)",
2174                              __func__,
2175                              s->iq_head, s->iq_tail, s->iq_last_desc_type);
2176        }
2177    }
2178}
2179
2180/* Set Root Table Pointer */
2181static void vtd_handle_gcmd_srtp(IntelIOMMUState *s)
2182{
2183    vtd_root_table_setup(s);
2184    /* Ok - report back to driver */
2185    vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
2186    vtd_reset_caches(s);
2187    vtd_address_space_refresh_all(s);
2188}
2189
2190/* Set Interrupt Remap Table Pointer */
2191static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
2192{
2193    vtd_interrupt_remap_table_setup(s);
2194    /* Ok - report back to driver */
2195    vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
2196}
2197
2198/* Handle Translation Enable/Disable */
2199static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
2200{
2201    if (s->dmar_enabled == en) {
2202        return;
2203    }
2204
2205    trace_vtd_dmar_enable(en);
2206
2207    if (en) {
2208        s->dmar_enabled = true;
2209        /* Ok - report back to driver */
2210        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
2211    } else {
2212        s->dmar_enabled = false;
2213
2214        /* Clear the index of Fault Recording Register */
2215        s->next_frcd_reg = 0;
2216        /* Ok - report back to driver */
2217        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
2218    }
2219
2220    vtd_reset_caches(s);
2221    vtd_address_space_refresh_all(s);
2222}
2223
2224/* Handle Interrupt Remap Enable/Disable */
2225static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en)
2226{
2227    trace_vtd_ir_enable(en);
2228
2229    if (en) {
2230        s->intr_enabled = true;
2231        /* Ok - report back to driver */
2232        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES);
2233    } else {
2234        s->intr_enabled = false;
2235        /* Ok - report back to driver */
2236        vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0);
2237    }
2238}
2239
2240/* Handle write to Global Command Register */
2241static void vtd_handle_gcmd_write(IntelIOMMUState *s)
2242{
2243    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
2244    uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG);
2245    uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG);
2246    uint32_t changed = status ^ val;
2247
2248    trace_vtd_reg_write_gcmd(status, val);
2249    if ((changed & VTD_GCMD_TE) && s->dma_translation) {
2250        /* Translation enable/disable */
2251        vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
2252    }
2253    if (val & VTD_GCMD_SRTP) {
2254        /* Set/update the root-table pointer */
2255        vtd_handle_gcmd_srtp(s);
2256    }
2257    if (changed & VTD_GCMD_QIE) {
2258        /* Queued Invalidation Enable */
2259        vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE);
2260    }
2261    if (val & VTD_GCMD_SIRTP) {
2262        /* Set/update the interrupt remapping root-table pointer */
2263        vtd_handle_gcmd_sirtp(s);
2264    }
2265    if ((changed & VTD_GCMD_IRE) &&
2266        x86_iommu_ir_supported(x86_iommu)) {
2267        /* Interrupt remap enable/disable */
2268        vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE);
2269    }
2270}
2271
2272/* Handle write to Context Command Register */
2273static void vtd_handle_ccmd_write(IntelIOMMUState *s)
2274{
2275    uint64_t ret;
2276    uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG);
2277
2278    /* Context-cache invalidation request */
2279    if (val & VTD_CCMD_ICC) {
2280        if (s->qi_enabled) {
2281            error_report_once("Queued Invalidation enabled, "
2282                              "should not use register-based invalidation");
2283            return;
2284        }
2285        ret = vtd_context_cache_invalidate(s, val);
2286        /* Invalidation completed. Clear ICC and report the actual granularity */
2287        vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL);
2288        ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK,
2289                                      ret);
2290    }
2291}
2292
2293/* Handle write to IOTLB Invalidation Register */
2294static void vtd_handle_iotlb_write(IntelIOMMUState *s)
2295{
2296    uint64_t ret;
2297    uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG);
2298
2299    /* IOTLB invalidation request */
2300    if (val & VTD_TLB_IVT) {
2301        if (s->qi_enabled) {
2302            error_report_once("Queued Invalidation enabled, "
2303                              "should not use register-based invalidation");
2304            return;
2305        }
2306        ret = vtd_iotlb_flush(s, val);
2307        /* Invalidation completed. Clear IVT and report the actual granularity */
2308        vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL);
2309        ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG,
2310                                      VTD_TLB_FLUSH_GRANU_MASK_A, ret);
2311    }
2312}
2313
2314/* Fetch an Invalidation Descriptor from the Invalidation Queue */
2315static bool vtd_get_inv_desc(IntelIOMMUState *s,
2316                             VTDInvDesc *inv_desc)
2317{
2318    dma_addr_t base_addr = s->iq;
2319    uint32_t offset = s->iq_head;
2320    uint32_t dw = s->iq_dw ? 32 : 16;
2321    dma_addr_t addr = base_addr + offset * dw;
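    /*
     * e.g. with 128-bit descriptors (dw = 16) and iq_head = 3, the
     * descriptor is read from s->iq + 3 * 16 = s->iq + 0x30; in 256-bit
     * mode (s->iq_dw) each slot is 32 bytes instead.
     */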
2322
2323    if (dma_memory_read(&address_space_memory, addr,
2324                        inv_desc, dw, MEMTXATTRS_UNSPECIFIED)) {
2325        error_report_once("Read INV DESC failed.");
2326        return false;
2327    }
2328    inv_desc->lo = le64_to_cpu(inv_desc->lo);
2329    inv_desc->hi = le64_to_cpu(inv_desc->hi);
2330    if (dw == 32) {
2331        inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
2332        inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);
2333    }
2334    return true;
2335}
2336
2337static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2338{
2339    if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
2340        (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
2341        error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2342                          " (reserved nonzero)", __func__, inv_desc->hi,
2343                          inv_desc->lo);
2344        return false;
2345    }
2346    if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
2347        /* Status Write */
2348        uint32_t status_data = (uint32_t)(inv_desc->lo >>
2349                               VTD_INV_DESC_WAIT_DATA_SHIFT);
2350
2351        assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));
2352
2353        /* FIXME: need to be masked with HAW? */
2354        dma_addr_t status_addr = inv_desc->hi;
2355        trace_vtd_inv_desc_wait_sw(status_addr, status_data);
2356        status_data = cpu_to_le32(status_data);
2357        if (dma_memory_write(&address_space_memory, status_addr,
2358                             &status_data, sizeof(status_data),
2359                             MEMTXATTRS_UNSPECIFIED)) {
2360            trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo);
2361            return false;
2362        }
2363    } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
2364        /* Interrupt flag */
2365        vtd_generate_completion_event(s);
2366    } else {
2367        error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2368                          " (unknown type)", __func__, inv_desc->hi,
2369                          inv_desc->lo);
2370        return false;
2371    }
2372    return true;
2373}
2374
2375static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
2376                                           VTDInvDesc *inv_desc)
2377{
2378    uint16_t sid, fmask;
2379
2380    if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) {
2381        error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2382                          " (reserved nonzero)", __func__, inv_desc->hi,
2383                          inv_desc->lo);
2384        return false;
2385    }
2386    switch (inv_desc->lo & VTD_INV_DESC_CC_G) {
2387    case VTD_INV_DESC_CC_DOMAIN:
2388        trace_vtd_inv_desc_cc_domain(
2389            (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo));
2390        /* Fall through */
2391    case VTD_INV_DESC_CC_GLOBAL:
2392        vtd_context_global_invalidate(s);
2393        break;
2394
2395    case VTD_INV_DESC_CC_DEVICE:
2396        sid = VTD_INV_DESC_CC_SID(inv_desc->lo);
2397        fmask = VTD_INV_DESC_CC_FM(inv_desc->lo);
2398        vtd_context_device_invalidate(s, sid, fmask);
2399        break;
2400
2401    default:
2402        error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2403                          " (invalid type)", __func__, inv_desc->hi,
2404                          inv_desc->lo);
2405        return false;
2406    }
2407    return true;
2408}
2409
2410static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2411{
2412    uint16_t domain_id;
2413    uint8_t am;
2414    hwaddr addr;
2415
2416    if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) ||
2417        (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) {
2418        error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2419                          ", lo=0x%"PRIx64" (reserved bits unzero)",
2420                          __func__, inv_desc->hi, inv_desc->lo);
2421        return false;
2422    }
2423
2424    switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) {
2425    case VTD_INV_DESC_IOTLB_GLOBAL:
2426        vtd_iotlb_global_invalidate(s);
2427        break;
2428
2429    case VTD_INV_DESC_IOTLB_DOMAIN:
2430        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2431        vtd_iotlb_domain_invalidate(s, domain_id);
2432        break;
2433
2434    case VTD_INV_DESC_IOTLB_PAGE:
2435        domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2436        addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi);
2437        am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi);
2438        if (am > VTD_MAMV) {
2439            error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2440                              ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)",
2441                              __func__, inv_desc->hi, inv_desc->lo,
2442                              am, (unsigned)VTD_MAMV);
2443            return false;
2444        }
2445        vtd_iotlb_page_invalidate(s, domain_id, addr, am);
2446        break;
2447
2448    default:
2449        error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2450                          ", lo=0x%"PRIx64" (type mismatch: 0x%llx)",
2451                          __func__, inv_desc->hi, inv_desc->lo,
2452                          inv_desc->lo & VTD_INV_DESC_IOTLB_G);
2453        return false;
2454    }
2455    return true;
2456}
2457
2458static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
2459                                     VTDInvDesc *inv_desc)
2460{
2461    trace_vtd_inv_desc_iec(inv_desc->iec.granularity,
2462                           inv_desc->iec.index,
2463                           inv_desc->iec.index_mask);
2464
2465    vtd_iec_notify_all(s, !inv_desc->iec.granularity,
2466                       inv_desc->iec.index,
2467                       inv_desc->iec.index_mask);
2468    return true;
2469}
2470
2471static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
2472                                          VTDInvDesc *inv_desc)
2473{
2474    VTDAddressSpace *vtd_dev_as;
2475    IOMMUTLBEvent event;
2476    struct VTDBus *vtd_bus;
2477    hwaddr addr;
2478    uint64_t sz;
2479    uint16_t sid;
2480    uint8_t devfn;
2481    bool size;
2482    uint8_t bus_num;
2483
2484    addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi);
2485    sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo);
2486    devfn = sid & 0xff;
2487    bus_num = sid >> 8;
2488    size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi);
2489
2490    if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
2491        (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) {
2492        error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64
2493                          ", lo=%"PRIx64" (reserved nonzero)", __func__,
2494                          inv_desc->hi, inv_desc->lo);
2495        return false;
2496    }
2497
2498    vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
2499    if (!vtd_bus) {
2500        goto done;
2501    }
2502
2503    vtd_dev_as = vtd_bus->dev_as[devfn];
2504    if (!vtd_dev_as) {
2505        goto done;
2506    }
2507
2508    /* According to ATS spec table 2.4:
2509     * S = 0, bits 15:12 = xxxx     range size: 4K
2510     * S = 1, bits 15:12 = xxx0     range size: 8K
2511     * S = 1, bits 15:12 = xx01     range size: 16K
2512     * S = 1, bits 15:12 = x011     range size: 32K
2513     * S = 1, bits 15:12 = 0111     range size: 64K
2514     * ...
2515     */
2516    if (size) {
2517        sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
2518        addr &= ~(sz - 1);
2519    } else {
2520        sz = VTD_PAGE_SIZE;
2521    }
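    /*
     * Decoding example for the S bit handling above: a 64K invalidation
     * encodes addr[15:12] = 0b0111 (see the table), so addr >> 12 has
     * three trailing 1 bits, cto64() returns 3, sz = 8K << 3 = 64K, and
     * addr is then aligned down to that 64K boundary.
     */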
2522
2523    event.type = IOMMU_NOTIFIER_DEVIOTLB_UNMAP;
2524    event.entry.target_as = &vtd_dev_as->as;
2525    event.entry.addr_mask = sz - 1;
2526    event.entry.iova = addr;
2527    event.entry.perm = IOMMU_NONE;
2528    event.entry.translated_addr = 0;
2529    memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event);
2530
2531done:
2532    return true;
2533}
2534
2535static bool vtd_process_inv_desc(IntelIOMMUState *s)
2536{
2537    VTDInvDesc inv_desc;
2538    uint8_t desc_type;
2539
2540    trace_vtd_inv_qi_head(s->iq_head);
2541    if (!vtd_get_inv_desc(s, &inv_desc)) {
2542        s->iq_last_desc_type = VTD_INV_DESC_NONE;
2543        return false;
2544    }
2545
2546    desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
2547    /* FIXME: should update at first or at last? */
2548    s->iq_last_desc_type = desc_type;
2549
2550    switch (desc_type) {
2551    case VTD_INV_DESC_CC:
2552        trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo);
2553        if (!vtd_process_context_cache_desc(s, &inv_desc)) {
2554            return false;
2555        }
2556        break;
2557
2558    case VTD_INV_DESC_IOTLB:
2559        trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo);
2560        if (!vtd_process_iotlb_desc(s, &inv_desc)) {
2561            return false;
2562        }
2563        break;
2564
2565    /*
2566     * TODO: the handling of the two cases below will be implemented in a
2567     * future series. To make a guest (whose iommu driver integrates the
2568     * scalable mode support patch set) work, returning true is enough for now.
2569     */
2570    case VTD_INV_DESC_PC:
2571        break;
2572
2573    case VTD_INV_DESC_PIOTLB:
2574        break;
2575
2576    case VTD_INV_DESC_WAIT:
2577        trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
2578        if (!vtd_process_wait_desc(s, &inv_desc)) {
2579            return false;
2580        }
2581        break;
2582
2583    case VTD_INV_DESC_IEC:
2584        trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo);
2585        if (!vtd_process_inv_iec_desc(s, &inv_desc)) {
2586            return false;
2587        }
2588        break;
2589
2590    case VTD_INV_DESC_DEVICE:
2591        trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
2592        if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
2593            return false;
2594        }
2595        break;
2596
2597    default:
2598        error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64
2599                          " (unknown type)", __func__, inv_desc.hi,
2600                          inv_desc.lo);
2601        return false;
2602    }
2603    s->iq_head++;
2604    if (s->iq_head == s->iq_size) {
2605        s->iq_head = 0;
2606    }
2607    return true;
2608}
2609
2610/* Try to fetch and process more Invalidation Descriptors */
2611static void vtd_fetch_inv_desc(IntelIOMMUState *s)
2612{
2613    int qi_shift;
2614
2615    /* Refer to 10.4.23 of VT-d spec 3.0 */
2616    qi_shift = s->iq_dw ? VTD_IQH_QH_SHIFT_5 : VTD_IQH_QH_SHIFT_4;
2617
2618    trace_vtd_inv_qi_fetch();
2619
2620    if (s->iq_tail >= s->iq_size) {
2621        /* Detects an invalid Tail pointer */
2622        error_report_once("%s: detected invalid QI tail "
2623                          "(tail=0x%x, size=0x%x)",
2624                          __func__, s->iq_tail, s->iq_size);
2625        vtd_handle_inv_queue_error(s);
2626        return;
2627    }
2628    while (s->iq_head != s->iq_tail) {
2629        if (!vtd_process_inv_desc(s)) {
2630            /* Invalidation Queue Errors */
2631            vtd_handle_inv_queue_error(s);
2632            break;
2633        }
2634        /* Must update the IQH_REG in time */
2635        vtd_set_quad_raw(s, DMAR_IQH_REG,
2636                         (((uint64_t)(s->iq_head)) << qi_shift) &
2637                         VTD_IQH_QH_MASK);
2638    }
2639}
2640
2641/* Handle write to Invalidation Queue Tail Register */
2642static void vtd_handle_iqt_write(IntelIOMMUState *s)
2643{
2644    uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
2645
2646    if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) {
2647        error_report_once("%s: RSV bit is set: val=0x%"PRIx64,
2648                          __func__, val);
2649        return;
2650    }
2651    s->iq_tail = VTD_IQT_QT(s->iq_dw, val);
2652    trace_vtd_inv_qi_tail(s->iq_tail);
2653
2654    if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2655        /* Process Invalidation Queue here */
2656        vtd_fetch_inv_desc(s);
2657    }
2658}
2659
2660static void vtd_handle_fsts_write(IntelIOMMUState *s)
2661{
2662    uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
2663    uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2664    uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE;
2665
2666    if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) {
2667        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2668        trace_vtd_fsts_clear_ip();
2669    }
2670    /* FIXME: when IQE is Clear, should we try to fetch some Invalidation
2671     * Descriptors if there are any when Queued Invalidation is enabled?
2672     */
2673}
2674
2675static void vtd_handle_fectl_write(IntelIOMMUState *s)
2676{
2677    uint32_t fectl_reg;
2678    /* FIXME: when software clears the IM field, check the IP field. But do we
2679     * need to compare the old value and the new value to conclude that
2680     * software clears the IM field? Or just check if the IM field is zero?
2681     */
2682    fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2683
2684    trace_vtd_reg_write_fectl(fectl_reg);
2685
2686    if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) {
2687        vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
2688        vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2689    }
2690}
2691
2692static void vtd_handle_ics_write(IntelIOMMUState *s)
2693{
2694    uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG);
2695    uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2696
2697    if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) {
2698        trace_vtd_reg_ics_clear_ip();
2699        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2700    }
2701}
2702
2703static void vtd_handle_iectl_write(IntelIOMMUState *s)
2704{
2705    uint32_t iectl_reg;
2706    /* FIXME: when software clears the IM field, check the IP field. But do we
2707     * need to compare the old value and the new value to conclude that
2708     * software clears the IM field? Or just check if the IM field is zero?
2709     */
2710    iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2711
2712    trace_vtd_reg_write_iectl(iectl_reg);
2713
2714    if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) {
2715        vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
2716        vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2717    }
2718}
2719
2720static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
2721{
2722    IntelIOMMUState *s = opaque;
2723    uint64_t val;
2724
2725    trace_vtd_reg_read(addr, size);
2726
2727    if (addr + size > DMAR_REG_SIZE) {
2728        error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2729                          " size=0x%x", __func__, addr, size);
2730        return (uint64_t)-1;
2731    }
2732
2733    switch (addr) {
2734    /* Root Table Address Register, 64-bit */
2735    case DMAR_RTADDR_REG:
2736        val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
2737        if (size == 4) {
2738            val = val & ((1ULL << 32) - 1);
2739        }
2740        break;
2741
2742    case DMAR_RTADDR_REG_HI:
2743        assert(size == 4);
2744        val = vtd_get_quad_raw(s, DMAR_RTADDR_REG) >> 32;
2745        break;
2746
2747    /* Invalidation Queue Address Register, 64-bit */
2748    case DMAR_IQA_REG:
2749        val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS);
2750        if (size == 4) {
2751            val = val & ((1ULL << 32) - 1);
2752        }
2753        break;
2754
2755    case DMAR_IQA_REG_HI:
2756        assert(size == 4);
2757        val = s->iq >> 32;
2758        break;
2759
2760    default:
2761        if (size == 4) {
2762            val = vtd_get_long(s, addr);
2763        } else {
2764            val = vtd_get_quad(s, addr);
2765        }
2766    }
2767
2768    return val;
2769}
2770
2771static void vtd_mem_write(void *opaque, hwaddr addr,
2772                          uint64_t val, unsigned size)
2773{
2774    IntelIOMMUState *s = opaque;
2775
2776    trace_vtd_reg_write(addr, size, val);
2777
2778    if (addr + size > DMAR_REG_SIZE) {
2779        error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2780                          " size=0x%x", __func__, addr, size);
2781        return;
2782    }
2783
2784    switch (addr) {
2785    /* Global Command Register, 32-bit */
2786    case DMAR_GCMD_REG:
2787        vtd_set_long(s, addr, val);
2788        vtd_handle_gcmd_write(s);
2789        break;
2790
2791    /* Context Command Register, 64-bit */
2792    case DMAR_CCMD_REG:
2793        if (size == 4) {
2794            vtd_set_long(s, addr, val);
2795        } else {
2796            vtd_set_quad(s, addr, val);
2797            vtd_handle_ccmd_write(s);
2798        }
2799        break;
2800
2801    case DMAR_CCMD_REG_HI:
2802        assert(size == 4);
2803        vtd_set_long(s, addr, val);
2804        vtd_handle_ccmd_write(s);
2805        break;
2806
2807    /* IOTLB Invalidation Register, 64-bit */
2808    case DMAR_IOTLB_REG:
2809        if (size == 4) {
2810            vtd_set_long(s, addr, val);
2811        } else {
2812            vtd_set_quad(s, addr, val);
2813            vtd_handle_iotlb_write(s);
2814        }
2815        break;
2816
2817    case DMAR_IOTLB_REG_HI:
2818        assert(size == 4);
2819        vtd_set_long(s, addr, val);
2820        vtd_handle_iotlb_write(s);
2821        break;
2822
2823    /* Invalidate Address Register, 64-bit */
2824    case DMAR_IVA_REG:
2825        if (size == 4) {
2826            vtd_set_long(s, addr, val);
2827        } else {
2828            vtd_set_quad(s, addr, val);
2829        }
2830        break;
2831
2832    case DMAR_IVA_REG_HI:
2833        assert(size == 4);
2834        vtd_set_long(s, addr, val);
2835        break;
2836
2837    /* Fault Status Register, 32-bit */
2838    case DMAR_FSTS_REG:
2839        assert(size == 4);
2840        vtd_set_long(s, addr, val);
2841        vtd_handle_fsts_write(s);
2842        break;
2843
2844    /* Fault Event Control Register, 32-bit */
2845    case DMAR_FECTL_REG:
2846        assert(size == 4);
2847        vtd_set_long(s, addr, val);
2848        vtd_handle_fectl_write(s);
2849        break;
2850
2851    /* Fault Event Data Register, 32-bit */
2852    case DMAR_FEDATA_REG:
2853        assert(size == 4);
2854        vtd_set_long(s, addr, val);
2855        break;
2856
2857    /* Fault Event Address Register, 32-bit */
2858    case DMAR_FEADDR_REG:
2859        if (size == 4) {
2860            vtd_set_long(s, addr, val);
2861        } else {
2862            /*
2863             * While the register is 32-bit only, some guests (Xen...) write to
2864             * it with 64-bit.
2865             */
2866            vtd_set_quad(s, addr, val);
2867        }
2868        break;
2869
2870    /* Fault Event Upper Address Register, 32-bit */
2871    case DMAR_FEUADDR_REG:
2872        assert(size == 4);
2873        vtd_set_long(s, addr, val);
2874        break;
2875
2876    /* Protected Memory Enable Register, 32-bit */
2877    case DMAR_PMEN_REG:
2878        assert(size == 4);
2879        vtd_set_long(s, addr, val);
2880        break;
2881
2882    /* Root Table Address Register, 64-bit */
2883    case DMAR_RTADDR_REG:
2884        if (size == 4) {
2885            vtd_set_long(s, addr, val);
2886        } else {
2887            vtd_set_quad(s, addr, val);
2888        }
2889        break;
2890
2891    case DMAR_RTADDR_REG_HI:
2892        assert(size == 4);
2893        vtd_set_long(s, addr, val);
2894        break;
2895
2896    /* Invalidation Queue Tail Register, 64-bit */
2897    case DMAR_IQT_REG:
2898        if (size == 4) {
2899            vtd_set_long(s, addr, val);
2900        } else {
2901            vtd_set_quad(s, addr, val);
2902        }
2903        vtd_handle_iqt_write(s);
2904        break;
2905
2906    case DMAR_IQT_REG_HI:
2907        assert(size == 4);
2908        vtd_set_long(s, addr, val);
2909        /* 19:63 of IQT_REG is RsvdZ, do nothing here */
2910        break;
2911
2912    /* Invalidation Queue Address Register, 64-bit */
2913    case DMAR_IQA_REG:
2914        if (size == 4) {
2915            vtd_set_long(s, addr, val);
2916        } else {
2917            vtd_set_quad(s, addr, val);
2918        }
2919        vtd_update_iq_dw(s);
2920        break;
2921
2922    case DMAR_IQA_REG_HI:
2923        assert(size == 4);
2924        vtd_set_long(s, addr, val);
2925        break;
2926
2927    /* Invalidation Completion Status Register, 32-bit */
2928    case DMAR_ICS_REG:
2929        assert(size == 4);
2930        vtd_set_long(s, addr, val);
2931        vtd_handle_ics_write(s);
2932        break;
2933
2934    /* Invalidation Event Control Register, 32-bit */
2935    case DMAR_IECTL_REG:
2936        assert(size == 4);
2937        vtd_set_long(s, addr, val);
2938        vtd_handle_iectl_write(s);
2939        break;
2940
2941    /* Invalidation Event Data Register, 32-bit */
2942    case DMAR_IEDATA_REG:
2943        assert(size == 4);
2944        vtd_set_long(s, addr, val);
2945        break;
2946
2947    /* Invalidation Event Address Register, 32-bit */
2948    case DMAR_IEADDR_REG:
2949        assert(size == 4);
2950        vtd_set_long(s, addr, val);
2951        break;
2952
2953    /* Invalidation Event Upper Address Register, 32-bit */
2954    case DMAR_IEUADDR_REG:
2955        assert(size == 4);
2956        vtd_set_long(s, addr, val);
2957        break;
2958
2959    /* Fault Recording Registers, 128-bit */
2960    case DMAR_FRCD_REG_0_0:
2961        if (size == 4) {
2962            vtd_set_long(s, addr, val);
2963        } else {
2964            vtd_set_quad(s, addr, val);
2965        }
2966        break;
2967
2968    case DMAR_FRCD_REG_0_1:
2969        assert(size == 4);
2970        vtd_set_long(s, addr, val);
2971        break;
2972
2973    case DMAR_FRCD_REG_0_2:
2974        if (size == 4) {
2975            vtd_set_long(s, addr, val);
2976        } else {
2977            vtd_set_quad(s, addr, val);
2978            /* May clear bit 127 (Fault), update PPF */
2979            vtd_update_fsts_ppf(s);
2980        }
2981        break;
2982
2983    case DMAR_FRCD_REG_0_3:
2984        assert(size == 4);
2985        vtd_set_long(s, addr, val);
2986        /* May clear bit 127 (Fault), update PPF */
2987        vtd_update_fsts_ppf(s);
2988        break;
2989
2990    case DMAR_IRTA_REG:
2991        if (size == 4) {
2992            vtd_set_long(s, addr, val);
2993        } else {
2994            vtd_set_quad(s, addr, val);
2995        }
2996        break;
2997
2998    case DMAR_IRTA_REG_HI:
2999        assert(size == 4);
3000        vtd_set_long(s, addr, val);
3001        break;
3002
3003    default:
3004        if (size == 4) {
3005            vtd_set_long(s, addr, val);
3006        } else {
3007            vtd_set_quad(s, addr, val);
3008        }
3009    }
3010}
3011
3012static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
3013                                         IOMMUAccessFlags flag, int iommu_idx)
3014{
3015    VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
3016    IntelIOMMUState *s = vtd_as->iommu_state;
3017    IOMMUTLBEntry iotlb = {
3018        /* We'll fill in the rest later. */
3019        .target_as = &address_space_memory,
3020    };
3021    bool success;
3022
3023    if (likely(s->dmar_enabled)) {
3024        success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
3025                                         addr, flag & IOMMU_WO, &iotlb);
3026    } else {
3027        /* DMAR disabled, passthrough, use 4k-page */
3028        iotlb.iova = addr & VTD_PAGE_MASK_4K;
3029        iotlb.translated_addr = addr & VTD_PAGE_MASK_4K;
3030        iotlb.addr_mask = ~VTD_PAGE_MASK_4K;
3031        iotlb.perm = IOMMU_RW;
3032        success = true;
3033    }
3034
3035    if (likely(success)) {
3036        trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus),
3037                                 VTD_PCI_SLOT(vtd_as->devfn),
3038                                 VTD_PCI_FUNC(vtd_as->devfn),
3039                                 iotlb.iova, iotlb.translated_addr,
3040                                 iotlb.addr_mask);
3041    } else {
3042        error_report_once("%s: detected translation failure "
3043                          "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
3044                          __func__, pci_bus_num(vtd_as->bus),
3045                          VTD_PCI_SLOT(vtd_as->devfn),
3046                          VTD_PCI_FUNC(vtd_as->devfn),
3047                          addr);
3048    }
3049
3050    return iotlb;
3051}
3052
3053static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
3054                                         IOMMUNotifierFlag old,
3055                                         IOMMUNotifierFlag new,
3056                                         Error **errp)
3057{
3058    VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
3059    IntelIOMMUState *s = vtd_as->iommu_state;
3060
3061    /* TODO: add support for VFIO and vhost users */
3062    if (s->snoop_control) {
3063        error_setg_errno(errp, ENOTSUP,
3064                         "Snoop Control with vhost or VFIO is not supported");
3065        return -ENOTSUP;
3066    }
3067
3068    /* Update per-address-space notifier flags */
3069    vtd_as->notifier_flags = new;
3070
3071    if (old == IOMMU_NOTIFIER_NONE) {
3072        QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
3073    } else if (new == IOMMU_NOTIFIER_NONE) {
3074        QLIST_REMOVE(vtd_as, next);
3075    }
3076    return 0;
3077}
3078
3079static int vtd_post_load(void *opaque, int version_id)
3080{
3081    IntelIOMMUState *iommu = opaque;
3082
3083    /*
3084     * We don't need to migrate the root_scalable because we can
3085     * simply do the calculation after the loading is complete.  We
3086     * can actually do similar things with root, dmar_enabled, etc.;
3087     * however, since we already have them, we keep them for
3088     * migration compatibility.
3089     */
3090    vtd_update_scalable_state(iommu);
3091
3092    vtd_update_iq_dw(iommu);
3093
3094    /*
3095     * Memory regions are dynamically turned on/off depending on
3096     * context entry configurations from the guest. After migration,
3097     * we need to make sure the memory regions are still correct.
3098     */
3099    vtd_switch_address_space_all(iommu);
3100
3101    return 0;
3102}
3103
3104static const VMStateDescription vtd_vmstate = {
3105    .name = "iommu-intel",
3106    .version_id = 1,
3107    .minimum_version_id = 1,
3108    .priority = MIG_PRI_IOMMU,
3109    .post_load = vtd_post_load,
3110    .fields = (VMStateField[]) {
3111        VMSTATE_UINT64(root, IntelIOMMUState),
3112        VMSTATE_UINT64(intr_root, IntelIOMMUState),
3113        VMSTATE_UINT64(iq, IntelIOMMUState),
3114        VMSTATE_UINT32(intr_size, IntelIOMMUState),
3115        VMSTATE_UINT16(iq_head, IntelIOMMUState),
3116        VMSTATE_UINT16(iq_tail, IntelIOMMUState),
3117        VMSTATE_UINT16(iq_size, IntelIOMMUState),
3118        VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState),
3119        VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE),
3120        VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState),
3121        VMSTATE_UNUSED(1),      /* bool root_extended is obsoleted by the VT-d spec */
3122        VMSTATE_BOOL(dmar_enabled, IntelIOMMUState),
3123        VMSTATE_BOOL(qi_enabled, IntelIOMMUState),
3124        VMSTATE_BOOL(intr_enabled, IntelIOMMUState),
3125        VMSTATE_BOOL(intr_eime, IntelIOMMUState),
3126        VMSTATE_END_OF_LIST()
3127    }
3128};
3129
3130static const MemoryRegionOps vtd_mem_ops = {
3131    .read = vtd_mem_read,
3132    .write = vtd_mem_write,
3133    .endianness = DEVICE_LITTLE_ENDIAN,
3134    .impl = {
3135        .min_access_size = 4,
3136        .max_access_size = 8,
3137    },
3138    .valid = {
3139        .min_access_size = 4,
3140        .max_access_size = 8,
3141    },
3142};
3143
3144static Property vtd_properties[] = {
3145    DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
3146    DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
3147                            ON_OFF_AUTO_AUTO),
3148    DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
3149    DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
3150                      VTD_HOST_ADDRESS_WIDTH),
3151    DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
3152    DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
3153    DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
3154    DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
3155    DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true),
3156    DEFINE_PROP_END_OF_LIST(),
3157};
3158
3159/* Read IRTE entry with specific index */
3160static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
3161                        VTD_IR_TableEntry *entry, uint16_t sid)
3162{
3163    static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \
3164        {0xffff, 0xfffb, 0xfff9, 0xfff8};
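    /*
     * Illustrative use of the SVT_ALL masks above: with sid_q = 3 the
     * mask is 0xfff8, so the low three bits (the PCI function number)
     * of both the requester SID and the IRTE source_id are ignored -
     * any function of the referenced device passes the check below.
     */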
3165    dma_addr_t addr = 0x00;
3166    uint16_t mask, source_id;
3167    uint8_t bus, bus_max, bus_min;
3168
3169    if (index >= iommu->intr_size) {
3170        error_report_once("%s: index too large: ind=0x%x",
3171                          __func__, index);
3172        return -VTD_FR_IR_INDEX_OVER;
3173    }
3174
3175    addr = iommu->intr_root + index * sizeof(*entry);
3176    if (dma_memory_read(&address_space_memory, addr,
3177                        entry, sizeof(*entry), MEMTXATTRS_UNSPECIFIED)) {
3178        error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64,
3179                          __func__, index, addr);
3180        return -VTD_FR_IR_ROOT_INVAL;
3181    }
3182
3183    trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]),
3184                          le64_to_cpu(entry->data[0]));
3185
3186    if (!entry->irte.present) {
3187        error_report_once("%s: detected non-present IRTE "
3188                          "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3189                          __func__, index, le64_to_cpu(entry->data[1]),
3190                          le64_to_cpu(entry->data[0]));
3191        return -VTD_FR_IR_ENTRY_P;
3192    }
3193
3194    if (entry->irte.__reserved_0 || entry->irte.__reserved_1 ||
3195        entry->irte.__reserved_2) {
3196        error_report_once("%s: detected non-zero reserved IRTE "
3197                          "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3198                          __func__, index, le64_to_cpu(entry->data[1]),
3199                          le64_to_cpu(entry->data[0]));
3200        return -VTD_FR_IR_IRTE_RSVD;
3201    }
3202
3203    if (sid != X86_IOMMU_SID_INVALID) {
3204        /* Validate IRTE SID */
3205        source_id = le32_to_cpu(entry->irte.source_id);
3206        switch (entry->irte.sid_vtype) {
3207        case VTD_SVT_NONE:
3208            break;
3209
3210        case VTD_SVT_ALL:
3211            mask = vtd_svt_mask[entry->irte.sid_q];
3212            if ((source_id & mask) != (sid & mask)) {
3213                error_report_once("%s: invalid IRTE SID "
3214                                  "(index=%u, sid=%u, source_id=%u)",
3215                                  __func__, index, sid, source_id);
3216                return -VTD_FR_IR_SID_ERR;
3217            }
3218            break;
3219
3220        case VTD_SVT_BUS:
3221            bus_max = source_id >> 8;
3222            bus_min = source_id & 0xff;
3223            bus = sid >> 8;
3224            if (bus > bus_max || bus < bus_min) {
3225                error_report_once("%s: invalid SVT_BUS "
3226                                  "(index=%u, bus=%u, min=%u, max=%u)",
3227                                  __func__, index, bus, bus_min, bus_max);
3228                return -VTD_FR_IR_SID_ERR;
3229            }
3230            break;
3231
3232        default:
3233            error_report_once("%s: detected invalid IRTE SVT "
3234                              "(index=%u, type=%d)", __func__,
3235                              index, entry->irte.sid_vtype);
3236            /* Treat this as a verification failure. */
3237            return -VTD_FR_IR_SID_ERR;
3238        }
3239    }
3240
3241    return 0;
3242}
3243
3244/* Fetch IRQ information for the given IR index */
3245static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
3246                             X86IOMMUIrq *irq, uint16_t sid)
3247{
3248    VTD_IR_TableEntry irte = {};
3249    int ret = 0;
3250
3251    ret = vtd_irte_get(iommu, index, &irte, sid);
3252    if (ret) {
3253        return ret;
3254    }
3255
3256    irq->trigger_mode = irte.irte.trigger_mode;
3257    irq->vector = irte.irte.vector;
3258    irq->delivery_mode = irte.irte.delivery_mode;
3259    irq->dest = le32_to_cpu(irte.irte.dest_id);
3260    if (!iommu->intr_eime) {
3261#define  VTD_IR_APIC_DEST_MASK         (0xff00ULL)
3262#define  VTD_IR_APIC_DEST_SHIFT        (8)
3263        irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >>
3264            VTD_IR_APIC_DEST_SHIFT;
3265    }
3266    irq->dest_mode = irte.irte.dest_mode;
3267    irq->redir_hint = irte.irte.redir_hint;
3268
3269    trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector,
3270                       irq->delivery_mode, irq->dest, irq->dest_mode);
3271
3272    return 0;
3273}
3274
3275/* Interrupt remapping for MSI/MSI-X entry */
3276static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
3277                                   MSIMessage *origin,
3278                                   MSIMessage *translated,
3279                                   uint16_t sid)
3280{
3281    int ret = 0;
3282    VTD_IR_MSIAddress addr;
3283    uint16_t index;
3284    X86IOMMUIrq irq = {};
3285
3286    assert(origin && translated);
3287
3288    trace_vtd_ir_remap_msi_req(origin->address, origin->data);
3289
3290    if (!iommu || !iommu->intr_enabled) {
3291        memcpy(translated, origin, sizeof(*origin));
3292        goto out;
3293    }
3294
3295    if (origin->address & VTD_MSI_ADDR_HI_MASK) {
3296        error_report_once("%s: MSI address high 32 bits non-zero detected: "
3297                          "address=0x%" PRIx64, __func__, origin->address);
3298        return -VTD_FR_IR_REQ_RSVD;
3299    }
3300
3301    addr.data = origin->address & VTD_MSI_ADDR_LO_MASK;
3302    if (addr.addr.__head != 0xfee) {
3303        error_report_once("%s: MSI address low 32 bits invalid: 0x%" PRIx32,
3304                          __func__, addr.data);
3305        return -VTD_FR_IR_REQ_RSVD;
3306    }
3307
3308    /* Compatibility format interrupt: pass it through unchanged. */
3309    if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
3310        memcpy(translated, origin, sizeof(*origin));
3311        goto out;
3312    }
3313
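        /*
         * Remappable format: the 16-bit handle is split across the MSI
         * address, with index_l carrying bits 14:0 and index_h bit 15.
         */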
3314    index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l);
3315
3316#define  VTD_IR_MSI_DATA_SUBHANDLE       (0x0000ffff)
3317#define  VTD_IR_MSI_DATA_RESERVED        (0xffff0000)
3318
3319    if (addr.addr.sub_valid) {
3320        /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */
3321        index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE;
3322    }
3323
3324    ret = vtd_remap_irq_get(iommu, index, &irq, sid);
3325    if (ret) {
3326        return ret;
3327    }
3328
3329    if (addr.addr.sub_valid) {
3330        trace_vtd_ir_remap_type("MSI");
3331        if (origin->data & VTD_IR_MSI_DATA_RESERVED) {
3332            error_report_once("%s: invalid IR MSI "
3333                              "(sid=%u, address=0x%" PRIx64
3334                              ", data=0x%" PRIx32 ")",
3335                              __func__, sid, origin->address, origin->data);
3336            return -VTD_FR_IR_REQ_RSVD;
3337        }
3338    } else {
3339        uint8_t vector = origin->data & 0xff;
3340        uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
3341
3342        trace_vtd_ir_remap_type("IOAPIC");
3343        /* IOAPIC entry vector should be aligned with IRTE vector
3344         * (see vt-d spec 5.1.5.1). */
3345        if (vector != irq.vector) {
3346            trace_vtd_warn_ir_vector(sid, index, vector, irq.vector);
3347        }
3348
3349        /* The Trigger Mode field must match the Trigger Mode in the IRTE.
3350         * (see vt-d spec 5.1.5.1). */
3351        if (trigger_mode != irq.trigger_mode) {
3352            trace_vtd_warn_ir_trigger(sid, index, trigger_mode,
3353                                      irq.trigger_mode);
3354        }
3355    }
3356
3357    /*
3358     * Preserve the last two bits of the MSI address, since the guest
3359     * OS may rely on them; keeping them does not hurt in any case.
3360     */
3361    irq.msi_addr_last_bits = addr.addr.__not_care;
3362
3363    /* Translate X86IOMMUIrq to MSI message */
3364    x86_iommu_irq_to_msi_message(&irq, translated);
3365
3366out:
3367    trace_vtd_ir_remap_msi(origin->address, origin->data,
3368                           translated->address, translated->data);
3369    return 0;
3370}
3371
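    /*
     * X86IOMMUClass::int_remap hook: a thin wrapper around
     * vtd_interrupt_remap_msi().
     */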
3372static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
3373                         MSIMessage *dst, uint16_t sid)
3374{
3375    return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu),
3376                                   src, dst, sid);
3377}
3378
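    /*
     * Reads from the interrupt remapping MMIO window have no side effects;
     * simply report success.
     */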
3379static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
3380                                   uint64_t *data, unsigned size,
3381                                   MemTxAttrs attrs)
3382{
3383    return MEMTX_OK;
3384}
3385
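    /*
     * A write to the interrupt remapping window is an MSI from a device:
     * remap it and, on success, deliver the translated message via the APIC.
     */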
3386static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr,
3387                                    uint64_t value, unsigned size,
3388                                    MemTxAttrs attrs)
3389{
3390    int ret = 0;
3391    MSIMessage from = {}, to = {};
3392    uint16_t sid = X86_IOMMU_SID_INVALID;
3393
3394    from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
3395    from.data = (uint32_t) value;
3396
3397    if (!attrs.unspecified) {
3398        /* We have explicit Source ID */
3399        sid = attrs.requester_id;
3400    }
3401
3402    ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid);
3403    if (ret) {
3404        /* TODO: report error */
3405        /* Drop this interrupt */
3406        return MEMTX_ERROR;
3407    }
3408
3409    apic_get_class()->send_msi(&to);
3410
3411    return MEMTX_OK;
3412}
3413
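    /* MMIO ops for the interrupt remapping window (the vtd-ir region). */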
3414static const MemoryRegionOps vtd_mem_ir_ops = {
3415    .read_with_attrs = vtd_mem_ir_read,
3416    .write_with_attrs = vtd_mem_ir_write,
3417    .endianness = DEVICE_LITTLE_ENDIAN,
3418    .impl = {
3419        .min_access_size = 4,
3420        .max_access_size = 4,
3421    },
3422    .valid = {
3423        .min_access_size = 4,
3424        .max_access_size = 4,
3425    },
3426};
3427
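    /*
     * Return the VTDAddressSpace for (bus, devfn), creating it (and the
     * per-bus VTDBus bookkeeping) on first use.
     */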
3428VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
3429{
3430    uintptr_t key = (uintptr_t)bus;
3431    VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
3432    VTDAddressSpace *vtd_dev_as;
3433    char name[128];
3434
3435    if (!vtd_bus) {
3436        uintptr_t *new_key = g_malloc(sizeof(*new_key));
3437        *new_key = (uintptr_t)bus;
3438        /* No corresponding free() */
3439        vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
3440                            PCI_DEVFN_MAX);
3441        vtd_bus->bus = bus;
3442        g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
3443    }
3444
3445    vtd_dev_as = vtd_bus->dev_as[devfn];
3446
3447    if (!vtd_dev_as) {
3448        snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn),
3449                 PCI_FUNC(devfn));
3450        vtd_bus->dev_as[devfn] = vtd_dev_as = g_new0(VTDAddressSpace, 1);
3451
3452        vtd_dev_as->bus = bus;
3453        vtd_dev_as->devfn = (uint8_t)devfn;
3454        vtd_dev_as->iommu_state = s;
3455        vtd_dev_as->context_cache_entry.context_cache_gen = 0;
3456        vtd_dev_as->iova_tree = iova_tree_new();
3457
3458        memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX);
3459        address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root");
3460
3461        /*
3462         * Build the DMAR-disabled container with aliases to the
3463         * shared MRs.  Aliasing to a shared memory region helps the
3464         * memory API detect identical FlatViews, so devices can
3465         * share the same FlatView when DMAR is disabled (either by
3466         * not providing "intel_iommu=on" or with "iommu=pt").  This
3467         * greatly reduces the total number of FlatViews in the
3468         * system, and hence the VM runs faster.
3469         */
3470        memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s),
3471                                 "vtd-nodmar", &s->mr_nodmar, 0,
3472                                 memory_region_size(&s->mr_nodmar));
3473
3474        /*
3475         * Build the per-device DMAR-enabled container.
3476         *
3477         * TODO: currently we have per-device IOMMU memory region only
3478         * because we have per-device IOMMU notifiers for devices.  If
3479         * one day we can abstract the IOMMU notifiers out of the
3480         * memory regions then we can also share the same memory
3481         * region here just like what we've done above with the nodmar
3482         * region.
3483         */
3484        strcat(name, "-dmar");
3485        memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu),
3486                                 TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s),
3487                                 name, UINT64_MAX);
3488        memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir",
3489                                 &s->mr_ir, 0, memory_region_size(&s->mr_ir));
3490        memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu),
3491                                            VTD_INTERRUPT_ADDR_FIRST,
3492                                            &vtd_dev_as->iommu_ir, 1);
3493
3494        /*
3495         * Hook both containers under the root container; we switch
3496         * between DMAR and noDMAR by enabling/disabling the
3497         * corresponding sub-container.
3498         */
3499        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3500                                            MEMORY_REGION(&vtd_dev_as->iommu),
3501                                            0);
3502        memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3503                                            &vtd_dev_as->nodmar, 0);
3504
3505        vtd_switch_address_space(vtd_dev_as);
3506    }
3507    return vtd_dev_as;
3508}
3509
3510/* Unmap the whole range in the notifier's scope. */
3511static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
3512{
3513    hwaddr size, remain;
3514    hwaddr start = n->start;
3515    hwaddr end = n->end;
3516    IntelIOMMUState *s = as->iommu_state;
3517    DMAMap map;
3518
3519    /*
3520     * Note: all the code in this function assumes that the IOVA
3521     * width is no more than VTD_MGAW bits (as restricted by the
3522     * VT-d spec); otherwise we would need to consider 64-bit overflow.
3523     */
3524
3525    if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) {
3526        /*
3527         * No need to unmap anything beyond the VT-d supported
3528         * address space size.
3529         */
3530        end = VTD_ADDRESS_SIZE(s->aw_bits) - 1;
3531    }
3532
3533    assert(start <= end);
3534    size = remain = end - start + 1;
3535
3536    while (remain >= VTD_PAGE_SIZE) {
3537        IOMMUTLBEvent event;
3538        uint64_t mask = dma_aligned_pow2_mask(start, end, s->aw_bits);
3539        uint64_t step = mask + 1;
3540
3541        assert(step);
3542
3543        event.type = IOMMU_NOTIFIER_UNMAP;
3544        event.entry.iova = start;
3545        event.entry.addr_mask = mask;
3546        event.entry.target_as = &address_space_memory;
3547        event.entry.perm = IOMMU_NONE;
3548        /* This field is meaningless for unmap */
3549        event.entry.translated_addr = 0;
3550
3551        memory_region_notify_iommu_one(n, &event);
3552
3553        start += step;
3554        remain -= step;
3555    }
3556
3557    assert(!remain);
3558
3559    trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
3560                             VTD_PCI_SLOT(as->devfn),
3561                             VTD_PCI_FUNC(as->devfn),
3562                             n->start, size);
3563
3564    map.iova = n->start;
3565    map.size = size;
3566    iova_tree_remove(as->iova_tree, &map);
3567}
3568
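    /*
     * Send UNMAP notifications covering the full scope of every
     * registered notifier.
     */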
3569static void vtd_address_space_unmap_all(IntelIOMMUState *s)
3570{
3571    VTDAddressSpace *vtd_as;
3572    IOMMUNotifier *n;
3573
3574    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
3575        IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
3576            vtd_address_space_unmap(vtd_as, n);
3577        }
3578    }
3579}
3580
3581static void vtd_address_space_refresh_all(IntelIOMMUState *s)
3582{
3583    vtd_address_space_unmap_all(s);
3584    vtd_switch_address_space_all(s);
3585}
3586
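    /* Page walk hook: forward each mapping found to the notifier in 'private'. */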
3587static int vtd_replay_hook(IOMMUTLBEvent *event, void *private)
3588{
3589    memory_region_notify_iommu_one(private, event);
3590    return 0;
3591}
3592
3593static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
3594{
3595    VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
3596    IntelIOMMUState *s = vtd_as->iommu_state;
3597    uint8_t bus_n = pci_bus_num(vtd_as->bus);
3598    VTDContextEntry ce;
3599
3600    /*
3601     * The replay can be triggered either by an invalidation or by a
3602     * newly created entry. Either way, release the existing mappings
3603     * first (for UNMAP-only notifiers this just flushes the caches).
3604     */
3605    vtd_address_space_unmap(vtd_as, n);
3606
3607    if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
3608        trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
3609                                  "legacy mode",
3610                                  bus_n, PCI_SLOT(vtd_as->devfn),
3611                                  PCI_FUNC(vtd_as->devfn),
3612                                  vtd_get_domain_id(s, &ce),
3613                                  ce.hi, ce.lo);
3614        if (vtd_as_has_map_notifier(vtd_as)) {
3615            /* This is required only for MAP typed notifiers */
3616            vtd_page_walk_info info = {
3617                .hook_fn = vtd_replay_hook,
3618                .private = (void *)n,
3619                .notify_unmap = false,
3620                .aw = s->aw_bits,
3621                .as = vtd_as,
3622                .domain_id = vtd_get_domain_id(s, &ce),
3623            };
3624
3625            vtd_page_walk(s, &ce, 0, ~0ULL, &info);
3626        }
3627    } else {
3628        trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
3629                                    PCI_FUNC(vtd_as->devfn));
3630    }
3631
3632    return;
3633}
3634
3635/* Do the initialization. It is also called on reset, so take care
3636 * when adding new initialization code.
3637 */
3638static void vtd_init(IntelIOMMUState *s)
3639{
3640    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3641
3642    memset(s->csr, 0, DMAR_REG_SIZE);
3643    memset(s->wmask, 0, DMAR_REG_SIZE);
3644    memset(s->w1cmask, 0, DMAR_REG_SIZE);
3645    memset(s->womask, 0, DMAR_REG_SIZE);
3646
3647    s->root = 0;
3648    s->root_scalable = false;
3649    s->dmar_enabled = false;
3650    s->intr_enabled = false;
3651    s->iq_head = 0;
3652    s->iq_tail = 0;
3653    s->iq = 0;
3654    s->iq_size = 0;
3655    s->qi_enabled = false;
3656    s->iq_last_desc_type = VTD_INV_DESC_NONE;
3657    s->iq_dw = false;
3658    s->next_frcd_reg = 0;
3659    s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
3660             VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
3661             VTD_CAP_MGAW(s->aw_bits);
3662    if (s->dma_drain) {
3663        s->cap |= VTD_CAP_DRAIN;
3664    }
3665    if (s->dma_translation) {
3666        if (s->aw_bits >= VTD_HOST_AW_39BIT) {
3667            s->cap |= VTD_CAP_SAGAW_39bit;
3668        }
3669        if (s->aw_bits >= VTD_HOST_AW_48BIT) {
3670            s->cap |= VTD_CAP_SAGAW_48bit;
3671        }
3672    }
3673    s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
3674
3675    /*
3676     * Rsvd field masks for spte
3677     */
3678    vtd_spte_rsvd[0] = ~0ULL;
3679    vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits,
3680                                                  x86_iommu->dt_supported);
3681    vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
3682    vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
3683    vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
3684
3685    vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits,
3686                                                         x86_iommu->dt_supported);
3687    vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
3688                                                         x86_iommu->dt_supported);
3689
3690    if (s->scalable_mode || s->snoop_control) {
3691        vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
3692        vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
3693        vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
3694    }
3695
3696    if (x86_iommu_ir_supported(x86_iommu)) {
3697        s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
3698        if (s->intr_eim == ON_OFF_AUTO_ON) {
3699            s->ecap |= VTD_ECAP_EIM;
3700        }
3701        assert(s->intr_eim != ON_OFF_AUTO_AUTO);
3702    }
3703
3704    if (x86_iommu->dt_supported) {
3705        s->ecap |= VTD_ECAP_DT;
3706    }
3707
3708    if (x86_iommu->pt_supported) {
3709        s->ecap |= VTD_ECAP_PT;
3710    }
3711
3712    if (s->caching_mode) {
3713        s->cap |= VTD_CAP_CM;
3714    }
3715
3716    /* TODO: read cap/ecap from host to decide which cap to be exposed. */
3717    if (s->scalable_mode) {
3718        s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
3719    }
3720
3721    if (s->snoop_control) {
3722        s->ecap |= VTD_ECAP_SC;
3723    }
3724
3725    vtd_reset_caches(s);
3726
3727    /* Define registers with default values and bit semantics */
3728    vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
3729    vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
3730    vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
3731    vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
3732    vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
3733    vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
3734    vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
3735    vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
3736    vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
3737
3738    /* Advanced Fault Logging not supported */
3739    vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
3740    vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3741    vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
3742    vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);
3743
3744    /* Treated as RsvdZ when EIM in ECAP_REG is not supported
3745     * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
3746     */
3747    vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);
3748
3749    /* Treated as RO for implementations that report the PLMR and PHMR
3750     * fields as Clear in the CAP_REG.
3751     * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
3752     */
3753    vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);
3754
3755    vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
3756    vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
3757    vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
3758    vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
3759    vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3760    vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
3761    vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
3762    /* Treated as RsvdZ when EIM in ECAP_REG is not supported */
3763    vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);
3764
3765    /* IOTLB registers */
3766    vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0xb003ffff00000000ULL, 0);
3767    vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
3768    vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);
3769
3770    /* Fault Recording Registers, 128-bit */
3771    vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
3772    vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);
3773
3774    /*
3775     * Interrupt remapping registers.
3776     */
3777    vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
3778}
3779
3780/* Do not reset the address spaces on device reset: devices keep using the
3781 * address space they were given at first (they will not ask the bus again).
3782 */
3783static void vtd_reset(DeviceState *dev)
3784{
3785    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3786
3787    vtd_init(s);
3788    vtd_address_space_refresh_all(s);
3789}
3790
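    /* pci_setup_iommu() callback: return the DMA address space of a device. */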
3791static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
3792{
3793    IntelIOMMUState *s = opaque;
3794    VTDAddressSpace *vtd_as;
3795
3796    assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
3797
3798    vtd_as = vtd_find_add_as(s, bus, devfn);
3799    return &vtd_as->as;
3800}
3801
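    /*
     * Validate the user-supplied properties and resolve "auto" settings
     * (currently eim) before the device is realized.
     */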
3802static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
3803{
3804    X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3805
3806    if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) {
3807        error_setg(errp, "eim=on cannot be selected without intremap=on");
3808        return false;
3809    }
3810
3811    if (s->intr_eim == ON_OFF_AUTO_AUTO) {
3812        s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim)
3813                      && x86_iommu_ir_supported(x86_iommu) ?
3814                                              ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
3815    }
3816    if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
3817        if (!kvm_irqchip_is_split()) {
3818            error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
3819            return false;
3820        }
3821    }
3822
3823    /* Currently the only supported address widths are 39 and 48 bits */
3824    if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
3825        (s->aw_bits != VTD_HOST_AW_48BIT)) {
3826        error_setg(errp, "Supported values for aw-bits are: %d, %d",
3827                   VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
3828        return false;
3829    }
3830
3831    if (s->scalable_mode && !s->dma_drain) {
3832        error_setg(errp, "Need to set dma_drain for scalable mode");
3833        return false;
3834    }
3835
3836    return true;
3837}
3838
3839static int vtd_machine_done_notify_one(Object *child, void *unused)
3840{
3841    IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
3842
3843    /*
3844     * We hard-code "vfio-pci" here because it is the only special
3845     * case.  Let's be more elegant in the future when we can, but so
3846     * far there seems to be no better way.
3847     */
3848    if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) {
3849        vtd_panic_require_caching_mode();
3850    }
3851
3852    return 0;
3853}
3854
3855static void vtd_machine_done_hook(Notifier *notifier, void *unused)
3856{
3857    object_child_foreach_recursive(object_get_root(),
3858                                   vtd_machine_done_notify_one, NULL);
3859}
3860
3861static Notifier vtd_machine_done_notify = {
3862    .notify = vtd_machine_done_hook,
3863};
3864
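    /*
     * Realize: create the MMIO regions, register this IOMMU with the root
     * PCI bus, and hook the machine-done notifier.
     */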
3865static void vtd_realize(DeviceState *dev, Error **errp)
3866{
3867    MachineState *ms = MACHINE(qdev_get_machine());
3868    PCMachineState *pcms = PC_MACHINE(ms);
3869    X86MachineState *x86ms = X86_MACHINE(ms);
3870    PCIBus *bus = pcms->bus;
3871    IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3872
3873    if (!vtd_decide_config(s, errp)) {
3874        return;
3875    }
3876
3877    QLIST_INIT(&s->vtd_as_with_notifiers);
3878    qemu_mutex_init(&s->iommu_lock);
3879    memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
3880    memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
3881                          "intel_iommu", DMAR_REG_SIZE);
3882
3883    /* Create the memory regions shared by all devices */
3884    memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar",
3885                       UINT64_MAX);
3886    memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops,
3887                          s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE);
3888    memory_region_init_alias(&s->mr_sys_alias, OBJECT(s),
3889                             "vtd-sys-alias", get_system_memory(), 0,
3890                             memory_region_size(get_system_memory()));
3891    memory_region_add_subregion_overlap(&s->mr_nodmar, 0,
3892                                        &s->mr_sys_alias, 0);
3893    memory_region_add_subregion_overlap(&s->mr_nodmar,
3894                                        VTD_INTERRUPT_ADDR_FIRST,
3895                                        &s->mr_ir, 1);
3896
3897    sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
3898    /* No corresponding destroy */
3899    s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3900                                     g_free, g_free);
3901    s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3902                                              g_free, g_free);
3903    vtd_init(s);
3904    sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
3905    pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
3906    /* Pseudo address space under root PCI bus. */
3907    x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
3908    qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);
3909}
3910
3911static void vtd_class_init(ObjectClass *klass, void *data)
3912{
3913    DeviceClass *dc = DEVICE_CLASS(klass);
3914    X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass);
3915
3916    dc->reset = vtd_reset;
3917    dc->vmsd = &vtd_vmstate;
3918    device_class_set_props(dc, vtd_properties);
3919    dc->hotpluggable = false;
3920    x86_class->realize = vtd_realize;
3921    x86_class->int_remap = vtd_int_remap;
3922    /* Supported by the pc-q35-* machine types */
3923    dc->user_creatable = true;
3924    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3925    dc->desc = "Intel IOMMU (VT-d) DMA Remapping device";
3926}
3927
3928static const TypeInfo vtd_info = {
3929    .name          = TYPE_INTEL_IOMMU_DEVICE,
3930    .parent        = TYPE_X86_IOMMU_DEVICE,
3931    .instance_size = sizeof(IntelIOMMUState),
3932    .class_init    = vtd_class_init,
3933};
3934
3935static void vtd_iommu_memory_region_class_init(ObjectClass *klass,
3936                                                     void *data)
3937{
3938    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
3939
3940    imrc->translate = vtd_iommu_translate;
3941    imrc->notify_flag_changed = vtd_iommu_notify_flag_changed;
3942    imrc->replay = vtd_iommu_replay;
3943}
3944
3945static const TypeInfo vtd_iommu_memory_region_info = {
3946    .parent = TYPE_IOMMU_MEMORY_REGION,
3947    .name = TYPE_INTEL_IOMMU_MEMORY_REGION,
3948    .class_init = vtd_iommu_memory_region_class_init,
3949};
3950
3951static void vtd_register_types(void)
3952{
3953    type_register_static(&vtd_info);
3954    type_register_static(&vtd_iommu_memory_region_info);
3955}
3956
3957type_init(vtd_register_types)
3958